diff options
Diffstat (limited to 'module/zfs')
43 files changed, 6124 insertions, 744 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index fe0d5b523..fefb29654 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -81,6 +81,9 @@ $(MODULE)-objs += vdev.o $(MODULE)-objs += vdev_cache.o $(MODULE)-objs += vdev_disk.o $(MODULE)-objs += vdev_file.o +$(MODULE)-objs += vdev_indirect.o +$(MODULE)-objs += vdev_indirect_births.o +$(MODULE)-objs += vdev_indirect_mapping.o $(MODULE)-objs += vdev_label.o $(MODULE)-objs += vdev_mirror.o $(MODULE)-objs += vdev_missing.o @@ -88,6 +91,7 @@ $(MODULE)-objs += vdev_queue.o $(MODULE)-objs += vdev_raidz.o $(MODULE)-objs += vdev_raidz_math.o $(MODULE)-objs += vdev_raidz_math_scalar.o +$(MODULE)-objs += vdev_removal.o $(MODULE)-objs += vdev_root.o $(MODULE)-objs += zap.o $(MODULE)-objs += zap_leaf.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index d73d6ffcc..040e94365 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -6227,7 +6227,7 @@ top: devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; /* - * Lock out device removal. + * Lock out L2ARC device removal. 
*/ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 1708eb710..66ab84df5 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -175,6 +175,12 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) return (0); } +boolean_t +bpobj_is_open(const bpobj_t *bpo) +{ + return (bpo->bpo_object != 0); +} + void bpobj_close(bpobj_t *bpo) { @@ -193,11 +199,11 @@ bpobj_close(bpobj_t *bpo) mutex_destroy(&bpo->bpo_lock); } -static boolean_t -bpobj_hasentries(bpobj_t *bpo) +boolean_t +bpobj_is_empty(bpobj_t *bpo) { - return (bpo->bpo_phys->bpo_num_blkptrs != 0 || - (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0)); + return (bpo->bpo_phys->bpo_num_blkptrs == 0 && + (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0)); } static int @@ -210,11 +216,9 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, int err = 0; dmu_buf_t *dbuf = NULL; + ASSERT(bpobj_is_open(bpo)); mutex_enter(&bpo->bpo_lock); - if (!bpobj_hasentries(bpo)) - goto out; - if (free) dmu_buf_will_dirty(bpo->bpo_dbuf, tx); @@ -344,7 +348,7 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, out: /* If there are no entries, there should be no bytes. 
*/ - if (!bpobj_hasentries(bpo)) { + if (bpobj_is_empty(bpo)) { ASSERT0(bpo->bpo_phys->bpo_bytes); ASSERT0(bpo->bpo_phys->bpo_comp); ASSERT0(bpo->bpo_phys->bpo_uncomp); @@ -379,6 +383,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) bpobj_t subbpo; uint64_t used, comp, uncomp, subsubobjs; + ASSERT(bpobj_is_open(bpo)); + ASSERT(subobj != 0); ASSERT(bpo->bpo_havesubobj); ASSERT(bpo->bpo_havecomp); ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); @@ -391,7 +397,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); - if (!bpobj_hasentries(&subbpo)) { + if (bpobj_is_empty(&subbpo)) { /* No point in having an empty subobj. */ bpobj_close(&subbpo); bpobj_free(bpo->bpo_os, subobj, tx); @@ -465,6 +471,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) int blkoff; blkptr_t *bparray; + ASSERT(bpobj_is_open(bpo)); ASSERT(!BP_IS_HOLE(bp)); ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); @@ -550,6 +557,7 @@ space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) int bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { + ASSERT(bpobj_is_open(bpo)); mutex_enter(&bpo->bpo_lock); *usedp = bpo->bpo_phys->bpo_bytes; @@ -576,6 +584,8 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, struct space_range_arg sra = { 0 }; int err; + ASSERT(bpobj_is_open(bpo)); + /* * As an optimization, if they want the whole txg range, just * get bpo_bytes rather than iterating over the bps. 
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 1a6e560d2..b0ae6cc72 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -47,6 +47,7 @@ #include <sys/trace_dbuf.h> #include <sys/callb.h> #include <sys/abd.h> +#include <sys/vdev.h> kstat_t *dbuf_ksp; @@ -3530,6 +3531,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) db->db_data_pending = dr; mutex_exit(&db->db_mtx); + dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; @@ -4054,6 +4056,142 @@ dbuf_write_override_done(zio_t *zio) abd_put(zio->io_abd); } +typedef struct dbuf_remap_impl_callback_arg { + objset_t *drica_os; + uint64_t drica_blk_birth; + dmu_tx_t *drica_tx; +} dbuf_remap_impl_callback_arg_t; + +static void +dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, + void *arg) +{ + dbuf_remap_impl_callback_arg_t *drica = arg; + objset_t *os = drica->drica_os; + spa_t *spa = dmu_objset_spa(os); + dmu_tx_t *tx = drica->drica_tx; + + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); + + if (os == spa_meta_objset(spa)) { + spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); + } else { + dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset, + size, drica->drica_blk_birth, tx); + } +} + +static void +dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx) +{ + blkptr_t bp_copy = *bp; + spa_t *spa = dmu_objset_spa(dn->dn_objset); + dbuf_remap_impl_callback_arg_t drica; + + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); + + drica.drica_os = dn->dn_objset; + drica.drica_blk_birth = bp->blk_birth; + drica.drica_tx = tx; + if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, + &drica)) { + /* + * The struct_rwlock prevents dbuf_read_impl() from + * dereferencing the BP while we are changing it. To + * avoid lock contention, only grab it when we are actually + * changing the BP. + */ + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + *bp = bp_copy; + rw_exit(&dn->dn_struct_rwlock); + } +} + +/* + * Returns true if a dbuf_remap would modify the dbuf. 
We do this by attempting + * to remap a copy of every bp in the dbuf. + */ +boolean_t +dbuf_can_remap(const dmu_buf_impl_t *db) +{ + spa_t *spa = dmu_objset_spa(db->db_objset); + blkptr_t *bp = db->db.db_data; + boolean_t ret = B_FALSE; + + ASSERT3U(db->db_level, >, 0); + ASSERT3S(db->db_state, ==, DB_CACHED); + + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { + blkptr_t bp_copy = bp[i]; + if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { + ret = B_TRUE; + break; + } + } + spa_config_exit(spa, SCL_VDEV, FTAG); + + return (ret); +} + +boolean_t +dnode_needs_remap(const dnode_t *dn) +{ + spa_t *spa = dmu_objset_spa(dn->dn_objset); + boolean_t ret = B_FALSE; + + if (dn->dn_phys->dn_nlevels == 0) { + return (B_FALSE); + } + + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) { + blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j]; + if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { + ret = B_TRUE; + break; + } + } + spa_config_exit(spa, SCL_VDEV, FTAG); + + return (ret); +} + +/* + * Remap any existing BP's to concrete vdevs, if possible. 
+ */ +static void +dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + spa_t *spa = dmu_objset_spa(db->db_objset); + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); + + if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)) + return; + + if (db->db_level > 0) { + blkptr_t *bp = db->db.db_data; + for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { + dbuf_remap_impl(dn, &bp[i], tx); + } + } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { + dnode_phys_t *dnp = db->db.db_data; + ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, + DMU_OT_DNODE); + for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; + i += dnp[i].dn_extra_slots + 1) { + for (int j = 0; j < dnp[i].dn_nblkptr; j++) { + dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx); + } + } + } +} + + /* Issue I/O to commit a dirty buffer to disk. */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) @@ -4087,6 +4225,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) } else { dbuf_release_bp(db); } + dbuf_remap(dn, db, tx); } } diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 24516834f..681033d71 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -760,15 +760,15 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) for (type = 0; type < DDT_TYPES; type++) { for (class = 0; class < DDT_CLASSES; class++) { error = ddt_object_lookup(ddt, type, class, dde); - if (error != ENOENT) + if (error != ENOENT) { + ASSERT0(error); break; + } } if (error != ENOENT) break; } - ASSERT(error == 0 || error == ENOENT); - ddt_enter(ddt); ASSERT(dde->dde_loaded == B_FALSE); @@ -1181,7 +1181,7 @@ ddt_sync(spa_t *spa, uint64_t txg) tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); rio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); /* * This function may cause an immediate scan of ddt blocks (see diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 
64c898198..0352393dc 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -73,6 +73,13 @@ unsigned long zfs_per_txg_dirty_frees_percent = 30; */ int zfs_dmu_offset_next_sync = 0; +/* + * This can be used for testing, to ensure that certain actions happen + * while in the middle of a remap (which might otherwise complete too + * quickly). + */ +int zfs_object_remap_one_indirect_delay_ticks = 0; + const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "object directory" }, @@ -1114,6 +1121,123 @@ dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } +static int +dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn, + uint64_t last_removal_txg, uint64_t offset) +{ + uint64_t l1blkid = dbuf_whichblock(dn, 1, offset); + int err = 0; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG); + ASSERT3P(dbuf, !=, NULL); + + /* + * If the block hasn't been written yet, this default will ensure + * we don't try to remap it. + */ + uint64_t birth = UINT64_MAX; + ASSERT3U(last_removal_txg, !=, UINT64_MAX); + if (dbuf->db_blkptr != NULL) + birth = dbuf->db_blkptr->blk_birth; + rw_exit(&dn->dn_struct_rwlock); + + /* + * If this L1 was already written after the last removal, then we've + * already tried to remap it. + */ + if (birth <= last_removal_txg && + dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 && + dbuf_can_remap(dbuf)) { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_hold_remap_l1indirect(tx, dn->dn_object); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err == 0) { + (void) dbuf_dirty(dbuf, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + } + + dbuf_rele(dbuf, FTAG); + + delay(zfs_object_remap_one_indirect_delay_ticks); + + return (err); +} + +/* + * Remap all blockpointers in the object, if possible, so that they reference + * only concrete vdevs. 
+ * + * To do this, iterate over the L0 blockpointers and remap any that reference + * an indirect vdev. Note that we only examine L0 blockpointers; since we + * cannot guarantee that we can remap all blockpointers anyway (due to split + * blocks), we do not want to make the code unnecessarily complicated to + * catch the unlikely case that there is an L1 block on an indirect vdev that + * contains no indirect blockpointers. + */ +int +dmu_object_remap_indirects(objset_t *os, uint64_t object, + uint64_t last_removal_txg) +{ + uint64_t offset, l1span; + int err; + dnode_t *dn; + + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) { + return (err); + } + + if (dn->dn_nlevels <= 1) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { + err = SET_ERROR(EINTR); + } + + /* + * If the dnode has no indirect blocks, we cannot dirty them. + * We still want to remap the blkptr(s) in the dnode if + * appropriate, so mark it as dirty. + */ + if (err == 0 && dnode_needs_remap(dn)) { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, dn->dn_object); + if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) { + dnode_setdirty(dn, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + } + + dnode_rele(dn, FTAG); + return (err); + } + + offset = 0; + l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT + + dn->dn_datablkshift); + /* + * Find the next L1 indirect that is not a hole. 
+ */ + while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { + err = SET_ERROR(EINTR); + break; + } + if ((err = dmu_object_remap_one_indirect(os, dn, + last_removal_txg, offset)) != 0) { + break; + } + offset += l1span; + } + + dnode_rele(dn, FTAG); + return (err); +} + void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index a0982f6ec..a44f485b7 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -55,6 +55,7 @@ #include <sys/zfs_onexit.h> #include <sys/dsl_destroy.h> #include <sys/vdev.h> +#include <sys/zfeature.h> #include <sys/policy.h> #include <sys/spa_impl.h> #include <sys/dmu_send.h> @@ -383,6 +384,10 @@ dnode_multilist_index_func(multilist_t *ml, void *obj) multilist_get_num_sublists(ml)); } +/* + * Instantiates the objset_t in-memory structure corresponding to the + * objset_phys_t that's pointed to by the specified blkptr_t. + */ int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, objset_t **osp) @@ -392,6 +397,17 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); + /* + * The $ORIGIN dataset (if it exists) doesn't have an associated + * objset, so there's no reason to open it. The $ORIGIN dataset + * will not exist on pools older than SPA_VERSION_ORIGIN. 
+ */ + if (ds != NULL && spa_get_dsl(spa) != NULL && + spa_get_dsl(spa)->dp_origin_snap != NULL) { + ASSERT3P(ds->ds_dir, !=, + spa_get_dsl(spa)->dp_origin_snap->ds_dir); + } + os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); os->os_dsl_dataset = ds; os->os_spa = spa; @@ -1321,6 +1337,101 @@ dmu_objset_clone(const char *clone, const char *origin) 6, ZFS_SPACE_CHECK_NORMAL)); } +static int +dmu_objset_remap_indirects_impl(objset_t *os, uint64_t last_removed_txg) +{ + int error = 0; + uint64_t object = 0; + while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { + error = dmu_object_remap_indirects(os, object, + last_removed_txg); + /* + * If the ZPL removed the object before we managed to dnode_hold + * it, we would get an ENOENT. If the ZPL declares its intent + * to remove the object (dnode_free) before we manage to + * dnode_hold it, we would get an EEXIST. In either case, we + * want to continue remapping the other objects in the objset; + * in all other cases, we want to break early. + */ + if (error != 0 && error != ENOENT && error != EEXIST) { + break; + } + } + if (error == ESRCH) { + error = 0; + } + return (error); +} + +int +dmu_objset_remap_indirects(const char *fsname) +{ + int error = 0; + objset_t *os = NULL; + uint64_t last_removed_txg; + uint64_t remap_start_txg; + dsl_dir_t *dd; + + error = dmu_objset_hold(fsname, FTAG, &os); + if (error != 0) { + return (error); + } + dd = dmu_objset_ds(os)->ds_dir; + + if (!spa_feature_is_enabled(dmu_objset_spa(os), + SPA_FEATURE_OBSOLETE_COUNTS)) { + dmu_objset_rele(os, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + if (dsl_dataset_is_snapshot(dmu_objset_ds(os))) { + dmu_objset_rele(os, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * If there has not been a removal, we're done. + */ + last_removed_txg = spa_get_last_removal_txg(dmu_objset_spa(os)); + if (last_removed_txg == -1ULL) { + dmu_objset_rele(os, FTAG); + return (0); + } + + /* + * If we have remapped since the last removal, we're done. 
+ */ + if (dsl_dir_is_zapified(dd)) { + uint64_t last_remap_txg; + if (zap_lookup(spa_meta_objset(dmu_objset_spa(os)), + dd->dd_object, DD_FIELD_LAST_REMAP_TXG, + sizeof (last_remap_txg), 1, &last_remap_txg) == 0 && + last_remap_txg > last_removed_txg) { + dmu_objset_rele(os, FTAG); + return (0); + } + } + + dsl_dataset_long_hold(dmu_objset_ds(os), FTAG); + dsl_pool_rele(dmu_objset_pool(os), FTAG); + + remap_start_txg = spa_last_synced_txg(dmu_objset_spa(os)); + error = dmu_objset_remap_indirects_impl(os, last_removed_txg); + if (error == 0) { + /* + * We update the last_remap_txg to be the start txg so that + * we can guarantee that every block older than last_remap_txg + * that can be remapped has been remapped. + */ + error = dsl_dir_update_last_remap_txg(dd, remap_start_txg); + } + + dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); + dsl_dataset_rele(dmu_objset_ds(os), FTAG); + + return (error); +} + int dmu_objset_snapshot_one(const char *fsname, const char *snapname) { diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index f72859ba1..861769b40 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -316,6 +316,23 @@ dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) } void +dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_WRITE, 0, 0); + if (txh == NULL) + return; + + dnode_t *dn = txh->txh_dnode; + (void) refcount_add_many(&txh->txh_space_towrite, + 1ULL << dn->dn_indblkshift, FTAG); + dmu_tx_count_dnode(txh); +} + +void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) { dmu_tx_hold_t *txh; diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index a2bf80993..e22560ed1 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -213,9 +213,19 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data) int epbs, max_dist_blks, 
pf_nblks, ipf_nblks; uint64_t end_of_access_blkid; end_of_access_blkid = blkid + nblks; + spa_t *spa = zf->zf_dnode->dn_objset->os_spa; if (zfs_prefetch_disable) return; + /* + * If we haven't yet loaded the indirect vdevs' mappings, we + * can only read from blocks that we carefully ensure are on + * concrete vdevs (or previously-loaded indirect vdevs). So we + * can't allow the predictive prefetcher to attempt reads of other + * blocks (e.g. of the MOS's dnode object). + */ + if (!spa_indirect_vdevs_loaded(spa)) + return; /* * As a fast path for small (single-block) files, ignore access diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index a379527a0..80d5f33d9 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -2077,8 +2077,7 @@ done: { int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { - dn->dn_free_ranges[txgoff] = - range_tree_create(NULL, NULL, &dn->dn_mtx); + dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL); } range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 40c2df1e7..9823f3183 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -46,6 +46,7 @@ #include <sys/zfs_context.h> #include <sys/zfs_ioctl.h> #include <sys/spa.h> +#include <sys/vdev.h> #include <sys/zfs_znode.h> #include <sys/zfs_onexit.h> #include <sys/zvol.h> @@ -82,6 +83,11 @@ int zfs_max_recordsize = 1 * 1024 * 1024; extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds); +static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, + uint64_t obj, dmu_tx_t *tx); +static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, + dmu_tx_t *tx); + extern int spa_asize_inflation; static zil_header_t zero_zil; @@ -157,6 +163,47 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) DD_USED_REFRSRV, DD_USED_HEAD, tx); } +/* + * Called when 
the specified segment has been remapped, and is thus no + * longer referenced in the head dataset. The vdev must be indirect. + * + * If the segment is referenced by a snapshot, put it on the remap deadlist. + * Otherwise, add this segment to the obsolete spacemap. + */ +void +dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, + uint64_t size, uint64_t birth, dmu_tx_t *tx) +{ + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(birth <= tx->tx_txg); + ASSERT(!ds->ds_is_snapshot); + + if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { + spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); + } else { + blkptr_t fakebp; + dva_t *dva = &fakebp.blk_dva[0]; + + ASSERT(ds != NULL); + + mutex_enter(&ds->ds_remap_deadlist_lock); + if (!dsl_dataset_remap_deadlist_exists(ds)) { + dsl_dataset_create_remap_deadlist(ds, tx); + } + mutex_exit(&ds->ds_remap_deadlist_lock); + + BP_ZERO(&fakebp); + fakebp.blk_birth = birth; + DVA_SET_VDEV(dva, vdev); + DVA_SET_OFFSET(dva, offset); + DVA_SET_ASIZE(dva, size); + + dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx); + } +} + int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async) @@ -275,8 +322,10 @@ dsl_dataset_evict_async(void *dbu) } bplist_destroy(&ds->ds_pending_deadlist); - if (ds->ds_deadlist.dl_os != NULL) + if (dsl_deadlist_is_open(&ds->ds_deadlist)) dsl_deadlist_close(&ds->ds_deadlist); + if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) + dsl_deadlist_close(&ds->ds_remap_deadlist); if (ds->ds_dir) dsl_dir_async_rele(ds->ds_dir, ds); @@ -286,6 +335,7 @@ dsl_dataset_evict_async(void *dbu) mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_sendstream_lock); + mutex_destroy(&ds->ds_remap_deadlist_lock); refcount_destroy(&ds->ds_longholds); rrw_destroy(&ds->ds_bp_rwlock); @@ -417,15 +467,23 @@ dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, ds->ds_is_snapshot = 
dsl_dataset_phys(ds)->ds_num_children != 0; list_link_init(&ds->ds_synced_link); + err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj, + NULL, ds, &ds->ds_dir); + if (err != 0) { + kmem_free(ds, sizeof (dsl_dataset_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } + mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_remap_deadlist_lock, + NULL, MUTEX_DEFAULT, NULL); rrw_init(&ds->ds_bp_rwlock, B_FALSE); refcount_create(&ds->ds_longholds); bplist_create(&ds->ds_pending_deadlist); - dsl_deadlist_open(&ds->ds_deadlist, - mos, dsl_dataset_phys(ds)->ds_deadlist_obj); list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), offsetof(dmu_sendarg_t, dsa_link)); @@ -451,20 +509,6 @@ dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, } } - err = dsl_dir_hold_obj(dp, - dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir); - if (err != 0) { - mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_sendstream_lock); - refcount_destroy(&ds->ds_longholds); - bplist_destroy(&ds->ds_pending_deadlist); - dsl_deadlist_close(&ds->ds_deadlist); - kmem_free(ds, sizeof (dsl_dataset_t)); - dmu_buf_rele(dbuf, tag); - return (err); - } - if (!ds->ds_is_snapshot) { ds->ds_snapname[0] = '\0'; if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { @@ -505,6 +549,15 @@ dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, ds->ds_reserved = ds->ds_quota = 0; } + dsl_deadlist_open(&ds->ds_deadlist, + mos, dsl_dataset_phys(ds)->ds_deadlist_obj); + uint64_t remap_deadlist_obj = + dsl_dataset_get_remap_deadlist_object(ds); + if (remap_deadlist_obj != 0) { + dsl_deadlist_open(&ds->ds_remap_deadlist, mos, + remap_deadlist_obj); + } + dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync, dsl_dataset_evict_async, &ds->ds_dbuf); if (err == 0) @@ -513,6 +566,8 @@ dsl_dataset_hold_obj_flags(dsl_pool_t 
*dp, uint64_t dsobj, if (err != 0 || winner != NULL) { bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); + if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) + dsl_deadlist_close(&ds->ds_remap_deadlist); if (ds->ds_prev) dsl_dataset_rele(ds->ds_prev, ds); dsl_dir_rele(ds->ds_dir, ds); @@ -1457,10 +1512,27 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsl_deadlist_add_key(&ds->ds_deadlist, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); + if (dsl_dataset_remap_deadlist_exists(ds)) { + uint64_t remap_deadlist_obj = + dsl_dataset_get_remap_deadlist_object(ds); + /* + * Move the remap_deadlist to the snapshot. The head + * will create a new remap deadlist on demand, from + * dsl_dataset_block_remapped(). + */ + dsl_dataset_unset_remap_deadlist_object(ds, tx); + dsl_deadlist_close(&ds->ds_remap_deadlist); + + dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST, + sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx)); + } + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg); dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj; dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg; dsl_dataset_phys(ds)->ds_unique_bytes = 0; + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; @@ -3365,6 +3437,41 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, return (0); } +static void +dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone, + dsl_dataset_t *origin, dmu_tx_t *tx) +{ + uint64_t clone_remap_dl_obj, origin_remap_dl_obj; + dsl_pool_t *dp = dmu_tx_pool(tx); + + ASSERT(dsl_pool_sync_context(dp)); + + clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone); + origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin); + + if (clone_remap_dl_obj != 0) { + dsl_deadlist_close(&clone->ds_remap_deadlist); + dsl_dataset_unset_remap_deadlist_object(clone, tx); + } + if 
(origin_remap_dl_obj != 0) { + dsl_deadlist_close(&origin->ds_remap_deadlist); + dsl_dataset_unset_remap_deadlist_object(origin, tx); + } + + if (clone_remap_dl_obj != 0) { + dsl_dataset_set_remap_deadlist_object(origin, + clone_remap_dl_obj, tx); + dsl_deadlist_open(&origin->ds_remap_deadlist, + dp->dp_meta_objset, clone_remap_dl_obj); + } + if (origin_remap_dl_obj != 0) { + dsl_dataset_set_remap_deadlist_object(clone, + origin_remap_dl_obj, tx); + dsl_deadlist_open(&clone->ds_remap_deadlist, + dp->dp_meta_objset, origin_remap_dl_obj); + } +} + void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, dmu_tx_t *tx) @@ -3534,6 +3641,7 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_phys(clone)->ds_deadlist_obj); dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, dsl_dataset_phys(origin_head)->ds_deadlist_obj); + dsl_dataset_swap_remap_deadlists(clone, origin_head, tx); dsl_scan_ds_clone_swapped(origin_head, clone, tx); @@ -4042,6 +4150,93 @@ dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds) ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0); } +uint64_t +dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds) +{ + uint64_t remap_deadlist_obj; + int err; + + if (!dsl_dataset_is_zapified(ds)) + return (0); + + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, + DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1, + &remap_deadlist_obj); + + if (err != 0) { + VERIFY3S(err, ==, ENOENT); + return (0); + } + + ASSERT(remap_deadlist_obj != 0); + return (remap_deadlist_obj); +} + +boolean_t +dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds) +{ + EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist), + dsl_dataset_get_remap_deadlist_object(ds) != 0); + return (dsl_deadlist_is_open(&ds->ds_remap_deadlist)); +} + +static void +dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj, + dmu_tx_t *tx) +{ + ASSERT(obj != 0); + dsl_dataset_zapify(ds, tx); + 
VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, + DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx)); +} + +static void +dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx)); +} + +void +dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t remap_deadlist_object; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dsl_dataset_remap_deadlist_exists(ds)); + + remap_deadlist_object = ds->ds_remap_deadlist.dl_object; + dsl_deadlist_close(&ds->ds_remap_deadlist); + dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx); + dsl_dataset_unset_remap_deadlist_object(ds, tx); + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); +} + +void +dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t remap_deadlist_obj; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock)); + /* + * Currently we only create remap deadlists when there are indirect + * vdevs with referenced mappings. 
+ */ + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); + + remap_deadlist_obj = dsl_deadlist_clone( + &ds->ds_deadlist, UINT64_MAX, + dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); + dsl_dataset_set_remap_deadlist_object(ds, + remap_deadlist_obj, tx); + dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa), + remap_deadlist_obj); + spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); +} + #if defined(_KERNEL) && defined(HAVE_SPL) #if defined(_LP64) module_param(zfs_max_recordsize, int, 0644); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 0be0d7420..10846a324 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -93,6 +93,8 @@ dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) { dmu_object_info_t doi; + ASSERT(!dsl_deadlist_is_open(dl)); + mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); dl->dl_os = os; dl->dl_object = object; @@ -111,18 +113,26 @@ dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) dl->dl_havetree = B_FALSE; } +boolean_t +dsl_deadlist_is_open(dsl_deadlist_t *dl) +{ + return (dl->dl_os != NULL); +} + void dsl_deadlist_close(dsl_deadlist_t *dl) { void *cookie = NULL; dsl_deadlist_entry_t *dle; - dl->dl_os = NULL; + ASSERT(dsl_deadlist_is_open(dl)); mutex_destroy(&dl->dl_lock); if (dl->dl_oldfmt) { dl->dl_oldfmt = B_FALSE; bpobj_close(&dl->dl_bpobj); + dl->dl_os = NULL; + dl->dl_object = 0; return; } @@ -137,6 +147,8 @@ dsl_deadlist_close(dsl_deadlist_t *dl) dmu_buf_rele(dl->dl_dbuf, dl); dl->dl_dbuf = NULL; dl->dl_phys = NULL; + dl->dl_os = NULL; + dl->dl_object = 0; } uint64_t @@ -311,7 +323,7 @@ static void dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, uint64_t mrs_obj, dmu_tx_t *tx) { - dsl_deadlist_t dl; + dsl_deadlist_t dl = { 0 }; dsl_pool_t *dp = dmu_objset_pool(os); dsl_deadlist_open(&dl, os, dlobj); @@ -367,6 +379,7 @@ void dsl_deadlist_space(dsl_deadlist_t *dl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { + 
ASSERT(dsl_deadlist_is_open(dl)); if (dl->dl_oldfmt) { VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, usedp, compp, uncompp)); diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index d0fcacaed..e11508c90 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -209,6 +209,10 @@ dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) if (clone->ds_dir->dd_origin_txg > mintxg) { dsl_deadlist_remove_key(&clone->ds_deadlist, mintxg, tx); + if (dsl_dataset_remap_deadlist_exists(clone)) { + dsl_deadlist_remove_key( + &clone->ds_remap_deadlist, mintxg, tx); + } dsl_dataset_remove_clones_key(clone, mintxg, tx); } dsl_dataset_rele(clone, FTAG); @@ -219,6 +223,39 @@ dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) kmem_free(zc, sizeof (zap_cursor_t)); } +static void +dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + /* Move blocks to be obsoleted to pool's obsolete list. */ + if (dsl_dataset_remap_deadlist_exists(ds_next)) { + if (!bpobj_is_open(&dp->dp_obsolete_bpobj)) + dsl_pool_create_obsolete_bpobj(dp, tx); + + dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist, + &dp->dp_obsolete_bpobj, + dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); + } + + /* Merge our deadlist into next's and free it. 
*/ + if (dsl_dataset_remap_deadlist_exists(ds)) { + uint64_t remap_deadlist_object = + dsl_dataset_get_remap_deadlist_object(ds); + ASSERT(remap_deadlist_object != 0); + + mutex_enter(&ds_next->ds_remap_deadlist_lock); + if (!dsl_dataset_remap_deadlist_exists(ds_next)) + dsl_dataset_create_remap_deadlist(ds_next, tx); + mutex_exit(&ds_next->ds_remap_deadlist_lock); + + dsl_deadlist_merge(&ds_next->ds_remap_deadlist, + remap_deadlist_object, tx); + dsl_dataset_destroy_remap_deadlist(ds, tx); + } +} + void dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) { @@ -333,11 +370,14 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) dsl_deadlist_merge(&ds_next->ds_deadlist, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); } + dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_deadlist_obj = 0; + dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx); + /* Collapse range in clone heads */ dsl_dataset_remove_clones_key(ds, dsl_dataset_phys(ds)->ds_creation_txg, tx); @@ -371,6 +411,10 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds)); dsl_deadlist_remove_key(&hds->ds_deadlist, dsl_dataset_phys(ds)->ds_creation_txg, tx); + if (dsl_dataset_remap_deadlist_exists(hds)) { + dsl_deadlist_remove_key(&hds->ds_remap_deadlist, + dsl_dataset_phys(ds)->ds_creation_txg, tx); + } dsl_dataset_rele(hds, FTAG); } else { @@ -826,14 +870,18 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) /* * Destroy the deadlist. Unless it's a clone, the - * deadlist should be empty. (If it's a clone, it's - * safe to ignore the deadlist contents.) + * deadlist should be empty since the dataset has no snapshots. + * (If it's a clone, it's safe to ignore the deadlist contents + * since they are still referenced by the origin snapshot.) 
*/ dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_deadlist_obj = 0; + if (dsl_dataset_remap_deadlist_exists(ds)) + dsl_dataset_destroy_remap_deadlist(ds, tx); + objset_t *os; VERIFY0(dmu_objset_from_ds(ds, &os)); diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index bf130eb99..a866c3074 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -129,6 +129,11 @@ extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); +typedef struct ddulrt_arg { + dsl_dir_t *ddulrta_dd; + uint64_t ddlrta_txg; +} ddulrt_arg_t; + static void dsl_dir_evict_async(void *dbu) { @@ -750,6 +755,35 @@ dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) return (enforce); } +static void +dsl_dir_update_last_remap_txg_sync(void *varg, dmu_tx_t *tx) +{ + ddulrt_arg_t *arg = varg; + uint64_t last_remap_txg; + dsl_dir_t *dd = arg->ddulrta_dd; + objset_t *mos = dd->dd_pool->dp_meta_objset; + + dsl_dir_zapify(dd, tx); + if (zap_lookup(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, + sizeof (last_remap_txg), 1, &last_remap_txg) != 0 || + last_remap_txg < arg->ddlrta_txg) { + VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, + sizeof (arg->ddlrta_txg), 1, &arg->ddlrta_txg, tx)); + } +} + +int +dsl_dir_update_last_remap_txg(dsl_dir_t *dd, uint64_t txg) +{ + ddulrt_arg_t arg; + arg.ddulrta_dd = dd; + arg.ddlrta_txg = txg; + + return (dsl_sync_task(spa_name(dd->dd_pool->dp_spa), + NULL, dsl_dir_update_last_remap_txg_sync, &arg, + 1, ZFS_SPACE_CHECK_RESERVED)); +} + /* * Check if adding additional child filesystem(s) would exceed any filesystem * limits or adding additional snapshot(s) would exceed any snapshot limits. 
@@ -947,7 +981,6 @@ dsl_dir_is_clone(dsl_dir_t *dd) dd->dd_pool->dp_origin_snap->ds_object)); } - uint64_t dsl_dir_get_used(dsl_dir_t *dd) { @@ -1042,6 +1075,19 @@ dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count) } } +int +dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count) +{ + if (dsl_dir_is_zapified(dd)) { + objset_t *os = dd->dd_pool->dp_meta_objset; + return (zap_lookup(os, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, + sizeof (*count), 1, count)); + } else { + return (ENOENT); + } + +} + void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { @@ -1073,6 +1119,10 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, count); } + if (dsl_dir_get_remaptxg(dd, &count) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REMAPTXG, + count); + } if (dsl_dir_is_clone(dd)) { char buf[ZFS_MAX_DATASET_NAME_LEN]; diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index db2e67742..094b6bec0 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -300,9 +300,25 @@ dsl_pool_open(dsl_pool_t *dp) dp->dp_meta_objset, obj)); } + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj); + if (err == 0) { + VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, + dp->dp_meta_objset, obj)); + } else if (err == ENOENT) { + /* + * We might not have created the remap bpobj yet. + */ + err = 0; + } else { + goto out; + } + } + /* - * Note: errors ignored, because the leak dir will not exist if we - * have not encountered a leak yet. + * Note: errors ignored, because these special dirs, used for + * space accounting, are only created on demand. */ (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, &dp->dp_leak_dir); @@ -348,21 +364,22 @@ dsl_pool_close(dsl_pool_t *dp) * includes pool-opening context), it actually only got a "ref" * and not a hold, so just drop that here. 
*/ - if (dp->dp_origin_snap) + if (dp->dp_origin_snap != NULL) dsl_dataset_rele(dp->dp_origin_snap, dp); - if (dp->dp_mos_dir) + if (dp->dp_mos_dir != NULL) dsl_dir_rele(dp->dp_mos_dir, dp); - if (dp->dp_free_dir) + if (dp->dp_free_dir != NULL) dsl_dir_rele(dp->dp_free_dir, dp); - if (dp->dp_leak_dir) + if (dp->dp_leak_dir != NULL) dsl_dir_rele(dp->dp_leak_dir, dp); - if (dp->dp_root_dir) + if (dp->dp_root_dir != NULL) dsl_dir_rele(dp->dp_root_dir, dp); bpobj_close(&dp->dp_free_bpobj); + bpobj_close(&dp->dp_obsolete_bpobj); /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ - if (dp->dp_meta_objset) + if (dp->dp_meta_objset != NULL) dmu_objset_evict(dp->dp_meta_objset); txg_list_destroy(&dp->dp_dirty_datasets); @@ -390,13 +407,42 @@ dsl_pool_close(dsl_pool_t *dp) mutex_destroy(&dp->dp_lock); cv_destroy(&dp->dp_spaceavail_cv); taskq_destroy(dp->dp_iput_taskq); - if (dp->dp_blkstats) { + if (dp->dp_blkstats != NULL) { mutex_destroy(&dp->dp_blkstats->zab_lock); vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); } kmem_free(dp, sizeof (dsl_pool_t)); } +void +dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) +{ + uint64_t obj; + /* + * Currently, we only create the obsolete_bpobj where there are + * indirect vdevs with referenced mappings. 
+ */ + ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL)); + /* create and open the obsolete_bpobj */ + obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj)); + VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); + spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); +} + +void +dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) +{ + spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + VERIFY0(zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ, tx)); + bpobj_free(dp->dp_meta_objset, + dp->dp_obsolete_bpobj.bpo_object, tx); + bpobj_close(&dp->dp_obsolete_bpobj); +} + dsl_pool_t * dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp, uint64_t txg) diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 90534b4fa..53953a6c5 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2016 Gary Mills * Copyright (c) 2017 Datto Inc. * Copyright 2017 Joyent, Inc. 
@@ -165,6 +165,7 @@ int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */ int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */ int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ +int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */ int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ int zfs_scan_checkpoint_intval = 7200; /* in seconds */ @@ -172,7 +173,7 @@ int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ -unsigned long zfs_free_max_blocks = 100000; +unsigned long zfs_async_block_max_blocks = 100000; /* * We wait a few txgs after importing a pool to begin scanning so that @@ -2112,7 +2113,6 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; - objset_t *os; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); @@ -2156,18 +2156,23 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) goto out; } - if (dmu_objset_from_ds(ds, &os)) - goto out; - /* - * Only the ZIL in the head (non-snapshot) is valid. Even though + * Only the ZIL in the head (non-snapshot) is valid. Even though * snapshots can have ZIL block pointers (which may be the same - * BP as in the head), they must be ignored. So we traverse the - * ZIL here, rather than in scan_recurse(), because the regular - * snapshot block-sharing rules don't apply to it. + * BP as in the head), they must be ignored. In addition, $ORIGIN + * doesn't have an objset (i.e. its ds_bp is a hole) so we don't + * need to look for a ZIL in it either. So we traverse the ZIL here, + * rather than in scan_recurse(), because the regular snapshot + * block-sharing rules don't apply to it. 
*/ - if (!ds->ds_is_snapshot) + if (!dsl_dataset_is_snapshot(ds) && + ds->ds_dir != dp->dp_origin_snap->ds_dir) { + objset_t *os; + if (dmu_objset_from_ds(ds, &os) != 0) { + goto out; + } dsl_scan_zil(dp, &os->os_zil_header); + } /* * Iterate over the bps in this ds. @@ -2839,19 +2844,19 @@ scan_io_queues_run(dsl_scan_t *scn) } static boolean_t -dsl_scan_free_should_suspend(dsl_scan_t *scn) +dsl_scan_async_block_should_pause(dsl_scan_t *scn) { uint64_t elapsed_nanosecs; if (zfs_recover) return (B_FALSE); - if (scn->scn_visited_this_txg >= zfs_free_max_blocks) + if (scn->scn_visited_this_txg >= zfs_async_block_max_blocks) return (B_TRUE); elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || - (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms && + (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms && txg_sync_waiting(scn->scn_dp)) || spa_shutting_down(scn->scn_dp->dp_spa)); } @@ -2863,7 +2868,7 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) if (!scn->scn_is_bptree || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { - if (dsl_scan_free_should_suspend(scn)) + if (dsl_scan_async_block_should_pause(scn)) return (SET_ERROR(ERESTART)); } @@ -2911,6 +2916,22 @@ dsl_scan_update_stats(dsl_scan_t *scn) scn->scn_zios_this_txg = zio_count_total; } +static int +dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg; + const dva_t *dva = &bp->blk_dva[0]; + + if (dsl_scan_async_block_should_pause(scn)) + return (SET_ERROR(ERESTART)); + + spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa, + DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), + DVA_GET_ASIZE(dva), tx); + scn->scn_visited_this_txg++; + return (0); +} + boolean_t dsl_scan_active(dsl_scan_t *scn) { @@ -3047,6 +3068,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (zfs_free_bpobj_enabled && spa_version(spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; + 
scn->scn_async_block_min_time_ms = zfs_free_min_time_ms; scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, @@ -3146,6 +3168,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); } + if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) { /* finished; verify that space accounting went to zero */ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); @@ -3153,6 +3176,24 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); } + EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj), + 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ)); + if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) { + ASSERT(spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_OBSOLETE_COUNTS)); + + scn->scn_is_bptree = B_FALSE; + scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms; + err = bpobj_iterate(&dp->dp_obsolete_bpobj, + dsl_scan_obsolete_block_cb, scn, tx); + if (err != 0 && err != ERESTART) + zfs_panic_recover("error %u from bpobj_iterate()", err); + + if (bpobj_is_empty(&dp->dp_obsolete_bpobj)) + dsl_pool_destroy_obsolete_bpobj(dp, tx); + } + if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) return; @@ -3685,8 +3726,7 @@ scan_io_queue_create(vdev_t *vd) q->q_vd = vd; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, - &q->q_exts_by_size, ext_size_compare, - &q->q_vd->vdev_scan_io_queue_lock, zfs_scan_max_ext_gap); + &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); avl_create(&q->q_sios_by_addr, sio_addr_compare, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); @@ -3739,11 +3779,8 @@ dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd) VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL); tvd->vdev_scan_io_queue = 
svd->vdev_scan_io_queue; svd->vdev_scan_io_queue = NULL; - if (tvd->vdev_scan_io_queue != NULL) { + if (tvd->vdev_scan_io_queue != NULL) tvd->vdev_scan_io_queue->q_vd = tvd; - range_tree_set_lock(tvd->vdev_scan_io_queue->q_exts_by_addr, - &tvd->vdev_scan_io_queue_lock); - } mutex_exit(&tvd->vdev_scan_io_queue_lock); mutex_exit(&svd->vdev_scan_io_queue_lock); @@ -3869,6 +3906,9 @@ MODULE_PARM_DESC(zfs_scan_vdev_limit, module_param(zfs_scrub_min_time_ms, int, 0644); MODULE_PARM_DESC(zfs_scrub_min_time_ms, "Min millisecs to scrub per txg"); +module_param(zfs_obsolete_min_time_ms, int, 0644); +MODULE_PARM_DESC(zfs_obsolete_min_time_ms, "Min millisecs to obsolete per txg"); + module_param(zfs_free_min_time_ms, int, 0644); MODULE_PARM_DESC(zfs_free_min_time_ms, "Min millisecs to free per txg"); @@ -3882,8 +3922,9 @@ module_param(zfs_no_scrub_prefetch, int, 0644); MODULE_PARM_DESC(zfs_no_scrub_prefetch, "Set to disable scrub prefetching"); /* CSTYLED */ -module_param(zfs_free_max_blocks, ulong, 0644); -MODULE_PARM_DESC(zfs_free_max_blocks, "Max number of blocks freed in one txg"); +module_param(zfs_async_block_max_blocks, ulong, 0644); +MODULE_PARM_DESC(zfs_async_block_max_blocks, + "Max number of blocks freed in one txg"); module_param(zfs_free_bpobj_enabled, int, 0644); MODULE_PARM_DESC(zfs_free_bpobj_enabled, "Enable processing of the free_bpobj"); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 6320fd388..25090f089 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -33,6 +33,7 @@ #include <sys/zio.h> #include <sys/spa_impl.h> #include <sys/zfeature.h> +#include <sys/vdev_indirect_mapping.h> #define WITH_DF_BLOCK_ALLOCATOR @@ -47,7 +48,8 @@ */ unsigned long metaslab_aliquot = 512 << 10; -uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ +/* force gang blocks */ +unsigned long metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* * The in-core space map representation is more compact than its on-disk form. 
@@ -169,6 +171,11 @@ int metaslab_bias_enabled = B_TRUE; /* + * Enable/disable remapping of indirect DVAs to their concrete vdevs. + */ +boolean_t zfs_remap_blkptr_enable = B_TRUE; + +/* * Enable/disable segment-based metaslab selection. */ int zfs_metaslab_segment_weight_enabled = B_TRUE; @@ -202,6 +209,8 @@ uint64_t metaslab_trace_max_entries = 5000; static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); +static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t); +static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); #ifdef _METASLAB_TRACING kmem_cache_t *metaslab_alloc_trace_cache; @@ -323,7 +332,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc) * Skip any holes, uninitialized top-levels, or * vdevs that are not in this metalab class. */ - if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } @@ -358,10 +367,10 @@ metaslab_class_fragmentation(metaslab_class_t *mc) metaslab_group_t *mg = tvd->vdev_mg; /* - * Skip any holes, uninitialized top-levels, or - * vdevs that are not in this metalab class. + * Skip any holes, uninitialized top-levels, + * or vdevs that are not in this metalab class. 
*/ - if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } @@ -406,7 +415,7 @@ metaslab_class_expandable_space(metaslab_class_t *mc) vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; - if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } @@ -505,6 +514,8 @@ metaslab_group_alloc_update(metaslab_group_t *mg) boolean_t was_initialized; ASSERT(vd == vd->vdev_top); + ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, + SCL_ALLOC); mutex_enter(&mg->mg_lock); was_allocatable = mg->mg_allocatable; @@ -615,7 +626,7 @@ metaslab_group_activate(metaslab_group_t *mg) metaslab_class_t *mc = mg->mg_class; metaslab_group_t *mgprev, *mgnext; - ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); + ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); ASSERT(mc->mc_rotor != mg); ASSERT(mg->mg_prev == NULL); @@ -641,13 +652,22 @@ metaslab_group_activate(metaslab_group_t *mg) mc->mc_rotor = mg; } +/* + * Passivate a metaslab group and remove it from the allocation rotor. + * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating + * a metaslab group. This function will momentarily drop spa_config_locks + * that are lower than the SCL_ALLOC lock (see comment below). 
+ */ void metaslab_group_passivate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; + spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; + int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); - ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); + ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, + (SCL_ALLOC | SCL_ZIO)); if (--mg->mg_activation_count != 0) { ASSERT(mc->mc_rotor != mg); @@ -657,7 +677,23 @@ metaslab_group_passivate(metaslab_group_t *mg) return; } + /* + * The spa_config_lock is an array of rwlocks, ordered as + * follows (from highest to lowest): + * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > + * SCL_ZIO > SCL_FREE > SCL_VDEV + * (For more information about the spa_config_lock see spa_misc.c) + * The higher the lock, the broader its coverage. When we passivate + * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO + * config locks. However, the metaslab group's taskq might be trying + * to preload metaslabs so we must drop the SCL_ZIO lock and any + * lower locks to allow the I/O to complete. At a minimum, + * we continue to hold the SCL_ALLOC lock, which prevents any future + * allocations from taking place and any changes to the vdev tree. + */ + spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); taskq_wait_outstanding(mg->mg_taskq, 0); + spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); mgprev = mg->mg_prev; @@ -1269,6 +1305,12 @@ metaslab_load(metaslab_t *msp) ASSERT(!msp->ms_loading); msp->ms_loading = B_TRUE; + /* + * Nobody else can manipulate a loading metaslab, so it's now safe + * to drop the lock. This way we don't have to hold the lock while + * reading the spacemap from disk. 
+ */ + mutex_exit(&msp->ms_lock); /* * If the space map has not been allocated yet, then treat @@ -1281,6 +1323,8 @@ metaslab_load(metaslab_t *msp) range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); success = (error == 0); + + mutex_enter(&msp->ms_lock); msp->ms_loading = B_FALSE; if (success) { @@ -1318,6 +1362,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; @@ -1329,7 +1374,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, */ if (object != 0) { error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, - ms->ms_size, vd->vdev_ashift, &ms->ms_lock); + ms->ms_size, vd->vdev_ashift); if (error != 0) { kmem_free(ms, sizeof (metaslab_t)); @@ -1347,7 +1392,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, * data fault on any attempt to use this metaslab before it's ready. */ ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree, - metaslab_rangesize_compare, &ms->ms_lock, 0); + metaslab_rangesize_compare, 0); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms); @@ -1416,6 +1461,7 @@ metaslab_fini(metaslab_t *msp) mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); mutex_destroy(&msp->ms_lock); + mutex_destroy(&msp->ms_sync_lock); kmem_free(msp, sizeof (metaslab_t)); } @@ -1780,14 +1826,11 @@ metaslab_weight(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); /* - * This vdev is in the process of being removed so there is nothing + * If this vdev is in the process of being removed, there is nothing * for us to do here. 
*/ - if (vd->vdev_removing) { - ASSERT0(space_map_allocated(msp->ms_sm)); - ASSERT0(vd->vdev_ms_shift); + if (vd->vdev_removing) return (0); - } metaslab_set_fragmentation(msp); @@ -1922,10 +1965,13 @@ metaslab_group_preload(metaslab_group_t *mg) } mutex_enter(&mg->mg_lock); + /* * Load the next potential metaslabs */ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { + ASSERT3P(msp->ms_group, ==, mg); + /* * We preload only the maximum number of metaslabs specified * by metaslab_preload_limit. If a metaslab is being forced @@ -1952,7 +1998,7 @@ metaslab_group_preload(metaslab_group_t *mg) * * 2. The minimal on-disk space map representation is zfs_condense_pct/100 * times the size than the free space range tree representation - * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB). + * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). * * 3. The on-disk size of the space map should actually decrease. * @@ -2049,7 +2095,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) * a relatively inexpensive operation since we expect these trees to * have a small number of nodes. */ - condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock); + condense_tree = range_tree_create(NULL, NULL); range_tree_add(condense_tree, msp->ms_start, msp->ms_size); /* @@ -2082,7 +2128,6 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) mutex_exit(&msp->ms_lock); space_map_truncate(sm, tx); - mutex_enter(&msp->ms_lock); /* * While we would ideally like to create a space map representation @@ -2099,6 +2144,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) range_tree_destroy(condense_tree); space_map_write(sm, msp->ms_tree, SM_FREE, tx); + mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; } @@ -2148,10 +2194,14 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * The only state that can actually be changing concurrently with * metaslab_sync() is the metaslab's ms_tree. 
No other thread can * be modifying this txg's alloctree, freeingtree, freedtree, or - * space_map_phys_t. Therefore, we only hold ms_lock to satify - * space map ASSERTs. We drop it whenever we call into the DMU, - * because the DMU can call down to us (e.g. via zio_free()) at - * any time. + * space_map_phys_t. We drop ms_lock whenever we could call + * into the DMU, because the DMU can call down to us + * (e.g. via zio_free()) at any time. + * + * The spa_vdev_remove_thread() can be reading metaslab state + * concurrently, and it is locked out by the ms_sync_lock. Note + * that the ms_lock is insufficient for this, because it is dropped + * by space_map_write(). */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); @@ -2163,11 +2213,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, - msp->ms_start, msp->ms_size, vd->vdev_ashift, - &msp->ms_lock)); + msp->ms_start, msp->ms_size, vd->vdev_ashift)); ASSERT(msp->ms_sm != NULL); } + mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* @@ -2183,13 +2233,15 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_should_condense(msp)) { metaslab_condense(msp, txg, tx); } else { + mutex_exit(&msp->ms_lock); space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx); + mutex_enter(&msp->ms_lock); } if (msp->ms_loaded) { /* - * When the space map is loaded, we have an accruate + * When the space map is loaded, we have an accurate * histogram in the range tree. This gives us an opportunity * to bring the space map's histogram up-to-date so we clear * it first before updating it. 
@@ -2257,6 +2309,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * msp->ms_id, sizeof (uint64_t), &object, tx); } + mutex_exit(&msp->ms_sync_lock); dmu_tx_commit(tx); } @@ -2286,23 +2339,19 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) for (int t = 0; t < TXG_SIZE; t++) { ASSERT(msp->ms_alloctree[t] == NULL); - msp->ms_alloctree[t] = range_tree_create(NULL, msp, - &msp->ms_lock); + msp->ms_alloctree[t] = range_tree_create(NULL, NULL); } ASSERT3P(msp->ms_freeingtree, ==, NULL); - msp->ms_freeingtree = range_tree_create(NULL, msp, - &msp->ms_lock); + msp->ms_freeingtree = range_tree_create(NULL, NULL); ASSERT3P(msp->ms_freedtree, ==, NULL); - msp->ms_freedtree = range_tree_create(NULL, msp, - &msp->ms_lock); + msp->ms_freedtree = range_tree_create(NULL, NULL); for (int t = 0; t < TXG_DEFER_SIZE; t++) { ASSERT(msp->ms_defertree[t] == NULL); - msp->ms_defertree[t] = range_tree_create(NULL, msp, - &msp->ms_lock); + msp->ms_defertree[t] = range_tree_create(NULL, NULL); } vdev_space_update(vd, 0, 0, msp->ms_size); @@ -2312,7 +2361,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); - if (free_space <= spa_get_slop_space(spa)) { + if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { defer_allowed = B_FALSE; } @@ -2383,19 +2432,33 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) metaslab_unload(msp); } + ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_freeingtree)); + ASSERT0(range_tree_space(msp->ms_freedtree)); + mutex_exit(&msp->ms_lock); } void metaslab_sync_reassess(metaslab_group_t *mg) { + spa_t *spa = mg->mg_class->mc_spa; + + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); metaslab_group_alloc_update(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); /* - * Preload the next potential metaslabs + * Preload 
the next potential metaslabs but only on active + * metaslab groups. We can get into a state where the metaslab group + * is no longer active since we dirty metaslabs as we remove + * a device, thus potentially making the metaslab group eligible + * for preloading. */ - metaslab_group_preload(mg); + if (mg->mg_activation_count > 0) { + metaslab_group_preload(mg); + } + spa_config_exit(spa, SCL_ALLOC, FTAG); } static uint64_t @@ -2875,7 +2938,7 @@ int ditto_same_vdev_distance_shift = 3; /* * Allocate a block for the specified i/o. */ -static int +int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, zio_alloc_list_t *zal) @@ -2921,10 +2984,11 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, /* * It's possible the vdev we're using as the hint no - * longer exists (i.e. removed). Consult the rotor when + * longer exists or its mg has been closed (e.g. by + * device removal). Consult the rotor when * all else fails. 
*/ - if (vd != NULL) { + if (vd != NULL && vd->vdev_mg != NULL) { mg = vd->vdev_mg; if (flags & METASLAB_HINTBP_AVOID && @@ -3116,18 +3180,228 @@ next: return (SET_ERROR(ENOSPC)); } +void +metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, + uint64_t txg) +{ + metaslab_t *msp; + ASSERTV(spa_t *spa = vd->vdev_spa); + + ASSERT3U(txg, ==, spa->spa_syncing_txg); + ASSERT(vdev_is_concrete(vd)); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + VERIFY(!msp->ms_condensing); + VERIFY3U(offset, >=, msp->ms_start); + VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); + + metaslab_check_free_impl(vd, offset, asize); + mutex_enter(&msp->ms_lock); + if (range_tree_space(msp->ms_freeingtree) == 0) { + vdev_dirty(vd, VDD_METASLAB, msp, txg); + } + range_tree_add(msp->ms_freeingtree, offset, asize); + mutex_exit(&msp->ms_lock); +} + +/* ARGSUSED */ +void +metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + uint64_t *txgp = arg; + + if (vd->vdev_ops->vdev_op_remap != NULL) + vdev_indirect_mark_obsolete(vd, offset, size, *txgp); + else + metaslab_free_impl(vd, offset, size, *txgp); +} + +static void +metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, + uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + + if (txg > spa_freeze_txg(spa)) + return; + + if (spa->spa_vdev_removal != NULL && + spa->spa_vdev_removal->svr_vdev == vd && + vdev_is_concrete(vd)) { + /* + * Note: we check if the vdev is concrete because when + * we complete the removal, we first change the vdev to be + * an indirect vdev (in open context), and then (in syncing + * context) clear spa_vdev_removal. 
+ */ + free_from_removing_vdev(vd, offset, size, txg); + } else if (vd->vdev_ops->vdev_op_remap != NULL) { + vdev_indirect_mark_obsolete(vd, offset, size, txg); + vd->vdev_ops->vdev_op_remap(vd, offset, size, + metaslab_free_impl_cb, &txg); + } else { + metaslab_free_concrete(vd, offset, size, txg); + } +} + +typedef struct remap_blkptr_cb_arg { + blkptr_t *rbca_bp; + spa_remap_cb_t rbca_cb; + vdev_t *rbca_remap_vd; + uint64_t rbca_remap_offset; + void *rbca_cb_arg; +} remap_blkptr_cb_arg_t; + +void +remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + remap_blkptr_cb_arg_t *rbca = arg; + blkptr_t *bp = rbca->rbca_bp; + + /* We can not remap split blocks. */ + if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) + return; + ASSERT0(inner_offset); + + if (rbca->rbca_cb != NULL) { + /* + * At this point we know that we are not handling split + * blocks and we invoke the callback on the previous + * vdev which must be indirect. + */ + ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); + + rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, + rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); + + /* set up remap_blkptr_cb_arg for the next call */ + rbca->rbca_remap_vd = vd; + rbca->rbca_remap_offset = offset; + } + + /* + * The phys birth time is that of dva[0]. This ensures that we know + * when each dva was written, so that resilver can determine which + * blocks need to be scrubbed (i.e. those written during the time + * the vdev was offline). It also ensures that the key used in + * the ARC hash table is unique (i.e. dva[0] + phys_birth). If + * we didn't change the phys_birth, a lookup in the ARC for a + * remapped BP could find the data that was previously stored at + * this vdev + offset. 
+ */ + vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, + DVA_GET_VDEV(&bp->blk_dva[0])); + vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; + bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, + DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); + + DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); + DVA_SET_OFFSET(&bp->blk_dva[0], offset); +} + /* - * Free the block represented by DVA in the context of the specified - * transaction group. + * If the block pointer contains any indirect DVAs, modify them to refer to + * concrete DVAs. Note that this will sometimes not be possible, leaving + * the indirect DVA in place. This happens if the indirect DVA spans multiple + * segments in the mapping (i.e. it is a "split block"). + * + * If the BP was remapped, calls the callback on the original dva (note the + * callback can be called multiple times if the original indirect DVA refers + * to another indirect DVA, etc). + * + * Returns TRUE if the BP was remapped. */ -static void -metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) +boolean_t +spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) { - uint64_t vdev = DVA_GET_VDEV(dva); + remap_blkptr_cb_arg_t rbca; + + if (!zfs_remap_blkptr_enable) + return (B_FALSE); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) + return (B_FALSE); + + /* + * Dedup BP's can not be remapped, because ddt_phys_select() depends + * on DVA[0] being the same in the BP as in the DDT (dedup table). + */ + if (BP_GET_DEDUP(bp)) + return (B_FALSE); + + /* + * Gang blocks can not be remapped, because + * zio_checksum_gang_verifier() depends on the DVA[0] that's in + * the BP used to read the gang block header (GBH) being the same + * as the DVA[0] that we allocated for the GBH. + */ + if (BP_IS_GANG(bp)) + return (B_FALSE); + + /* + * Embedded BP's have no DVA to remap. 
+ */ + if (BP_GET_NDVAS(bp) < 1) + return (B_FALSE); + + /* + * Note: we only remap dva[0]. If we remapped other dvas, we + * would no longer know what their phys birth txg is. + */ + dva_t *dva = &bp->blk_dva[0]; + uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); - vdev_t *vd; + vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + + if (vd->vdev_ops->vdev_op_remap == NULL) + return (B_FALSE); + + rbca.rbca_bp = bp; + rbca.rbca_cb = callback; + rbca.rbca_remap_vd = vd; + rbca.rbca_remap_offset = offset; + rbca.rbca_cb_arg = arg; + + /* + * remap_blkptr_cb() will be called in order for each level of + * indirection, until a concrete vdev is reached or a split block is + * encountered. old_vd and old_offset are updated within the callback + * as we go from the one indirect vdev to the next one (either concrete + * or indirect again) in that order. + */ + vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); + + /* Check if the DVA wasn't remapped because it is a split block */ + if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Undo the allocation of a DVA which happened in the given transaction group. 
+ */ +void +metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +{ metaslab_t *msp; + vdev_t *vd; + uint64_t vdev = DVA_GET_VDEV(dva); + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + + ASSERT(DVA_IS_VALID(dva)); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (txg > spa_freeze_txg(spa)) return; @@ -3140,91 +3414,51 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) return; } - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(!vd->vdev_removing); + ASSERT(vdev_is_concrete(vd)); + ASSERT0(vd->vdev_indirect_config.vic_mapping_object); + ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); if (DVA_GET_GANG(dva)) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - mutex_enter(&msp->ms_lock); - - if (now) { - range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], - offset, size); + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - VERIFY(!msp->ms_condensing); - VERIFY3U(offset, >=, msp->ms_start); - VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); - VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, - msp->ms_size); - VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); - VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - range_tree_add(msp->ms_tree, offset, size); - msp->ms_max_size = metaslab_block_maxsize(msp); - } else { - VERIFY3U(txg, ==, spa->spa_syncing_txg); - if (range_tree_space(msp->ms_freeingtree) == 0) - vdev_dirty(vd, VDD_METASLAB, msp, txg); - range_tree_add(msp->ms_freeingtree, offset, size); - } + mutex_enter(&msp->ms_lock); + range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], + offset, size); + VERIFY(!msp->ms_condensing); + VERIFY3U(offset, >=, msp->ms_start); + VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); + VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, + msp->ms_size); + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + range_tree_add(msp->ms_tree, offset, size); 
mutex_exit(&msp->ms_lock); } /* - * Intent log support: upon opening the pool after a crash, notify the SPA - * of blocks that the intent log has allocated for immediate write, but - * which are still considered free by the SPA because the last transaction - * group didn't commit yet. + * Free the block represented by DVA in the context of the specified + * transaction group. */ -static int -metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +void +metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); - vdev_t *vd; - metaslab_t *msp; - int error = 0; + vdev_t *vd = vdev_lookup_top(spa, vdev); ASSERT(DVA_IS_VALID(dva)); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - if ((vd = vdev_lookup_top(spa, vdev)) == NULL || - (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) - return (SET_ERROR(ENXIO)); - - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - - if (DVA_GET_GANG(dva)) + if (DVA_GET_GANG(dva)) { size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - - mutex_enter(&msp->ms_lock); - - if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); - - if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) - error = SET_ERROR(ENOENT); - - if (error || txg == 0) { /* txg == 0 indicates dry run */ - mutex_exit(&msp->ms_lock); - return (error); } - VERIFY(!msp->ms_condensing); - VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); - VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); - range_tree_remove(msp->ms_tree, offset, size); - - if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ - if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) - vdev_dirty(vd, VDD_METASLAB, msp, txg); - range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); - } - - 
mutex_exit(&msp->ms_lock); - - return (0); + metaslab_free_impl(vd, offset, size, txg); } /* @@ -3275,6 +3509,122 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) mutex_exit(&mc->mc_lock); } +static int +metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, + uint64_t txg) +{ + metaslab_t *msp; + spa_t *spa = vd->vdev_spa; + int error = 0; + + if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) + return (ENXIO); + + ASSERT3P(vd->vdev_ms, !=, NULL); + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + mutex_enter(&msp->ms_lock); + + if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + + if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) + error = SET_ERROR(ENOENT); + + if (error || txg == 0) { /* txg == 0 indicates dry run */ + mutex_exit(&msp->ms_lock); + return (error); + } + + VERIFY(!msp->ms_condensing); + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); + range_tree_remove(msp->ms_tree, offset, size); + + if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ + if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) + vdev_dirty(vd, VDD_METASLAB, msp, txg); + range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); + } + + mutex_exit(&msp->ms_lock); + + return (0); +} + +typedef struct metaslab_claim_cb_arg_t { + uint64_t mcca_txg; + int mcca_error; +} metaslab_claim_cb_arg_t; + +/* ARGSUSED */ +static void +metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + metaslab_claim_cb_arg_t *mcca_arg = arg; + + if (mcca_arg->mcca_error == 0) { + mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, + size, mcca_arg->mcca_txg); + } +} + +int +metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) +{ + if 
(vd->vdev_ops->vdev_op_remap != NULL) { + metaslab_claim_cb_arg_t arg; + + /* + * Only zdb(1M) can claim on indirect vdevs. This is used + * to detect leaks of mapped space (that are not accounted + * for in the obsolete counts, spacemap, or bpobj). + */ + ASSERT(!spa_writeable(vd->vdev_spa)); + arg.mcca_error = 0; + arg.mcca_txg = txg; + + vd->vdev_ops->vdev_op_remap(vd, offset, size, + metaslab_claim_impl_cb, &arg); + + if (arg.mcca_error == 0) { + arg.mcca_error = metaslab_claim_concrete(vd, + offset, size, txg); + } + return (arg.mcca_error); + } else { + return (metaslab_claim_concrete(vd, offset, size, txg)); + } +} + +/* + * Intent log support: upon opening the pool after a crash, notify the SPA + * of blocks that the intent log has allocated for immediate write, but + * which are still considered free by the SPA because the last transaction + * group didn't commit yet. + */ +static int +metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +{ + uint64_t vdev = DVA_GET_VDEV(dva); + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + vdev_t *vd; + + if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { + return (SET_ERROR(ENXIO)); + } + + ASSERT(DVA_IS_VALID(dva)); + + if (DVA_GET_GANG(dva)) + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + + return (metaslab_claim_impl(vd, offset, size, txg)); +} + int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, @@ -3304,7 +3654,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, txg, flags, zal); if (error != 0) { for (d--; d >= 0; d--) { - metaslab_free_dva(spa, &dva[d], txg, B_TRUE); + metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, DVA_GET_VDEV(&dva[d]), zio, flags); bzero(&dva[d], sizeof (dva_t)); @@ -3342,8 +3692,13 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) spa_config_enter(spa, SCL_FREE, FTAG, 
RW_READER); - for (int d = 0; d < ndvas; d++) - metaslab_free_dva(spa, &dva[d], txg, now); + for (int d = 0; d < ndvas; d++) { + if (now) { + metaslab_unalloc_dva(spa, &dva[d], txg); + } else { + metaslab_free_dva(spa, &dva[d], txg); + } + } spa_config_exit(spa, SCL_FREE, FTAG); } @@ -3428,6 +3783,49 @@ metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) spa_config_exit(spa, SCL_VDEV, FTAG); } +/* ARGSUSED */ +static void +metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + if (vd->vdev_ops == &vdev_indirect_ops) + return; + + metaslab_check_free_impl(vd, offset, size); +} + +static void +metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) +{ + metaslab_t *msp; + ASSERTV(spa_t *spa = vd->vdev_spa); + + if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) + return; + + if (vd->vdev_ops->vdev_op_remap != NULL) { + vd->vdev_ops->vdev_op_remap(vd, offset, size, + metaslab_check_free_impl_cb, NULL); + return; + } + + ASSERT(vdev_is_concrete(vd)); + ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + mutex_enter(&msp->ms_lock); + if (msp->ms_loaded) + range_tree_verify(msp->ms_tree, offset, size); + + range_tree_verify(msp->ms_freeingtree, offset, size); + range_tree_verify(msp->ms_freedtree, offset, size); + for (int j = 0; j < TXG_DEFER_SIZE; j++) + range_tree_verify(msp->ms_defertree[j], offset, size); + mutex_exit(&msp->ms_lock); +} + void metaslab_check_free(spa_t *spa, const blkptr_t *bp) { @@ -3440,15 +3838,13 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp) vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); - metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - if (msp->ms_loaded) - range_tree_verify(msp->ms_tree, offset, size); + if (DVA_GET_GANG(&bp->blk_dva[i])) + size 
= vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + + ASSERT3P(vd, !=, NULL); - range_tree_verify(msp->ms_freeingtree, offset, size); - range_tree_verify(msp->ms_freedtree, offset, size); - for (int j = 0; j < TXG_DEFER_SIZE; j++) - range_tree_verify(msp->ms_defertree[j], offset, size); + metaslab_check_free_impl(vd, offset, size); } spa_config_exit(spa, SCL_VDEV, FTAG); } @@ -3502,4 +3898,9 @@ MODULE_PARM_DESC(zfs_metaslab_segment_weight_enabled, module_param(zfs_metaslab_switch_threshold, int, 0644); MODULE_PARM_DESC(zfs_metaslab_switch_threshold, "segment-based metaslab selection maximum buckets before switching"); + +/* CSTYLED */ +module_param(metaslab_gang_bang, ulong, 0644); +MODULE_PARM_DESC(metaslab_gang_bang, + "blocks larger than this size are forced to be gang blocks"); #endif /* _KERNEL && HAVE_SPL */ diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 4d3c7401e..48aab673a 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -210,15 +210,13 @@ mmp_random_leaf_impl(vdev_t *vd, int *fail_mask) { int child_idx; - if (!vdev_writeable(vd)) { - *fail_mask |= MMP_FAIL_NOT_WRITABLE; - return (NULL); - } - if (vd->vdev_ops->vdev_op_leaf) { vdev_t *ret; - if (vd->vdev_mmp_pending != 0) { + if (!vdev_writeable(vd)) { + *fail_mask |= MMP_FAIL_NOT_WRITABLE; + ret = NULL; + } else if (vd->vdev_mmp_pending != 0) { *fail_mask |= MMP_FAIL_WRITE_PENDING; ret = NULL; } else { @@ -228,6 +226,9 @@ mmp_random_leaf_impl(vdev_t *vd, int *fail_mask) return (ret); } + if (vd->vdev_children == 0) + return (NULL); + child_idx = spa_get_random(vd->vdev_children); for (int offset = vd->vdev_children; offset > 0; offset--) { vdev_t *leaf; diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 01ef463ec..baa655d39 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -125,18 +125,6 @@ range_tree_stat_verify(range_tree_t *rt) } } -/* - * Changes out the lock used by the range tree. Useful when you are moving - * the range tree between containing structures without having to recreate - * it. Both the old and new locks must be held by the caller. - */ -void -range_tree_set_lock(range_tree_t *rt, kmutex_t *lp) -{ - ASSERT(MUTEX_HELD(rt->rt_lock) && MUTEX_HELD(lp)); - rt->rt_lock = lp; -} - static void range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) { @@ -147,7 +135,6 @@ range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) ASSERT3U(idx, <, sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); - ASSERT(MUTEX_HELD(rt->rt_lock)); rt->rt_histogram[idx]++; ASSERT3U(rt->rt_histogram[idx], !=, 0); } @@ -162,7 +149,6 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) ASSERT3U(idx, <, sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); - ASSERT(MUTEX_HELD(rt->rt_lock)); ASSERT3U(rt->rt_histogram[idx], !=, 0); rt->rt_histogram[idx]--; } @@ -184,14 +170,13 @@ range_tree_seg_compare(const void *x1, const void *x2) range_tree_t * range_tree_create_impl(range_tree_ops_t *ops, void *arg, - int (*avl_compare) (const void *, const void *), kmutex_t *lp, uint64_t gap) + int (*avl_compare) (const void *, const void *), uint64_t gap) { range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); avl_create(&rt->rt_root, range_tree_seg_compare, sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); - rt->rt_lock = lp; rt->rt_ops = ops; rt->rt_gap = gap; rt->rt_arg = arg; @@ -204,9 +189,9 @@ range_tree_create_impl(range_tree_ops_t *ops, void *arg, } range_tree_t * -range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp) +range_tree_create(range_tree_ops_t *ops, void *arg) { - return (range_tree_create_impl(ops, arg, NULL, lp, 0)); + return (range_tree_create_impl(ops, arg, NULL, 0)); } void @@ -224,8 +209,6 @@ range_tree_destroy(range_tree_t *rt) void 
range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta) { - ASSERT(MUTEX_HELD(rt->rt_lock)); - ASSERT3U(rs->rs_fill + delta, !=, 0); ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start); @@ -246,7 +229,6 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) uint64_t bridge_size = 0; boolean_t merge_before, merge_after; - ASSERT(MUTEX_HELD(rt->rt_lock)); ASSERT3U(size, !=, 0); ASSERT3U(fill, <=, size); @@ -383,7 +365,6 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, uint64_t end = start + size; boolean_t left_over, right_over; - ASSERT(MUTEX_HELD(rt->rt_lock)); VERIFY3U(size, !=, 0); VERIFY3U(size, <=, rt->rt_space); @@ -493,8 +474,6 @@ range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, { int64_t delta = newsize - (rs->rs_end - rs->rs_start); - ASSERT(MUTEX_HELD(rt->rt_lock)); - range_tree_stat_decr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); @@ -516,7 +495,6 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) range_seg_t rsearch; uint64_t end = start + size; - ASSERT(MUTEX_HELD(rt->rt_lock)); VERIFY(size != 0); rsearch.rs_start = start; @@ -538,11 +516,9 @@ range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size) { range_seg_t *rs; - mutex_enter(rt->rt_lock); rs = range_tree_find(rt, off, size); if (rs != NULL) panic("freeing free block; rs=%p", (void *)rs); - mutex_exit(rt->rt_lock); } boolean_t @@ -560,6 +536,9 @@ range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) { range_seg_t *rs; + if (size == 0) + return; + while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { uint64_t free_start = MAX(rs->rs_start, start); uint64_t free_end = MIN(rs->rs_end, start + size); @@ -572,7 +551,6 @@ range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) { range_tree_t *rt; - ASSERT(MUTEX_HELD((*rtsrc)->rt_lock)); ASSERT0(range_tree_space(*rtdst)); 
ASSERT0(avl_numnodes(&(*rtdst)->rt_root)); @@ -587,8 +565,6 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) range_seg_t *rs; void *cookie = NULL; - ASSERT(MUTEX_HELD(rt->rt_lock)); - if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) rt->rt_ops->rtop_vacate(rt, rt->rt_arg); @@ -607,8 +583,6 @@ range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) { range_seg_t *rs; - ASSERT(MUTEX_HELD(rt->rt_lock)); - for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) func(arg, rs->rs_start, rs->rs_end - rs->rs_start); } @@ -616,7 +590,6 @@ range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) range_seg_t * range_tree_first(range_tree_t *rt) { - ASSERT(MUTEX_HELD(rt->rt_lock)); return (avl_first(&rt->rt_root)); } diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 53b5aabf0..08fc7bbda 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -52,6 +52,9 @@ #include <sys/zil.h> #include <sys/ddt.h> #include <sys/vdev_impl.h> +#include <sys/vdev_removal.h> +#include <sys/vdev_indirect_mapping.h> +#include <sys/vdev_indirect_births.h> #include <sys/vdev_disk.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> @@ -59,6 +62,7 @@ #include <sys/uberblock_impl.h> #include <sys/txg.h> #include <sys/avl.h> +#include <sys/bpobj.h> #include <sys/dmu_traverse.h> #include <sys/dmu_objset.h> #include <sys/unique.h> @@ -96,7 +100,7 @@ * The interval, in seconds, at which failed configuration cache file writes * should be retried. 
*/ -static int zfs_ccw_retry_interval = 300; +int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ @@ -150,14 +154,11 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ }; -static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, - const char *name); -static void spa_event_post(sysevent_t *ev); static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, - spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, + spa_load_state_t state, spa_import_type_t type, boolean_t trust_config, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); @@ -817,7 +818,7 @@ spa_change_guid(spa_t *spa) spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } @@ -1149,6 +1150,9 @@ spa_activate(spa_t *spa, int mode) spa_create_zio_taskqs(spa); } + for (size_t i = 0; i < TXG_SIZE; i++) + spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0); + list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_evicting_os_list, sizeof (objset_t), @@ -1245,6 +1249,12 @@ spa_deactivate(spa_t *spa) } } + for (size_t i = 0; i < TXG_SIZE; i++) { + ASSERT3P(spa->spa_txg_zio[i], !=, NULL); + VERIFY0(zio_wait(spa->spa_txg_zio[i])); + spa->spa_txg_zio[i] = NULL; + } + metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; @@ -1385,6 +1395,13 @@ spa_unload(spa_t *spa) spa->spa_async_zio_root = NULL; } + if (spa->spa_vdev_removal != NULL) { + spa_vdev_removal_destroy(spa->spa_vdev_removal); + spa->spa_vdev_removal = 
NULL; + } + + spa_condense_fini(spa); + bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -1442,6 +1459,8 @@ spa_unload(spa_t *spa) spa->spa_async_suspended = 0; + spa->spa_indirect_vdevs_loaded = B_FALSE; + if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; @@ -1456,7 +1475,7 @@ spa_unload(spa_t *spa) * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. */ -static void +void spa_load_spares(spa_t *spa) { nvlist_t **spares; @@ -1573,7 +1592,7 @@ spa_load_spares(spa_t *spa) * Devices which are already active have their details maintained, and are * not re-opened. */ -static void +void spa_load_l2cache(spa_t *spa) { nvlist_t **l2cache = NULL; @@ -1734,7 +1753,7 @@ spa_check_removed(vdev_t *vd) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && - !vd->vdev_ishole) { + vdev_is_concrete(vd)) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); } @@ -1817,27 +1836,26 @@ spa_config_valid(spa_t *spa, nvlist_t *config) /* * Resolve any "missing" vdevs in the current configuration. + * Also trust the MOS config about any "indirect" vdevs. * If we find that the MOS config has more accurate information * about the top-level vdev then use that vdev instead. */ - if (tvd->vdev_ops == &vdev_missing_ops && - mtvd->vdev_ops != &vdev_missing_ops) { - - if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) - continue; + if ((tvd->vdev_ops == &vdev_missing_ops && + mtvd->vdev_ops != &vdev_missing_ops) || + (mtvd->vdev_ops == &vdev_indirect_ops && + tvd->vdev_ops != &vdev_indirect_ops)) { /* * Device specific actions. 
*/ if (mtvd->vdev_islog) { + if (!(spa->spa_import_flags & + ZFS_IMPORT_MISSING_LOG)) { + continue; + } + spa_set_log_state(spa, SPA_LOG_CLEAR); - } else { - /* - * XXX - once we have 'readonly' pool - * support we should be able to handle - * missing data devices by transitioning - * the pool to readonly. - */ + } else if (mtvd->vdev_ops != &vdev_indirect_ops) { continue; } @@ -1851,10 +1869,6 @@ spa_config_valid(spa_t *spa, nvlist_t *config) vdev_add_child(rvd, mtvd); vdev_add_child(mrvd, tvd); - spa_config_exit(spa, SCL_ALL, FTAG); - vdev_load(mtvd); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_reopen(rvd); } else { if (mtvd->vdev_islog) { @@ -1873,6 +1887,14 @@ spa_config_valid(spa_t *spa, nvlist_t *config) */ spa_config_valid_zaps(tvd, mtvd); } + + /* + * Never trust this info from userland; always use what's + * in the MOS. This prevents it from getting out of sync + * with the rest of the info in the MOS. + */ + tvd->vdev_removing = mtvd->vdev_removing; + tvd->vdev_indirect_config = mtvd->vdev_indirect_config; } vdev_free(mrvd); @@ -1949,11 +1971,11 @@ spa_activate_log(spa_t *spa) } int -spa_offline_log(spa_t *spa) +spa_reset_logs(spa_t *spa) { int error; - error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + error = dmu_objset_find(spa_name(spa), zil_reset, NULL, DS_FIND_CHILDREN); if (error == 0) { /* @@ -2155,7 +2177,7 @@ static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); - return (err); + return (SET_ERROR(err)); } /* @@ -2547,7 +2569,7 @@ out: __attribute__((always_inline)) static inline int spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, + spa_load_state_t state, spa_import_type_t type, boolean_t trust_config, char **ereport) { int error = 0; @@ -2566,7 +2588,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * If this is an untrusted config, access the pool in 
read-only mode. * This prevents things like resilvering recently removed devices. */ - if (!mosconfig) + if (!trust_config) spa->spa_mode = FREAD; ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -2634,7 +2656,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, */ if (type != SPA_IMPORT_ASSEMBLE) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd, mosconfig); + error = vdev_validate(rvd, trust_config); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) @@ -2755,7 +2777,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * can handle missing vdevs. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, - &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && + &children) != 0 && trust_config && type != SPA_IMPORT_ASSEMBLE && rvd->vdev_guid_sum != ub->ub_guid_sum) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); @@ -2779,6 +2801,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; + /* + * Everything that we read before we do spa_remove_init() must + * have been rewritten after the last device removal was initiated. + * Otherwise we could be reading from indirect vdevs before + * we have loaded their mappings. + */ + error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); @@ -2787,6 +2816,41 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + /* + * Validate the config, using the MOS config to fill in any + * information which might be missing. If we fail to validate + * the config then declare the pool unfit for use. If we're + * assembling a pool from a split, the log is not transferred + * over. 
+ */ + if (type != SPA_IMPORT_ASSEMBLE) { + nvlist_t *mos_config; + if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + if (!spa_config_valid(spa, mos_config)) { + nvlist_free(mos_config); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, + ENXIO)); + } + nvlist_free(mos_config); + + /* + * Now that we've validated the config, check the state of the + * root vdev. If it can't be opened, it indicates one or + * more toplevel vdevs are faulted. + */ + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (SET_ERROR(ENXIO)); + } + + /* + * Everything that we read before spa_remove_init() must be stored + * on concreted vdevs. Therefore we do this as early as possible. + */ + if (spa_remove_init(spa) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; @@ -2894,33 +2958,34 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - if (!mosconfig) { + if (!trust_config) { uint64_t hostid; - nvlist_t *policy = NULL, *nvconfig; + nvlist_t *policy = NULL; + nvlist_t *mos_config; - if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) + if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, + if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { char *hostname; unsigned long myhostid = 0; - VERIFY(nvlist_lookup_string(nvconfig, + VERIFY(nvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); myhostid = spa_get_hostid(); if (hostid && myhostid && hostid != myhostid) { - nvlist_free(nvconfig); + nvlist_free(mos_config); return (SET_ERROR(EBADF)); } } if (nvlist_lookup_nvlist(spa->spa_config, 
ZPOOL_REWIND_POLICY, &policy) == 0) - VERIFY(nvlist_add_nvlist(nvconfig, + VERIFY(nvlist_add_nvlist(mos_config, ZPOOL_REWIND_POLICY, policy) == 0); - spa_config_set(spa, nvconfig); + spa_config_set(spa, mos_config); spa_unload(spa); spa_deactivate(spa); spa_activate(spa, orig_mode); @@ -3120,7 +3185,15 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* * Load the vdev state for all toplevel vdevs. */ - vdev_load(rvd); + error = vdev_load(rvd); + if (error != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } + + error = spa_condense_init(spa); + if (error != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } /* * Propagate the leaf DTLs we just loaded all the way up the tree. @@ -3138,38 +3211,10 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa_update_dspace(spa); - /* - * Validate the config, using the MOS config to fill in any - * information which might be missing. If we fail to validate - * the config then declare the pool unfit for use. If we're - * assembling a pool from a split, the log is not transferred - * over. - */ - if (type != SPA_IMPORT_ASSEMBLE) { - nvlist_t *nvconfig; - - if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - if (!spa_config_valid(spa, nvconfig)) { - nvlist_free(nvconfig); - return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, - ENXIO)); - } - nvlist_free(nvconfig); - - /* - * Now that we've validated the config, check the state of the - * root vdev. If it can't be opened, it indicates one or - * more toplevel vdevs are faulted. 
- */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) - return (SET_ERROR(ENXIO)); - - if (spa_writeable(spa) && spa_check_logs(spa)) { - *ereport = FM_EREPORT_ZFS_LOG_REPLAY; - return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); - } + if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa) && + spa_check_logs(spa)) { + *ereport = FM_EREPORT_ZFS_LOG_REPLAY; + return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } if (missing_feat_write) { @@ -3199,6 +3244,18 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, int need_update = B_FALSE; dsl_pool_t *dp = spa_get_dsl(spa); + /* + * We must check this before we start the sync thread, because + * we only want to start a condense thread for condense + * operations that were in progress when the pool was + * imported. Once we start syncing, spa_sync() could + * initiate a condense (and start a thread for it). In + * that case it would be wrong to start a second + * condense thread. + */ + boolean_t condense_in_progress = + (spa->spa_condensing_indirect != NULL); + ASSERT(state != SPA_LOAD_TRYIMPORT); /* @@ -3278,6 +3335,16 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * Clean up any stale temporary dataset userrefs. */ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); + + /* + * Note: unlike condensing, we don't need an analogous + * "removal_in_progress" dance because no other thread + * can start a removal while we hold the spa_namespace_lock. 
+ */ + spa_restart_removal(spa); + + if (condense_in_progress) + spa_condense_indirect_restart(spa); } return (0); @@ -3463,7 +3530,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, */ spa_unload(spa); spa_deactivate(spa); - spa_config_sync(spa, B_TRUE, B_TRUE); + spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); @@ -4098,6 +4165,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; spa->spa_load_state = SPA_LOAD_CREATE; + spa->spa_removing_phys.sr_state = DSS_NONE; + spa->spa_removing_phys.sr_removing_vdev = -1; + spa->spa_removing_phys.sr_prev_indirect_vdev = -1; /* * Create "The Godfather" zio to hold all async IOs @@ -4283,7 +4353,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (dp->dp_root_dir->dd_crypto_obj != 0) VERIFY0(spa_keystore_remove_mapping(spa, root_dsobj, FTAG)); - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); /* * Don't count references from objsets that are already closed @@ -4344,7 +4414,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) if (props != NULL) spa_configfile_set(spa, props, B_FALSE); - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); mutex_exit(&spa_namespace_lock); @@ -4688,7 +4758,7 @@ export_spa: if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) - spa_config_sync(spa, B_TRUE, B_TRUE); + spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); } mutex_exit(&spa_namespace_lock); @@ -4780,8 +4850,41 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) return (spa_vdev_exit(spa, vd, txg, error)); /* - * Transfer each new top-level vdev from vd to rvd. + * If we are in the middle of a device removal, we can only add + * devices which match the existing devices in the pool. 
+ * If we are in the middle of a removal, or have some indirect + * vdevs, we can not add raidz toplevels. */ + if (spa->spa_vdev_removal != NULL || + spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + for (int c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + if (spa->spa_vdev_removal != NULL && + tvd->vdev_ashift != + spa->spa_vdev_removal->svr_vdev->vdev_ashift) { + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + } + /* Fail if top level vdev is raidz */ + if (tvd->vdev_ops == &vdev_raidz_ops) { + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + } + /* + * Need the top level mirror to be + * a mirror of leaf vdevs only + */ + if (tvd->vdev_ops == &vdev_mirror_ops) { + for (uint64_t cid = 0; + cid < tvd->vdev_children; cid++) { + vdev_t *cvd = tvd->vdev_child[cid]; + if (!cvd->vdev_ops->vdev_op_leaf) { + return (spa_vdev_exit(spa, vd, + txg, EINVAL)); + } + } + } + } + } + for (int c = 0; c < vd->vdev_children; c++) { /* @@ -4867,6 +4970,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (spa->spa_vdev_removal != NULL || + spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + } + if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -5317,7 +5425,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); - error = spa_offline_log(spa); + error = spa_reset_logs(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) @@ -5345,7 +5453,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ - if (vd->vdev_islog || vd->vdev_ishole) { + if (vd->vdev_islog || !vdev_is_concrete(vd)) { if (lastlog == 0) lastlog = c; continue; @@ -5398,7 +5506,7 @@ 
spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || - vml[c]->vdev_ishole || + !vdev_is_concrete(vml[c]) || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || @@ -5588,257 +5696,6 @@ out: return (error); } -static nvlist_t * -spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) -{ - for (int i = 0; i < count; i++) { - uint64_t guid; - - VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, - &guid) == 0); - - if (guid == target_guid) - return (nvpp[i]); - } - - return (NULL); -} - -static void -spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, - nvlist_t *dev_to_remove) -{ - nvlist_t **newdev = NULL; - - if (count > 1) - newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); - - for (int i = 0, j = 0; i < count; i++) { - if (dev[i] == dev_to_remove) - continue; - VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); - } - - VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); - VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); - - for (int i = 0; i < count - 1; i++) - nvlist_free(newdev[i]); - - if (count > 1) - kmem_free(newdev, (count - 1) * sizeof (void *)); -} - -/* - * Evacuate the device. - */ -static int -spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) -{ - uint64_t txg; - int error = 0; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); - ASSERT(vd == vd->vdev_top); - - /* - * Evacuate the device. We don't hold the config lock as writer - * since we need to do I/O but we do keep the - * spa_namespace_lock held. Once this completes the device - * should no longer have any blocks allocated on it. 
- */ - if (vd->vdev_islog) { - if (vd->vdev_stat.vs_alloc != 0) - error = spa_offline_log(spa); - } else { - error = SET_ERROR(ENOTSUP); - } - - if (error) - return (error); - - /* - * The evacuation succeeded. Remove any remaining MOS metadata - * associated with this vdev, and wait for these changes to sync. - */ - ASSERT0(vd->vdev_stat.vs_alloc); - txg = spa_vdev_config_enter(spa); - vd->vdev_removing = B_TRUE; - vdev_dirty_leaves(vd, VDD_DTL, txg); - vdev_config_dirty(vd); - spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); - - return (0); -} - -/* - * Complete the removal by cleaning up the namespace. - */ -static void -spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) -{ - vdev_t *rvd = spa->spa_root_vdev; - uint64_t id = vd->vdev_id; - boolean_t last_vdev = (id == (rvd->vdev_children - 1)); - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - ASSERT(vd == vd->vdev_top); - - /* - * Only remove any devices which are empty. - */ - if (vd->vdev_stat.vs_alloc != 0) - return; - - (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); - - if (list_link_active(&vd->vdev_state_dirty_node)) - vdev_state_clean(vd); - if (list_link_active(&vd->vdev_config_dirty_node)) - vdev_config_clean(vd); - - vdev_free(vd); - - if (last_vdev) { - vdev_compact_children(rvd); - } else { - vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); - vdev_add_child(rvd, vd); - } - vdev_config_dirty(rvd); - - /* - * Reassess the health of our root vdev. - */ - vdev_reopen(rvd); -} - -/* - * Remove a device from the pool - - * - * Removing a device from the vdev namespace requires several steps - * and can take a significant amount of time. As a result we use - * the spa_vdev_config_[enter/exit] functions which allow us to - * grab and release the spa_config_lock while still holding the namespace - * lock. During each step the configuration is synced out. 
- * - * Currently, this supports removing only hot spares, slogs, and level 2 ARC - * devices. - */ -int -spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) -{ - vdev_t *vd; - sysevent_t *ev = NULL; - metaslab_group_t *mg; - nvlist_t **spares, **l2cache, *nv; - uint64_t txg = 0; - uint_t nspares, nl2cache; - int error = 0; - boolean_t locked = MUTEX_HELD(&spa_namespace_lock); - - ASSERT(spa_writeable(spa)); - - if (!locked) - txg = spa_vdev_enter(spa); - - vd = spa_lookup_by_guid(spa, guid, B_FALSE); - - if (spa->spa_spares.sav_vdevs != NULL && - nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && - (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { - /* - * Only remove the hot spare if it's not currently in use - * in this pool. - */ - if (vd == NULL || unspare) { - if (vd == NULL) - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_AUX); - spa_vdev_remove_aux(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares, nv); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; - } else { - error = SET_ERROR(EBUSY); - } - } else if (spa->spa_l2cache.sav_vdevs != NULL && - nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && - (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { - /* - * Cache devices can always be removed. - */ - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); - spa_vdev_remove_aux(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); - spa_load_l2cache(spa); - spa->spa_l2cache.sav_sync = B_TRUE; - } else if (vd != NULL && vd->vdev_islog) { - ASSERT(!locked); - ASSERT(vd == vd->vdev_top); - - mg = vd->vdev_mg; - - /* - * Stop allocating from this vdev. 
- */ - metaslab_group_passivate(mg); - - /* - * Wait for the youngest allocations and frees to sync, - * and then wait for the deferral of those frees to finish. - */ - spa_vdev_config_exit(spa, NULL, - txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); - - /* - * Attempt to evacuate the vdev. - */ - error = spa_vdev_remove_evacuate(spa, vd); - - txg = spa_vdev_config_enter(spa); - - /* - * If we couldn't evacuate the vdev, unwind. - */ - if (error) { - metaslab_group_activate(mg); - return (spa_vdev_exit(spa, NULL, txg, error)); - } - - /* - * Clean up the vdev namespace. - */ - ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); - spa_vdev_remove_from_namespace(spa, vd); - - } else if (vd != NULL) { - /* - * Normal vdevs cannot be removed (yet). - */ - error = SET_ERROR(ENOTSUP); - } else { - /* - * There is no vdev of any kind with the specified guid. - */ - error = SET_ERROR(ENOENT); - } - - if (!locked) - error = spa_vdev_exit(spa, NULL, txg, error); - - if (ev) - spa_event_post(ev); - - return (error); -} - /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. 
@@ -6205,9 +6062,12 @@ spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; - while (spa->spa_async_thread != NULL) + while (spa->spa_async_thread != NULL || + spa->spa_condense_thread != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); + + spa_vdev_remove_suspend(spa); } void @@ -6217,6 +6077,7 @@ spa_async_resume(spa_t *spa) ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); + spa_restart_removal(spa); } static boolean_t @@ -6763,6 +6624,39 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) rrw_exit(&dp->dp_config_rwlock, FTAG); } +static void +vdev_indirect_state_sync_verify(vdev_t *vd) +{ + ASSERTV(vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping); + ASSERTV(vdev_indirect_births_t *vib = vd->vdev_indirect_births); + + if (vd->vdev_ops == &vdev_indirect_ops) { + ASSERT(vim != NULL); + ASSERT(vib != NULL); + } + + if (vdev_obsolete_sm_object(vd) != 0) { + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT(vd->vdev_removing || + vd->vdev_ops == &vdev_indirect_ops); + ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); + ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); + + ASSERT3U(vdev_obsolete_sm_object(vd), ==, + space_map_object(vd->vdev_obsolete_sm)); + ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, + space_map_allocated(vd->vdev_obsolete_sm)); + } + ASSERT(vd->vdev_obsolete_segments != NULL); + + /* + * Since frees / remaps to an indirect vdev can only + * happen in syncing context, the obsolete segments + * tree must be empty when we start syncing. + */ + ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); +} + /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. @@ -6783,6 +6677,13 @@ spa_sync(spa_t *spa, uint64_t txg) VERIFY(spa_writeable(spa)); /* + * Wait for i/os issued in open context that need to complete + * before this txg syncs. 
+ */ + VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK])); + spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0); + + /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -6879,6 +6780,16 @@ spa_sync(spa_t *spa, uint64_t txg) ASSERT3U(mc->mc_alloc_max_slots, <=, max_queue_depth * rvd->vdev_children); + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + vdev_indirect_state_sync_verify(vd); + + if (vdev_indirect_should_condense(vd)) { + spa_condense_indirect_start_sync(vd, tx); + break; + } + } + /* * Iterate to convergence. */ @@ -6908,7 +6819,11 @@ spa_sync(spa_t *spa, uint64_t txg) ddt_sync(spa, txg); dsl_scan_sync(dp, tx); - while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))) + if (spa->spa_vdev_removal != NULL) + svr_sync(spa, tx); + + while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) + != NULL) vdev_sync(vd, txg); if (pass == 1) { @@ -6962,6 +6877,10 @@ spa_sync(spa_t *spa, uint64_t txg) } #endif + if (spa->spa_vdev_removal != NULL) { + ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); + } + /* * Rewrite the vdev configuration (which includes the uberblock) * to commit the transaction group. 
@@ -6986,7 +6905,8 @@ spa_sync(spa_t *spa, uint64_t txg) for (int c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; - if (vd->vdev_ms_array == 0 || vd->vdev_islog) + if (vd->vdev_ms_array == 0 || vd->vdev_islog || + !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_DVAS_PER_BP) @@ -7223,7 +7143,7 @@ spa_has_active_shared_spare(spa_t *spa) return (B_FALSE); } -static sysevent_t * +sysevent_t * spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { sysevent_t *ev = NULL; @@ -7239,7 +7159,7 @@ spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) return (ev); } -static void +void spa_event_post(sysevent_t *ev) { #ifdef _KERNEL @@ -7286,7 +7206,6 @@ EXPORT_SYMBOL(spa_scan_get_stats); EXPORT_SYMBOL(spa_vdev_add); EXPORT_SYMBOL(spa_vdev_attach); EXPORT_SYMBOL(spa_vdev_detach); -EXPORT_SYMBOL(spa_vdev_remove); EXPORT_SYMBOL(spa_vdev_setpath); EXPORT_SYMBOL(spa_vdev_setfru); EXPORT_SYMBOL(spa_vdev_split_mirror); diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index ec9661b86..4e9fd6c57 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -55,7 +55,7 @@ * configuration information. When the module loads, we read this information * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is * maintained independently in spa.c. Whenever the namespace is modified, or - * the configuration of a pool is changed, we call spa_config_sync(), which + * the configuration of a pool is changed, we call spa_write_cachefile(), which * walks through all the active pools and writes the configuration to disk. */ @@ -249,7 +249,7 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) * would be required. 
*/ void -spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) +spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; @@ -590,15 +590,16 @@ spa_config_update(spa_t *spa, int what) /* * Update the global config cache to reflect the new mosconfig. */ - if (!spa->spa_is_root) - spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); + if (!spa->spa_is_root) { + spa_write_cachefile(spa, B_FALSE, + what != SPA_CONFIG_UPDATE_POOL); + } if (what == SPA_CONFIG_UPDATE_POOL) spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } #if defined(_KERNEL) && defined(HAVE_SPL) -EXPORT_SYMBOL(spa_config_sync); EXPORT_SYMBOL(spa_config_load); EXPORT_SYMBOL(spa_all_configs); EXPORT_SYMBOL(spa_config_set); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 17f8c1638..5c6e82567 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -243,7 +243,12 @@ kmem_cache_t *spa_buffer_pool; int spa_mode_global; #ifdef ZFS_DEBUG -int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | ZFS_DEBUG_SPA); +/* + * Everything except dprintf, set_error, spa, and indirect_remap is on + * by default in debug builds. + */ +int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | + ZFS_DEBUG_SPA | ZFS_DEBUG_INDIRECT_REMAP); #else int zfs_flags = 0; #endif @@ -460,7 +465,7 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } - ASSERT(wlocks_held <= locks); + ASSERT3U(wlocks_held, <=, locks); } void @@ -1136,7 +1141,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) * If the config changed, update the config cache. 
*/ if (config_changed) - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); } /* @@ -1228,7 +1233,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) */ if (config_changed) { mutex_enter(&spa_namespace_lock); - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); mutex_exit(&spa_namespace_lock); } @@ -1306,7 +1311,7 @@ spa_rename(const char *name, const char *newname) /* * Sync the updated config cache. */ - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_close(spa, FTAG); @@ -1525,6 +1530,12 @@ spa_is_initializing(spa_t *spa) return (spa->spa_is_initializing); } +boolean_t +spa_indirect_vdevs_loaded(spa_t *spa) +{ + return (spa->spa_indirect_vdevs_loaded); +} + blkptr_t * spa_get_rootblkptr(spa_t *spa) { @@ -1683,6 +1694,24 @@ spa_update_dspace(spa_t *spa) { spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + ddt_get_dedup_dspace(spa); + if (spa->spa_vdev_removal != NULL) { + /* + * We can't allocate from the removing device, so + * subtract its size. This prevents the DMU/DSL from + * filling up the (now smaller) pool while we are in the + * middle of removing the device. + * + * Note that the DMU/DSL doesn't actually know or care + * how much space is allocated (it does its own tracking + * of how much space has been logically used). So it + * doesn't matter that the data we are moving may be + * allocated twice (on the old device and the new + * device). + */ + vdev_t *vd = spa->spa_vdev_removal->svr_vdev; + spa->spa_dspace -= spa_deflate(spa) ? + vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + } } /* @@ -2105,6 +2134,49 @@ spa_maxblocksize(spa_t *spa) return (SPA_OLD_MAXBLOCKSIZE); } + +/* + * Returns the txg that the last device removal completed. No indirect mappings + * have been added since this txg. 
+ */ +uint64_t +spa_get_last_removal_txg(spa_t *spa) +{ + uint64_t vdevid; + uint64_t ret = -1ULL; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + /* + * sr_prev_indirect_vdev is only modified while holding all the + * config locks, so it is sufficient to hold SCL_VDEV as reader when + * examining it. + */ + vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev; + + while (vdevid != -1ULL) { + vdev_t *vd = vdev_lookup_top(spa, vdevid); + vdev_indirect_births_t *vib = vd->vdev_indirect_births; + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + + /* + * If the removal did not remap any data, we don't care. + */ + if (vdev_indirect_births_count(vib) != 0) { + ret = vdev_indirect_births_last_entry_txg(vib); + break; + } + + vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev; + } + spa_config_exit(spa, SCL_VDEV, FTAG); + + IMPLY(ret != -1ULL, + spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); + + return (ret); +} + int spa_maxdnodesize(spa_t *spa) { diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index baab0610d..d84dd7583 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -46,42 +46,27 @@ int space_map_blksz = (1 << 12); /* - * Load the space map disk into the specified range tree. Segments of maptype - * are added to the range tree, other segment types are removed. - * - * Note: space_map_load() will drop sm_lock across dmu_read() calls. - * The caller must be OK with this. + * Iterate through the space map, invoking the callback on each (non-debug) + * space map entry. 
*/ int -space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) +space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg) { uint64_t *entry, *entry_map, *entry_map_end; - uint64_t bufsize, size, offset, end, space; + uint64_t bufsize, size, offset, end; int error = 0; - ASSERT(MUTEX_HELD(sm->sm_lock)); - end = space_map_length(sm); - space = space_map_allocated(sm); - - VERIFY0(range_tree_space(rt)); - - if (maptype == SM_FREE) { - range_tree_add(rt, sm->sm_start, sm->sm_size); - space = sm->sm_size - space; - } bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); entry_map = vmem_alloc(bufsize, KM_SLEEP); - mutex_exit(sm->sm_lock); if (end > bufsize) { dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize, end - bufsize, ZIO_PRIORITY_SYNC_READ); } - mutex_enter(sm->sm_lock); - for (offset = 0; offset < end; offset += bufsize) { + for (offset = 0; offset < end && error == 0; offset += bufsize) { size = MIN(end - offset, bufsize); VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); VERIFY(size != 0); @@ -90,19 +75,18 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) dprintf("object=%llu offset=%llx size=%llx\n", space_map_object(sm), offset, size); - mutex_exit(sm->sm_lock); error = dmu_read(sm->sm_os, space_map_object(sm), offset, size, entry_map, DMU_READ_PREFETCH); - mutex_enter(sm->sm_lock); if (error != 0) break; entry_map_end = entry_map + (size / sizeof (uint64_t)); - for (entry = entry_map; entry < entry_map_end; entry++) { + for (entry = entry_map; entry < entry_map_end && error == 0; + entry++) { uint64_t e = *entry; uint64_t offset, size; - if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ + if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ continue; offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) + @@ -113,23 +97,67 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift)); VERIFY3U(offset, >=, sm->sm_start); VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size); 
- if (SM_TYPE_DECODE(e) == maptype) { - VERIFY3U(range_tree_space(rt) + size, <=, - sm->sm_size); - range_tree_add(rt, offset, size); - } else { - range_tree_remove(rt, offset, size); - } + error = callback(SM_TYPE_DECODE(e), offset, size, arg); } } - if (error == 0) + vmem_free(entry_map, bufsize); + return (error); +} + +typedef struct space_map_load_arg { + space_map_t *smla_sm; + range_tree_t *smla_rt; + maptype_t smla_type; +} space_map_load_arg_t; + +static int +space_map_load_callback(maptype_t type, uint64_t offset, uint64_t size, + void *arg) +{ + space_map_load_arg_t *smla = arg; + if (type == smla->smla_type) { + VERIFY3U(range_tree_space(smla->smla_rt) + size, <=, + smla->smla_sm->sm_size); + range_tree_add(smla->smla_rt, offset, size); + } else { + range_tree_remove(smla->smla_rt, offset, size); + } + + return (0); +} + +/* + * Load the space map disk into the specified range tree. Segments of maptype + * are added to the range tree, other segment types are removed. + */ +int +space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) +{ + uint64_t space; + int err; + space_map_load_arg_t smla; + + VERIFY0(range_tree_space(rt)); + space = space_map_allocated(sm); + + if (maptype == SM_FREE) { + range_tree_add(rt, sm->sm_start, sm->sm_size); + space = sm->sm_size - space; + } + + smla.smla_rt = rt; + smla.smla_sm = sm; + smla.smla_type = maptype; + err = space_map_iterate(sm, space_map_load_callback, &smla); + + if (err == 0) { VERIFY3U(range_tree_space(rt), ==, space); - else + } else { range_tree_vacate(rt, NULL, NULL); + } - vmem_free(entry_map, bufsize); - return (error); + return (err); } void @@ -160,7 +188,6 @@ space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx) { int idx = 0; - ASSERT(MUTEX_HELD(rt->rt_lock)); ASSERT(dmu_tx_is_syncing(tx)); VERIFY3U(space_map_object(sm), !=, 0); @@ -229,9 +256,6 @@ space_map_entries(space_map_t *sm, range_tree_t *rt) return (entries); } -/* - * Note: space_map_write() will drop 
sm_lock across dmu_write() calls. - */ void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, dmu_tx_t *tx) @@ -244,7 +268,6 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, uint64_t *entry, *entry_map, *entry_map_end; uint64_t expected_entries, actual_entries = 1; - ASSERT(MUTEX_HELD(rt->rt_lock)); ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); VERIFY3U(space_map_object(sm), !=, 0); dmu_buf_will_dirty(sm->sm_dbuf, tx); @@ -294,11 +317,9 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, run_len = MIN(size, SM_RUN_MAX); if (entry == entry_map_end) { - mutex_exit(rt->rt_lock); dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize, sm->sm_blksz, entry_map, tx); - mutex_enter(rt->rt_lock); sm->sm_phys->smp_objsize += sm->sm_blksz; entry = entry_map; } @@ -315,10 +336,8 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, if (entry != entry_map) { size = (entry - entry_map) * sizeof (uint64_t); - mutex_exit(rt->rt_lock); dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize, size, entry_map, tx); - mutex_enter(rt->rt_lock); sm->sm_phys->smp_objsize += size; } ASSERT3U(expected_entries, ==, actual_entries); @@ -351,7 +370,7 @@ space_map_open_impl(space_map_t *sm) int space_map_open(space_map_t **smp, objset_t *os, uint64_t object, - uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp) + uint64_t start, uint64_t size, uint8_t shift) { space_map_t *sm; int error; @@ -365,7 +384,6 @@ space_map_open(space_map_t **smp, objset_t *os, uint64_t object, sm->sm_start = start; sm->sm_size = size; sm->sm_shift = shift; - sm->sm_lock = lp; sm->sm_os = os; sm->sm_object = object; sm->sm_length = 0; @@ -459,8 +477,6 @@ space_map_update(space_map_t *sm) if (sm == NULL) return; - ASSERT(MUTEX_HELD(sm->sm_lock)); - sm->sm_alloc = sm->sm_phys->smp_alloc; sm->sm_length = sm->sm_phys->smp_objsize; } @@ -488,27 +504,29 @@ space_map_alloc(objset_t *os, dmu_tx_t *tx) } void 
-space_map_free(space_map_t *sm, dmu_tx_t *tx) +space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx) { - spa_t *spa; - - if (sm == NULL) - return; - - spa = dmu_objset_spa(sm->sm_os); + spa_t *spa = dmu_objset_spa(os); if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { dmu_object_info_t doi; - dmu_object_info_from_db(sm->sm_dbuf, &doi); + VERIFY0(dmu_object_info(os, smobj, &doi)); if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) { - VERIFY(spa_feature_is_active(spa, - SPA_FEATURE_SPACEMAP_HISTOGRAM)); spa_feature_decr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); } } - VERIFY3U(dmu_object_free(sm->sm_os, space_map_object(sm), tx), ==, 0); + VERIFY0(dmu_object_free(os, smobj, tx)); +} + +void +space_map_free(space_map_t *sm, dmu_tx_t *tx) +{ + if (sm == NULL) + return; + + space_map_free_obj(sm->sm_os, space_map_object(sm), tx); sm->sm_object = 0; } diff --git a/module/zfs/space_reftree.c b/module/zfs/space_reftree.c index 038572b08..aa289ba10 100644 --- a/module/zfs/space_reftree.c +++ b/module/zfs/space_reftree.c @@ -111,8 +111,6 @@ space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt) { range_seg_t *rs; - ASSERT(MUTEX_HELD(rt->rt_lock)); - for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt); } @@ -128,8 +126,6 @@ space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref) int64_t refcnt = 0; space_ref_t *sr; - ASSERT(MUTEX_HELD(rt->rt_lock)); - range_tree_vacate(rt, NULL, NULL); for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { diff --git a/module/zfs/trace.c b/module/zfs/trace.c index e4ebf31b3..eb6efe841 100644 --- a/module/zfs/trace.c +++ b/module/zfs/trace.c @@ -46,6 +46,7 @@ #include <sys/trace_dnode.h> #include <sys/trace_multilist.h> #include <sys/trace_txg.h> +#include <sys/trace_vdev.h> #include <sys/trace_zil.h> #include <sys/trace_zio.h> #include <sys/trace_zrlock.h> diff --git a/module/zfs/txg.c b/module/zfs/txg.c 
index bf8544507..2c7f5303b 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -860,6 +860,8 @@ txg_list_remove(txg_list_t *tl, uint64_t txg) txg_verify(tl->tl_spa, txg); mutex_enter(&tl->tl_lock); if ((tn = tl->tl_head[t]) != NULL) { + ASSERT(tn->tn_member[t]); + ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]); p = (char *)tn - tl->tl_offset; tl->tl_head[t] = tn->tn_next[t]; tn->tn_next[t] = NULL; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 8ab996434..f3e3f90fa 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -32,8 +32,10 @@ #include <sys/fm/fs/zfs.h> #include <sys/spa.h> #include <sys/spa_impl.h> +#include <sys/bpobj.h> #include <sys/dmu.h> #include <sys/dmu_tx.h> +#include <sys/dsl_dir.h> #include <sys/vdev_impl.h> #include <sys/uberblock_impl.h> #include <sys/metaslab.h> @@ -86,6 +88,7 @@ static vdev_ops_t *vdev_ops_table[] = { &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, + &vdev_indirect_ops, NULL }; @@ -310,17 +313,24 @@ vdev_compact_children(vdev_t *pvd) ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + if (oldc == 0) + return; + for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; - newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); + if (newc > 0) { + newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); - for (int c = newc = 0; c < oldc; c++) { - if ((cvd = pvd->vdev_child[c]) != NULL) { - newchild[newc] = cvd; - cvd->vdev_id = newc++; + for (int c = newc = 0; c < oldc; c++) { + if ((cvd = pvd->vdev_child[c]) != NULL) { + newchild[newc] = cvd; + cvd->vdev_id = newc++; + } } + } else { + newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); @@ -335,8 +345,10 @@ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; + vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); + vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); @@ -367,6 
+379,11 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); + vic->vic_prev_indirect_vdev = UINT64_MAX; + + rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); + vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); /* * Initialize rate limit structs for events. We rate limit ZIO delay @@ -385,8 +402,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { - vd->vdev_dtl[t] = range_tree_create(NULL, NULL, - &vd->vdev_dtl_lock); + vd->vdev_dtl[t] = range_tree_create(NULL, NULL); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); @@ -412,6 +428,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, char *type; uint64_t guid = 0, islog, nparity; vdev_t *vd; + vdev_indirect_config_t *vic; char *tmp = NULL; int rc; @@ -501,6 +518,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, ASSERT(nparity != -1ULL); vd = vdev_alloc_common(spa, id, guid, ops); + vic = &vd->vdev_indirect_config; vd->vdev_islog = islog; vd->vdev_nparity = nparity; @@ -541,6 +559,16 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; + ASSERT0(vic->vic_mapping_object); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, + &vic->vic_mapping_object); + ASSERT0(vic->vic_births_object); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, + &vic->vic_births_object); + ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, + &vic->vic_prev_indirect_vdev); + /* * Look for the 'not present' flag. 
This will only be set if the device * was not present at the time of import. @@ -754,6 +782,23 @@ vdev_free(vdev_t *vd) } mutex_exit(&vd->vdev_dtl_lock); + EQUIV(vd->vdev_indirect_births != NULL, + vd->vdev_indirect_mapping != NULL); + if (vd->vdev_indirect_births != NULL) { + vdev_indirect_mapping_close(vd->vdev_indirect_mapping); + vdev_indirect_births_close(vd->vdev_indirect_births); + } + + if (vd->vdev_obsolete_sm != NULL) { + ASSERT(vd->vdev_removing || + vd->vdev_ops == &vdev_indirect_ops); + space_map_close(vd->vdev_obsolete_sm); + vd->vdev_obsolete_sm = NULL; + } + range_tree_destroy(vd->vdev_obsolete_segments); + rw_destroy(&vd->vdev_indirect_rwlock); + mutex_destroy(&vd->vdev_obsolete_lock); + mutex_destroy(&vd->vdev_queue_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); @@ -869,6 +914,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; + mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; @@ -960,15 +1006,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) ASSERT(!vd->vdev_ishole); - /* - * Compute the raidz-deflation ratio. Note, we hard-code - * in 128k (1 << 17) because it is the "typical" blocksize. - * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, - * otherwise it would inconsistently account for existing bp's. - */ - vd->vdev_deflate_ratio = (1 << 17) / - (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); - ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); @@ -984,7 +1021,12 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) for (m = oldc; m < newc; m++) { uint64_t object = 0; - if (txg == 0) { + /* + * vdev_ms_array may be 0 if we are creating the "fake" + * metaslabs for an indirect vdev for zdb's leak detection. + * See zdb_leak_init(). 
+ */ + if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(mos, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); @@ -1018,12 +1060,11 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) void vdev_metaslab_fini(vdev_t *vd) { - uint64_t m; - uint64_t count = vd->vdev_ms_count; - if (vd->vdev_ms != NULL) { + uint64_t count = vd->vdev_ms_count; + metaslab_group_passivate(vd->vdev_mg); - for (m = 0; m < count; m++) { + for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) @@ -1031,8 +1072,10 @@ vdev_metaslab_fini(vdev_t *vd) } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; - } + vd->vdev_ms_count = 0; + } + ASSERT0(vd->vdev_ms_count); ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } @@ -1078,6 +1121,8 @@ vdev_probe_done(zio_t *zio) zio->io_error = 0; } else { ASSERT(zio->io_error != 0); + zfs_dbgmsg("failed probe on vdev %llu", + (longlong_t)vd->vdev_id); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0, 0); zio->io_error = SET_ERROR(ENXIO); @@ -1256,6 +1301,21 @@ retry_sync: } /* + * Compute the raidz-deflation ratio. Note, we hard-code + * in 128k (1 << 17) because it is the "typical" blocksize. + * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, + * otherwise it would inconsistently account for existing bp's. + */ +static void +vdev_set_deflate_ratio(vdev_t *vd) +{ + if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { + vd->vdev_deflate_ratio = (1 << 17) / + (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + } +} + +/* * Prepare a virtual device for access. 
*/ int @@ -1458,6 +1518,14 @@ vdev_open(vdev_t *vd) return (error); } + if (vd->vdev_top == vd && vd->vdev_ashift != 0 && + !vd->vdev_isl2cache && !vd->vdev_islog) { + if (vd->vdev_ashift > spa->spa_max_ashift) + spa->spa_max_ashift = vd->vdev_ashift; + if (vd->vdev_ashift < spa->spa_min_ashift) + spa->spa_min_ashift = vd->vdev_ashift; + } + /* * Track the min and max ashift values for normal data devices. */ @@ -1752,7 +1820,8 @@ void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); - ASSERT(!vd->vdev_ishole); + /* indirect vdevs don't have metaslabs or dtls */ + ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); @@ -1822,10 +1891,10 @@ vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); - mutex_enter(rt->rt_lock); + mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); - mutex_exit(rt->rt_lock); + mutex_exit(&vd->vdev_dtl_lock); } boolean_t @@ -1837,10 +1906,21 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); - mutex_enter(rt->rt_lock); + /* + * While we are loading the pool, the DTLs have not been loaded yet. + * Ignore the DTLs and try all devices. This avoids a recursive + * mutex enter on the vdev_dtl_lock, and also makes us try hard + * when loading the pool (relying on the checksum to ensure that + * we get the right data -- note that while loading, we are + * only reading the MOS, which is always checksummed).
+ */ + if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) + return (B_FALSE); + + mutex_enter(&vd->vdev_dtl_lock); if (range_tree_space(rt) != 0) dirty = range_tree_contains(rt, txg, size); - mutex_exit(rt->rt_lock); + mutex_exit(&vd->vdev_dtl_lock); return (dirty); } @@ -1851,9 +1931,9 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; - mutex_enter(rt->rt_lock); + mutex_enter(&vd->vdev_dtl_lock); empty = (range_tree_space(rt) == 0); - mutex_exit(rt->rt_lock); + mutex_exit(&vd->vdev_dtl_lock); return (empty); } @@ -1961,7 +2041,7 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done); - if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) + if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { @@ -2076,10 +2156,10 @@ vdev_dtl_load(vdev_t *vd) int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { - ASSERT(!vd->vdev_ishole); + ASSERT(vdev_is_concrete(vd)); error = space_map_open(&vd->vdev_dtl_sm, mos, - vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock); + vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); @@ -2158,11 +2238,10 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; - kmutex_t rtlock; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); - ASSERT(!vd->vdev_ishole); + ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -2196,15 +2275,11 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, - 0, -1ULL, 0, &vd->vdev_dtl_lock)); + 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } - mutex_init(&rtlock, NULL, MUTEX_DEFAULT, 
NULL); - - rtsync = range_tree_create(NULL, NULL, &rtlock); - - mutex_enter(&rtlock); + rtsync = range_tree_create(NULL, NULL); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); @@ -2216,9 +2291,6 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) range_tree_destroy(rtsync); - mutex_exit(&rtlock); - mutex_destroy(&rtlock); - /* * If the object for the space map has changed then dirty * the top level so that we update the config. @@ -2311,29 +2383,63 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) return (needed); } -void +int vdev_load(vdev_t *vd) { + int error = 0; + /* * Recursively load all children. */ - for (int c = 0; c < vd->vdev_children; c++) - vdev_load(vd->vdev_child[c]); + for (int c = 0; c < vd->vdev_children; c++) { + error = vdev_load(vd->vdev_child[c]); + if (error != 0) { + return (error); + } + } + + vdev_set_deflate_ratio(vd); /* * If this is a top-level vdev, initialize its metaslabs. */ - if (vd == vd->vdev_top && !vd->vdev_ishole && - (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || - vdev_metaslab_init(vd, 0) != 0)) - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); + if (vd == vd->vdev_top && vdev_is_concrete(vd)) { + if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + return (SET_ERROR(ENXIO)); + } else if ((error = vdev_metaslab_init(vd, 0)) != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + return (error); + } + } + /* * If this is a leaf vdev, load its DTL. 
*/ - if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) + if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); + return (error); + } + + uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); + if (obsolete_sm_object != 0) { + objset_t *mos = vd->vdev_spa->spa_meta_objset; + ASSERT(vd->vdev_asize != 0); + ASSERT(vd->vdev_obsolete_sm == NULL); + + if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, + obsolete_sm_object, 0, vd->vdev_asize, 0))) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + return (error); + } + space_map_update(vd->vdev_obsolete_sm); + } + + return (0); } /* @@ -2378,14 +2484,42 @@ vdev_validate_aux(vdev_t *vd) return (0); } +/* + * Free the objects used to store this vdev's spacemaps, and the array + * that points to them. + */ void -vdev_remove(vdev_t *vd, uint64_t txg) +vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) +{ + if (vd->vdev_ms_array == 0) + return; + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; + size_t array_bytes = array_count * sizeof (uint64_t); + uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); + VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, + array_bytes, smobj_array, 0)); + + for (uint64_t i = 0; i < array_count; i++) { + uint64_t smobj = smobj_array[i]; + if (smobj == 0) + continue; + + space_map_free_obj(mos, smobj, tx); + } + + kmem_free(smobj_array, array_bytes); + VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); + vd->vdev_ms_array = 0; +} + +static void +vdev_remove_empty(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; dmu_tx_t *tx; - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); @@ -2412,7 +2546,6 @@ vdev_remove(vdev_t *vd, uint64_t txg) metaslab_group_histogram_remove(mg, msp); 
VERIFY0(space_map_allocated(msp->ms_sm)); - space_map_free(msp->ms_sm, tx); space_map_close(msp->ms_sm); msp->ms_sm = NULL; mutex_exit(&msp->ms_lock); @@ -2422,13 +2555,10 @@ vdev_remove(vdev_t *vd, uint64_t txg) metaslab_class_histogram_verify(mg->mg_class); for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) ASSERT0(mg->mg_histogram[i]); - } - if (vd->vdev_ms_array) { - (void) dmu_object_free(mos, vd->vdev_ms_array, tx); - vd->vdev_ms_array = 0; - } + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + vdev_destroy_spacemaps(vd, tx); if (vd->vdev_islog && vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); @@ -2443,7 +2573,7 @@ vdev_sync_done(vdev_t *vd, uint64_t txg) metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); - ASSERT(!vd->vdev_ishole); + ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))) metaslab_sync_done(msp, txg); @@ -2460,10 +2590,33 @@ vdev_sync(vdev_t *vd, uint64_t txg) metaslab_t *msp; dmu_tx_t *tx; - ASSERT(!vd->vdev_ishole); + if (range_tree_space(vd->vdev_obsolete_segments) > 0) { + dmu_tx_t *tx; - if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { + ASSERT(vd->vdev_removing || + vd->vdev_ops == &vdev_indirect_ops); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + vdev_indirect_sync_obsolete(vd, tx); + dmu_tx_commit(tx); + + /* + * If the vdev is indirect, it can't have dirty + * metaslabs or DTLs. 
+ */ + if (vd->vdev_ops == &vdev_indirect_ops) { + ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); + ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); + return; + } + } + + ASSERT(vdev_is_concrete(vd)); + + if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && + !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); + ASSERT0(vd->vdev_indirect_config.vic_mapping_object); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); @@ -2472,12 +2625,6 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } - /* - * Remove the metadata associated with this vdev once it's empty. - */ - if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) - vdev_remove(vd, txg); - while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); @@ -2486,6 +2633,16 @@ vdev_sync(vdev_t *vd, uint64_t txg) while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); + /* + * Remove the metadata associated with this vdev once it's empty. + * Note that this is typically used for log/cache device removal; + * we don't empty toplevel vdevs when removing them. But if + * a toplevel happens to be emptied, this is not harmful. + */ + if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) { + vdev_remove_empty(vd, txg); + } + (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); } @@ -2725,7 +2882,7 @@ top: metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); - error = spa_offline_log(spa); + error = spa_reset_logs(spa); spa_vdev_state_enter(spa, SCL_ALLOC); @@ -2807,6 +2964,12 @@ vdev_clear(spa_t *spa, vdev_t *vd) vdev_clear(spa, vd->vdev_child[c]); /* + * It makes no sense to "clear" an indirect vdev. 
+ */ + if (!vdev_is_concrete(vd)) + return; + + /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is @@ -2860,7 +3023,8 @@ vdev_is_dead(vdev_t *vd) * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ - return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || + return (vd->vdev_state < VDEV_STATE_DEGRADED || + vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } @@ -2873,7 +3037,8 @@ vdev_readable(vdev_t *vd) boolean_t vdev_writeable(vdev_t *vd) { - return (!vdev_is_dead(vd) && !vd->vdev_cant_write); + return (!vdev_is_dead(vd) && !vd->vdev_cant_write && + vdev_is_concrete(vd)); } boolean_t @@ -2890,7 +3055,7 @@ vdev_allocatable(vdev_t *vd) * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write && !vd->vdev_ishole && + !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } @@ -3033,7 +3198,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) } vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; if (vd->vdev_aux == NULL && vd == vd->vdev_top && - !vd->vdev_ishole) { + vdev_is_concrete(vd)) { vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; } } @@ -3196,7 +3361,8 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_WRITE && txg != 0 && + if (spa->spa_load_state == SPA_LOAD_NONE && + type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { @@ -3361,8 +3527,9 @@ vdev_config_dirty(vdev_t *vd) ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && - !vd->vdev_ishole) + vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); + 
} } } @@ -3403,7 +3570,8 @@ vdev_state_dirty(vdev_t *vd) (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); - if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) + if (!list_link_active(&vd->vdev_state_dirty_node) && + vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } @@ -3437,9 +3605,10 @@ vdev_propagate_state(vdev_t *vd) child = vd->vdev_child[c]; /* - * Don't factor holes into the decision. + * Don't factor holes or indirect vdevs into the + * decision. */ - if (child->vdev_ishole) + if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || @@ -3642,8 +3811,10 @@ vdev_is_bootable(vdev_t *vd) if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; - if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) + if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || + strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { return (B_FALSE); + } } for (int c = 0; c < vd->vdev_children; c++) { @@ -3653,6 +3824,18 @@ vdev_is_bootable(vdev_t *vd) return (B_TRUE); } +boolean_t +vdev_is_concrete(vdev_t *vd) +{ + vdev_ops_t *ops = vd->vdev_ops; + if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || + ops == &vdev_missing_ops || ops == &vdev_root_ops) { + return (B_FALSE); + } else { + return (B_TRUE); + } +} + /* * Load the state from the original vdev tree (ovd) which * we've retrieved from the MOS config object. 
If the original @@ -3709,7 +3892,10 @@ vdev_expand(vdev_t *vd, uint64_t txg) ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { + vdev_set_deflate_ratio(vd); + + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + vdev_is_concrete(vd)) { VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 788503bcd..056381c9d 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -843,6 +843,7 @@ vdev_ops_t vdev_disk_ops = { NULL, vdev_disk_hold, vdev_disk_rele, + NULL, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index 13c32e083..bd7e0bc2e 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -253,6 +253,7 @@ vdev_ops_t vdev_file_ops = { NULL, vdev_file_hold, vdev_file_rele, + NULL, VDEV_TYPE_FILE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -287,6 +288,7 @@ vdev_ops_t vdev_disk_ops = { NULL, vdev_file_hold, vdev_file_rele, + NULL, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c new file mode 100644 index 000000000..86a05daa8 --- /dev/null +++ b/module/zfs/vdev_indirect.c @@ -0,0 +1,1064 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2014, 2015 by Delphix. All rights reserved. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/metaslab.h> +#include <sys/refcount.h> +#include <sys/dmu.h> +#include <sys/vdev_indirect_mapping.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_synctask.h> +#include <sys/zap.h> + +/* + * An indirect vdev corresponds to a vdev that has been removed. Since + * we cannot rewrite block pointers of snapshots, etc., we keep a + * mapping from old location on the removed device to the new location + * on another device in the pool and use this mapping whenever we need + * to access the DVA. Unfortunately, this mapping did not respect + * logical block boundaries when it was first created, and so a DVA on + * this indirect vdev may be "split" into multiple sections that each + * map to a different location. As a consequence, not all DVAs can be + * translated to an equivalent new DVA. Instead we must provide a + * "vdev_remap" operation that executes a callback on each contiguous + * segment of the new location. This function is used in multiple ways: + * + * - reads and repair writes to this device use the callback to create + * a child io for each mapped segment. + * + * - frees and claims to this device use the callback to free or claim + * each mapped segment. (Note that we don't actually need to claim + * log blocks on indirect vdevs, because we don't allocate to + * removing vdevs. However, zdb uses zio_claim() for its leak + * detection.) + */ + +/* + * "Big theory statement" for how we mark blocks obsolete. + * + * When a block on an indirect vdev is freed or remapped, a section of + * that vdev's mapping may no longer be referenced (aka "obsolete"). We + * keep track of how much of each mapping entry is obsolete. When + * an entry becomes completely obsolete, we can remove it, thus reducing + * the memory used by the mapping. 
The complete picture of obsolescence + * is given by the following data structures, described below: + * - the entry-specific obsolete count + * - the vdev-specific obsolete spacemap + * - the pool-specific obsolete bpobj + * + * == On disk data structures used == + * + * We track the obsolete space for the pool using several objects. Each + * of these objects is created on demand and freed when no longer + * needed, and is assumed to be empty if it does not exist. + * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects. + * + * - Each vic_mapping_object (associated with an indirect vdev) can + * have a vimp_counts_object. This is an array of uint32_t's + * with the same number of entries as the vic_mapping_object. When + * the mapping is condensed, entries from the vic_obsolete_sm_object + * (see below) are folded into the counts. Therefore, each + * obsolete_counts entry tells us the number of bytes in the + * corresponding mapping entry that were not referenced when the + * mapping was last condensed. + * + * - Each indirect or removing vdev can have a vic_obsolete_sm_object. + * This is a space map containing an alloc entry for every DVA that + * has been obsoleted since the last time this indirect vdev was + * condensed. We use this object in order to improve performance + * when marking a DVA as obsolete. Instead of modifying an arbitrary + * offset of the vimp_counts_object, we only need to append an entry + * to the end of this object. When a DVA becomes obsolete, it is + * added to the obsolete space map. This happens when the DVA is + * freed, remapped and not referenced by a snapshot, or the last + * snapshot referencing it is destroyed. + * + * - Each dataset can have a ds_remap_deadlist object. This is a + * deadlist object containing all blocks that were remapped in this + * dataset but referenced in a previous snapshot. 
Blocks can *only* + * appear on this list if they were remapped (dsl_dataset_block_remapped); + * blocks that were killed in a head dataset are put on the normal + * ds_deadlist and marked obsolete when they are freed. + * + * - The pool can have a dp_obsolete_bpobj. This is a list of blocks + * in the pool that need to be marked obsolete. When a snapshot is + * destroyed, we move some of the ds_remap_deadlist to the obsolete + * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then + * asynchronously process the obsolete bpobj, moving its entries to + * the specific vdevs' obsolete space maps. + * + * == Summary of how we mark blocks as obsolete == + * + * - When freeing a block: if any DVA is on an indirect vdev, append to + * vic_obsolete_sm_object. + * - When remapping a block, add dva to ds_remap_deadlist (if prev snap + * references; otherwise append to vic_obsolete_sm_object). + * - When freeing a snapshot: move parts of ds_remap_deadlist to + * dp_obsolete_bpobj (same algorithm as ds_deadlist). + * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to + * individual vdev's vic_obsolete_sm_object. + */ + +/* + * "Big theory statement" for how we condense indirect vdevs. + * + * Condensing an indirect vdev's mapping is the process of determining + * the precise counts of obsolete space for each mapping entry (by + * integrating the obsolete spacemap into the obsolete counts) and + * writing out a new mapping that contains only referenced entries. + * + * We condense a vdev when we expect the mapping to shrink (see + * vdev_indirect_should_condense()), but only perform one condense at a + * time to limit the memory usage. In addition, we use a separate + * open-context thread (spa_condense_indirect_thread) to incrementally + * create the new mapping object in a way that minimizes the impact on + * the rest of the system. + * + * == Generating a new mapping == + * + * To generate a new mapping, we follow these steps: + * + * 1. 
Save the old obsolete space map and create a new mapping object + * (see spa_condense_indirect_start_sync()). This initializes the + * spa_condensing_indirect_phys with the "previous obsolete space map", + * which is now read only. Newly obsolete DVAs will be added to a + * new (initially empty) obsolete space map, and will not be + * considered as part of this condense operation. + * + * 2. Construct in memory the precise counts of obsolete space for each + * mapping entry, by incorporating the obsolete space map into the + * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().) + * + * 3. Iterate through each mapping entry, writing to the new mapping any + * entries that are not completely obsolete (i.e. which don't have + * obsolete count == mapping length). (See + * spa_condense_indirect_generate_new_mapping().) + * + * 4. Destroy the old mapping object and switch over to the new one + * (spa_condense_indirect_complete_sync). + * + * == Restarting from failure == + * + * To restart the condense when we import/open the pool, we must start + * at the 2nd step above: reconstruct the precise counts in memory, + * based on the space map + counts. Then in the 3rd step, we start + * iterating where we left off: at vimp_max_offset of the new mapping + * object. + */ + +boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE; + +/* + * Condense if at least this percent of the bytes in the mapping is + * obsolete. With the default of 25%, the amount of space mapped + * will be reduced to 1% of its original size after at most 16 + * condenses. Higher values will condense less often (causing less + * i/o); lower values will reduce the mapping size more quickly. + */ +int zfs_indirect_condense_obsolete_pct = 25; + +/* + * Condense if the obsolete space map takes up more than this amount of + * space on disk (logically). 
This limits the amount of disk space + * consumed by the obsolete space map; the default of 1GB is small enough + * that we typically don't mind "wasting" it. + */ +uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; + +/* + * Don't bother condensing if the mapping uses less than this amount of + * memory. The default of 128KB is considered a "trivial" amount of + * memory and not worth reducing. + */ +unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; + +/* + * This is used by the test suite so that it can ensure that certain + * actions happen while in the middle of a condense (which might otherwise + * complete too quickly). If used to reduce the performance impact of + * condensing in production, a maximum value of 1 should be sufficient. + */ +int zfs_condense_indirect_commit_entry_delay_ms = 0; + +/* + * Mark the given offset and size as being obsolete in the given txg. + */ +void +vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size, + uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + ASSERT3U(spa_syncing_txg(spa), ==, txg); + ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); + ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); + ASSERT(size > 0); + VERIFY(vdev_indirect_mapping_entry_for_offset( + vd->vdev_indirect_mapping, offset) != NULL); + + if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + mutex_enter(&vd->vdev_obsolete_lock); + range_tree_add(vd->vdev_obsolete_segments, offset, size); + mutex_exit(&vd->vdev_obsolete_lock); + vdev_dirty(vd, 0, NULL, txg); + } +} + +/* + * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This + * wrapper is provided because the DMU does not know about vdev_t's and + * cannot directly call vdev_indirect_mark_obsolete. 
+ */ +void +spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, + uint64_t size, dmu_tx_t *tx) +{ + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + ASSERT(dmu_tx_is_syncing(tx)); + + /* The DMU can only remap indirect vdevs. */ + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx)); +} + +static spa_condensing_indirect_t * +spa_condensing_indirect_create(spa_t *spa) +{ + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP); + objset_t *mos = spa->spa_meta_objset; + + for (int i = 0; i < TXG_SIZE; i++) { + list_create(&sci->sci_new_mapping_entries[i], + sizeof (vdev_indirect_mapping_entry_t), + offsetof(vdev_indirect_mapping_entry_t, vime_node)); + } + + sci->sci_new_mapping = + vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); + + return (sci); +} + +static void +spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci) +{ + for (int i = 0; i < TXG_SIZE; i++) + list_destroy(&sci->sci_new_mapping_entries[i]); + + if (sci->sci_new_mapping != NULL) + vdev_indirect_mapping_close(sci->sci_new_mapping); + + kmem_free(sci, sizeof (*sci)); +} + +boolean_t +vdev_indirect_should_condense(vdev_t *vd) +{ + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + spa_t *spa = vd->vdev_spa; + + ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); + + if (!zfs_condense_indirect_vdevs_enable) + return (B_FALSE); + + /* + * We can only condense one indirect vdev at a time. + */ + if (spa->spa_condensing_indirect != NULL) + return (B_FALSE); + + if (spa_shutting_down(spa)) + return (B_FALSE); + + /* + * The mapping object size must not change while we are + * condensing, so we can only condense indirect vdevs + * (not vdevs that are still in the middle of being removed). 
+ */ + if (vd->vdev_ops != &vdev_indirect_ops) + return (B_FALSE); + + /* + * If nothing new has been marked obsolete, there is no + * point in condensing. + */ + if (vd->vdev_obsolete_sm == NULL) { + ASSERT0(vdev_obsolete_sm_object(vd)); + return (B_FALSE); + } + + ASSERT(vd->vdev_obsolete_sm != NULL); + + ASSERT3U(vdev_obsolete_sm_object(vd), ==, + space_map_object(vd->vdev_obsolete_sm)); + + uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim); + uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); + uint64_t mapping_size = vdev_indirect_mapping_size(vim); + uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); + + ASSERT3U(bytes_obsolete, <=, bytes_mapped); + + /* + * If a high percentage of the bytes that are mapped have become + * obsolete, condense (unless the mapping is already small enough). + * This has a good chance of reducing the amount of memory used + * by the mapping. + */ + if (bytes_obsolete * 100 / bytes_mapped >= + zfs_indirect_condense_obsolete_pct && + mapping_size > zfs_condense_min_mapping_bytes) { + zfs_dbgmsg("should condense vdev %llu because obsolete " + "spacemap covers %d%% of %lluMB mapping", + (u_longlong_t)vd->vdev_id, + (int)(bytes_obsolete * 100 / bytes_mapped), + (u_longlong_t)bytes_mapped / 1024 / 1024); + return (B_TRUE); + } + + /* + * If the obsolete space map takes up too much space on disk, + * condense in order to free up this disk space. + */ + if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) { + zfs_dbgmsg("should condense vdev %llu because obsolete sm " + "length %lluMB >= max size %lluMB", + (u_longlong_t)vd->vdev_id, + (u_longlong_t)obsolete_sm_size / 1024 / 1024, + (u_longlong_t)zfs_condense_max_obsolete_bytes / + 1024 / 1024); + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * This sync task completes (finishes) a condense, deleting the old + * mapping and replacing it with the new one. 
+ */ +static void +spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_condensing_indirect_t *sci = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + objset_t *mos = spa->spa_meta_objset; + vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; + uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); + uint64_t new_count = + vdev_indirect_mapping_num_entries(sci->sci_new_mapping); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + ASSERT3P(sci, ==, spa->spa_condensing_indirect); + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); + } + ASSERT(vic->vic_mapping_object != 0); + ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); + ASSERT(scip->scip_next_mapping_object != 0); + ASSERT(scip->scip_prev_obsolete_sm_object != 0); + + /* + * Reset vdev_indirect_mapping to refer to the new object. 
+ */ + rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); + vdev_indirect_mapping_close(vd->vdev_indirect_mapping); + vd->vdev_indirect_mapping = sci->sci_new_mapping; + rw_exit(&vd->vdev_indirect_rwlock); + + sci->sci_new_mapping = NULL; + vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); + vic->vic_mapping_object = scip->scip_next_mapping_object; + scip->scip_next_mapping_object = 0; + + space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + scip->scip_prev_obsolete_sm_object = 0; + + scip->scip_vdev = 0; + + VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CONDENSING_INDIRECT, tx)); + spa_condensing_indirect_destroy(spa->spa_condensing_indirect); + spa->spa_condensing_indirect = NULL; + + zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " + "new mapping object %llu has %llu entries " + "(was %llu entries)", + vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object, + new_count, old_count); + + vdev_config_dirty(spa->spa_root_vdev); +} + +/* + * This sync task appends entries to the new mapping object. + */ +static void +spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx) +{ + spa_condensing_indirect_t *sci = arg; + uint64_t txg = dmu_tx_get_txg(tx); + ASSERTV(spa_t *spa = dmu_tx_pool(tx)->dp_spa); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3P(sci, ==, spa->spa_condensing_indirect); + + vdev_indirect_mapping_add_entries(sci->sci_new_mapping, + &sci->sci_new_mapping_entries[txg & TXG_MASK], tx); + ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK])); +} + +/* + * Open-context function to add one entry to the new mapping. The new + * entry will be remembered and written from syncing context. 
 */
static void
spa_condense_indirect_commit_entry(spa_t *spa,
    vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
{
	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;

	/* Only entries that are not entirely obsolete are committed. */
	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	/* Slot of the per-txg list for the txg this tx was assigned to. */
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	/*
	 * If we are the first entry committed this txg, kick off the sync
	 * task to write to the MOS on our behalf.
	 */
	if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
		dsl_sync_task_nowait(dmu_tx_pool(tx),
		    spa_condense_indirect_commit_sync, sci,
		    0, ZFS_SPACE_CHECK_NONE, tx);
	}

	/* Queue a private copy of the entry together with its count. */
	vdev_indirect_mapping_entry_t *vime =
	    kmem_alloc(sizeof (*vime), KM_SLEEP);
	vime->vime_mapping = *vimep;
	vime->vime_obsolete_count = count;
	list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);

	dmu_tx_commit(tx);
}

/*
 * Walk the old mapping of vd starting at start_index and commit every
 * entry that is not entirely obsolete (obsolete count strictly less than
 * the entry's size) to the new mapping.  obsolete_counts is indexed by
 * mapping entry.  The walk stops early if the pool starts shutting down.
 */
static void
spa_condense_indirect_generate_new_mapping(vdev_t *vd,
    uint32_t *obsolete_counts, uint64_t start_index)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t mapi = start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_num_entries =
	    vdev_indirect_mapping_num_entries(old_mapping);

	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);

	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
	    (u_longlong_t)vd->vdev_id,
	    (u_longlong_t)mapi);

	/*
	 * A start_index of UINT64_MAX (used by the caller when the whole
	 * new mapping has already been written) fails the first test and
	 * skips the loop entirely.
	 */
	while (mapi < old_num_entries && !spa_shutting_down(spa)) {
		vdev_indirect_mapping_entry_phys_t *entry =
		    &old_mapping->vim_entries[mapi];
		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
		if (obsolete_counts[mapi] < entry_size) {
			spa_condense_indirect_commit_entry(spa, entry,
			    obsolete_counts[mapi]);

			/*
			 * This delay may be requested for testing, debugging,
			 * or performance reasons.
			 */
			hrtime_t now = gethrtime();
			hrtime_t sleep_until = now + MSEC2NSEC(
			    zfs_condense_indirect_commit_entry_delay_ms);
			zfs_sleep_until(sleep_until);
		}

		mapi++;
	}
	if (spa_shutting_down(spa)) {
		zfs_dbgmsg("pausing condense of vdev %llu at index %llu",
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)mapi);
	}
}

/*
 * Body of the condense thread; arg is the indirect vdev being condensed.
 *
 * Loads the obsolete counts for the old mapping (including the previous
 * obsolete space map), works out where in the old mapping to resume,
 * generates the new mapping, and, unless the pool is shutting down,
 * runs the sync task that completes the condense.  On exit it clears
 * spa_condense_thread and wakes waiters on spa_async_cv.
 */
static void
spa_condense_indirect_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint32_t *counts;
	uint64_t start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	space_map_t *prev_obsolete_sm = NULL;

	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

	for (int i = 0; i < TXG_SIZE; i++) {
		/*
		 * The list must start out empty in order for the
		 * _commit_sync() sync task to be properly registered
		 * on the first call to _commit_entry(); so it's wise
		 * to double check and ensure we actually are starting
		 * with empty lists.
		 */
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}

	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
	space_map_update(prev_obsolete_sm);
	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
	if (prev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
		    counts, prev_obsolete_sm);
	}
	space_map_close(prev_obsolete_sm);

	/*
	 * Generate new mapping.  Determine what index to continue from
	 * based on the max offset that we've already written in the
	 * new mapping.
	 */
	uint64_t max_offset =
	    vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
	if (max_offset == 0) {
		/* We haven't written anything to the new mapping yet. */
		start_index = 0;
	} else {
		/*
		 * Pick up from where we left off. _entry_for_offset()
		 * returns a pointer into the vim_entries array. If
		 * max_offset is greater than any of the mappings
		 * contained in the table  NULL will be returned and
		 * that indicates we've exhausted our iteration of the
		 * old_mapping.
		 */

		vdev_indirect_mapping_entry_phys_t *entry =
		    vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
		    max_offset);

		if (entry == NULL) {
			/*
			 * We've already written the whole new mapping.
			 * This special value will cause us to skip the
			 * generate_new_mapping step and just do the sync
			 * task to complete the condense.
			 */
			start_index = UINT64_MAX;
		} else {
			/* Convert the entry pointer back to an array index. */
			start_index = entry - old_mapping->vim_entries;
			ASSERT3U(start_index, <,
			    vdev_indirect_mapping_num_entries(old_mapping));
		}
	}

	spa_condense_indirect_generate_new_mapping(vd, counts, start_index);

	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);

	/*
	 * We may have bailed early from generate_new_mapping(), if
	 * the spa is shutting down.  In this case, do not complete
	 * the condense.
	 */
	if (!spa_shutting_down(spa)) {
		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
		    spa_condense_indirect_complete_sync, sci, 0,
		    ZFS_SPACE_CHECK_NONE));
	}

	/* Announce that the thread is gone to anyone waiting on the cv. */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_condense_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
}

/*
 * Sync task to begin the condensing process.
+ */ +void +spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + + ASSERT0(scip->scip_next_mapping_object); + ASSERT0(scip->scip_prev_obsolete_sm_object); + ASSERT0(scip->scip_vdev); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS)); + ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); + + uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd); + ASSERT(obsolete_sm_obj != 0); + + scip->scip_vdev = vd->vdev_id; + scip->scip_next_mapping_object = + vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); + + scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; + + /* + * We don't need to allocate a new space map object, since + * vdev_indirect_sync_obsolete will allocate one when needed. + */ + space_map_close(vd->vdev_obsolete_sm); + vd->vdev_obsolete_sm = NULL; + VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); + + VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), + sizeof (*scip) / sizeof (uint64_t), scip, tx)); + + ASSERT3P(spa->spa_condensing_indirect, ==, NULL); + spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); + + zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " + "posm=%llu nm=%llu", + vd->vdev_id, dmu_tx_get_txg(tx), + (u_longlong_t)scip->scip_prev_obsolete_sm_object, + (u_longlong_t)scip->scip_next_mapping_object); + + ASSERT3P(spa->spa_condense_thread, ==, NULL); + spa->spa_condense_thread = thread_create(NULL, 0, + spa_condense_indirect_thread, vd, 0, &p0, TS_RUN, minclsyspri); +} + +/* + * Sync to the given vdev's obsolete space map any segments that are no longer + * referenced as of the given txg. 
 *
 * If the obsolete space map doesn't exist yet, create and open it.
 */
void
vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	ASSERTV(vdev_indirect_config_t *vic = &vd->vdev_indirect_config);

	ASSERT3U(vic->vic_mapping_object, !=, 0);
	ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));

	if (vdev_obsolete_sm_object(vd) == 0) {
		/*
		 * First use: allocate the space map object, record it in
		 * the vdev's top-level ZAP, and open it.
		 */
		uint64_t obsolete_sm_object =
		    space_map_alloc(spa->spa_meta_objset, tx);

		ASSERT(vd->vdev_top_zap != 0);
		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
		    sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
		ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);

		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
		VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
		    spa->spa_meta_objset, obsolete_sm_object,
		    0, vd->vdev_asize, 0));
		space_map_update(vd->vdev_obsolete_sm);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);
	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	/* Write out the pending segments, then empty the in-core tree. */
	space_map_write(vd->vdev_obsolete_sm,
	    vd->vdev_obsolete_segments, SM_ALLOC, tx);
	space_map_update(vd->vdev_obsolete_sm);
	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}

/*
 * Load the persistent condensing state from the pool directory object
 * into spa_condensing_indirect_phys.  If an entry exists and the pool is
 * writeable, the in-core condensing state is created as well.
 *
 * Returns 0 if the entry was found or is absent (ENOENT); any other
 * zap_lookup() error is returned unchanged.
 */
int
spa_condense_init(spa_t *spa)
{
	int error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
	    &spa->spa_condensing_indirect_phys);
	if (error == 0) {
		if (spa_writeable(spa)) {
			spa->spa_condensing_indirect =
			    spa_condensing_indirect_create(spa);
		}
		return (0);
	} else if (error == ENOENT) {
		return (0);
	} else {
		return (error);
	}
}

/*
 * Release the in-core condensing state, if there is any.
 */
void
spa_condense_fini(spa_t *spa)
{
	if (spa->spa_condensing_indirect != NULL) {
		spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
		spa->spa_condensing_indirect = NULL;
	}
}

/*
 * Restart the condense - called when the pool is opened.
 */
void
spa_condense_indirect_restart(spa_t *spa)
{
	vdev_t *vd;
	ASSERT(spa->spa_condensing_indirect != NULL);
	/* Hold the vdev config lock only for the lookup itself. */
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa,
	    spa->spa_condensing_indirect_phys.scip_vdev);
	ASSERT(vd != NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	ASSERT3P(spa->spa_condense_thread, ==, NULL);
	spa->spa_condense_thread = thread_create(NULL, 0,
	    spa_condense_indirect_thread, vd, 0, &p0, TS_RUN,
	    minclsyspri);
}

/*
 * Gets the obsolete spacemap object from the vdev's ZAP.
 * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
 * exist yet.
 *
 * NOTE(review): sm_obj is a uint64_t but the function returns int, so the
 * object number is narrowed on return.  Widening the return type needs a
 * matching change to the prototype, which is not visible from here.
 */
int
vdev_obsolete_sm_object(vdev_t *vd)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	if (vd->vdev_top_zap == 0) {
		return (0);
	}

	uint64_t sm_obj = 0;
	int err;
	err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);

	/* On ENOENT sm_obj keeps its initial value of 0. */
	ASSERT(err == 0 || err == ENOENT);

	return (sm_obj);
}

/*
 * Returns B_TRUE if the vdev's top-level ZAP holds a non-zero
 * VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE value; B_FALSE if the entry is
 * absent, zero, or the vdev has no top-level ZAP.
 */
boolean_t
vdev_obsolete_counts_are_precise(vdev_t *vd)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	if (vd->vdev_top_zap == 0) {
		return (B_FALSE);
	}

	uint64_t val = 0;
	int err;
	err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);

	ASSERT(err == 0 || err == ENOENT);

	return (val != 0);
}

/* Intentionally empty: there is nothing to do on close. */
/* ARGSUSED */
static void
vdev_indirect_close(vdev_t *vd)
{
}

/* Intentionally empty: there is nothing to do on I/O completion. */
/* ARGSUSED */
static void
vdev_indirect_io_done(zio_t *zio)
{
}

/*
 * Report the vdev's size as its asize plus the front and back label
 * areas, and its ashift as recorded in the vdev.  Always succeeds.
 */
/* ARGSUSED */
static int
vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	*psize = *max_psize = vd->vdev_asize +
	    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	*ashift = vd->vdev_ashift;
	return (0);
}

/*
 * One pending piece of work for vdev_indirect_remap(): a contiguous
 * segment on vdev rs_vd that still has to be looked up in that vdev's
 * mapping.
 */
typedef struct remap_segment {
	vdev_t *rs_vd;			/* vdev the segment lives on */
	uint64_t rs_offset;		/* offset of the segment on rs_vd */
	uint64_t rs_asize;		/* remaining size of the segment */
	uint64_t rs_split_offset;	/* split offset handed to the callback */
	list_node_t rs_node;		/* linkage on the remap stack */
} remap_segment_t;

/*
 * Allocate and fill in a remap_segment_t.  rs_node is left for the list
 * code to set up.  The caller frees the segment with kmem_free().
 */
remap_segment_t *
rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
{
	remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
	rs->rs_vd = vd;
	rs->rs_offset = offset;
	rs->rs_asize = asize;
	rs->rs_split_offset = split_offset;
	return (rs);
}

/*
 * Goes through the relevant indirect mappings until it hits a concrete vdev
 * and issues the callback. On the way to the concrete vdev, if any other
 * indirect vdevs are encountered, then the callback will also be called on
 * each of those indirect vdevs. For example, if the segment is mapped to
 * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
 * mapped to segment B on concrete vdev 2, then the callback will be called on
 * both vdev 1 and vdev 2.
 *
 * While the callback passed to vdev_indirect_remap() is called on every vdev
 * the function encounters, certain callbacks only care about concrete vdevs.
 * These types of callbacks should return immediately and explicitly when they
 * are called on an indirect vdev.
 *
 * Because there is a possibility that a DVA section in the indirect device
 * has been split into multiple sections in our mapping, we keep track
 * of the relevant contiguous segments of the new location (remap_segment_t)
 * in a stack. This way we can call the callback for each of the new sections
 * created by a single section of the indirect device. Note though, that in
 * this scenario the callbacks in each split block won't occur in-order in
 * terms of offset, so callers should not make any assumptions about that.
+ * + * For callbacks that don't handle split blocks and immediately return when + * they encounter them (as is the case for remap_blkptr_cb), the caller can + * assume that its callback will be applied from the first indirect vdev + * encountered to the last one and then the concrete vdev, in that order. + */ +static void +vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, + void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg) +{ + list_t stack; + spa_t *spa = vd->vdev_spa; + + list_create(&stack, sizeof (remap_segment_t), + offsetof(remap_segment_t, rs_node)); + + for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0); + rs != NULL; rs = list_remove_head(&stack)) { + vdev_t *v = rs->rs_vd; + + /* + * Note: this can be called from open context + * (eg. zio_read()), so we need the rwlock to prevent + * the mapping from being changed by condensing. + */ + rw_enter(&v->vdev_indirect_rwlock, RW_READER); + vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping; + ASSERT3P(vim, !=, NULL); + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + ASSERT(rs->rs_asize > 0); + + vdev_indirect_mapping_entry_phys_t *mapping = + vdev_indirect_mapping_entry_for_offset(vim, rs->rs_offset); + ASSERT3P(mapping, !=, NULL); + + while (rs->rs_asize > 0) { + /* + * Note: the vdev_indirect_mapping can not change + * while we are running. It only changes while the + * removal is in progress, and then only from syncing + * context. While a removal is in progress, this + * function is only called for frees, which also only + * happen from syncing context. 
+ */ + + uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst); + uint64_t dst_offset = + DVA_GET_OFFSET(&mapping->vimep_dst); + uint64_t dst_vdev = DVA_GET_VDEV(&mapping->vimep_dst); + + ASSERT3U(rs->rs_offset, >=, + DVA_MAPPING_GET_SRC_OFFSET(mapping)); + ASSERT3U(rs->rs_offset, <, + DVA_MAPPING_GET_SRC_OFFSET(mapping) + size); + ASSERT3U(dst_vdev, !=, v->vdev_id); + + uint64_t inner_offset = rs->rs_offset - + DVA_MAPPING_GET_SRC_OFFSET(mapping); + uint64_t inner_size = + MIN(rs->rs_asize, size - inner_offset); + + vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); + ASSERT3P(dst_v, !=, NULL); + + if (dst_v->vdev_ops == &vdev_indirect_ops) { + list_insert_head(&stack, + rs_alloc(dst_v, dst_offset + inner_offset, + inner_size, rs->rs_split_offset)); + + } + + if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) && + IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) { + /* + * Note: This clause exists only solely for + * testing purposes. We use it to ensure that + * split blocks work and that the callbacks + * using them yield the same result if issued + * in reverse order. 
+ */ + uint64_t inner_half = inner_size / 2; + + func(rs->rs_split_offset + inner_half, dst_v, + dst_offset + inner_offset + inner_half, + inner_half, arg); + + func(rs->rs_split_offset, dst_v, + dst_offset + inner_offset, + inner_half, arg); + } else { + func(rs->rs_split_offset, dst_v, + dst_offset + inner_offset, + inner_size, arg); + } + + rs->rs_offset += inner_size; + rs->rs_asize -= inner_size; + rs->rs_split_offset += inner_size; + mapping++; + } + + rw_exit(&v->vdev_indirect_rwlock); + kmem_free(rs, sizeof (remap_segment_t)); + } + list_destroy(&stack); +} + +static void +vdev_indirect_child_io_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + mutex_enter(&pio->io_lock); + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); + mutex_exit(&pio->io_lock); + + abd_put(zio->io_abd); +} + +static void +vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + zio_t *zio = arg; + + ASSERT3P(vd, !=, NULL); + + if (vd->vdev_ops == &vdev_indirect_ops) + return; + + zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, + abd_get_offset(zio->io_abd, split_offset), + size, zio->io_type, zio->io_priority, + 0, vdev_indirect_child_io_done, zio)); +} + +static void +vdev_indirect_io_start(zio_t *zio) +{ + ASSERTV(spa_t *spa = zio->io_spa); + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + if (zio->io_type != ZIO_TYPE_READ) { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT((zio->io_flags & + (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0); + } + + vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, + vdev_indirect_io_start_cb, zio); + + zio_execute(zio); +} + +vdev_ops_t vdev_indirect_ops = { + vdev_indirect_open, + vdev_indirect_close, + vdev_default_asize, + vdev_indirect_io_start, + vdev_indirect_io_done, + NULL, + NULL, + NULL, + NULL, + vdev_indirect_remap, + VDEV_TYPE_INDIRECT, /* name of this vdev type */ + B_FALSE /* leaf vdev */ +}; + +#if defined(_KERNEL) && 
defined(HAVE_SPL) +EXPORT_SYMBOL(rs_alloc); +EXPORT_SYMBOL(spa_condense_fini); +EXPORT_SYMBOL(spa_condense_indirect_restart); +EXPORT_SYMBOL(spa_condense_indirect_start_sync); +EXPORT_SYMBOL(spa_condense_init); +EXPORT_SYMBOL(spa_vdev_indirect_mark_obsolete); +EXPORT_SYMBOL(vdev_indirect_mark_obsolete); +EXPORT_SYMBOL(vdev_indirect_should_condense); +EXPORT_SYMBOL(vdev_indirect_sync_obsolete); +EXPORT_SYMBOL(vdev_obsolete_counts_are_precise); +EXPORT_SYMBOL(vdev_obsolete_sm_object); + +/* CSTYLED */ +module_param(zfs_condense_min_mapping_bytes, ulong, 0644); +MODULE_PARM_DESC(zfs_condense_min_mapping_bytes, + "Minimum size of vdev mapping to condense"); + +module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644); +MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms, + "Delay while condensing vdev mapping"); +#endif diff --git a/module/zfs/vdev_indirect_births.c b/module/zfs/vdev_indirect_births.c new file mode 100644 index 000000000..a0163b2e5 --- /dev/null +++ b/module/zfs/vdev_indirect_births.c @@ -0,0 +1,226 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. 
+ */ + +#include <sys/dmu_tx.h> +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/vdev_indirect_births.h> + +#ifdef ZFS_DEBUG +static boolean_t +vdev_indirect_births_verify(vdev_indirect_births_t *vib) +{ + ASSERT(vib != NULL); + + ASSERT(vib->vib_object != 0); + ASSERT(vib->vib_objset != NULL); + ASSERT(vib->vib_phys != NULL); + ASSERT(vib->vib_dbuf != NULL); + + EQUIV(vib->vib_phys->vib_count > 0, vib->vib_entries != NULL); + + return (B_TRUE); +} +#endif + +uint64_t +vdev_indirect_births_count(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + + return (vib->vib_phys->vib_count); +} + +uint64_t +vdev_indirect_births_object(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + + return (vib->vib_object); +} + +static uint64_t +vdev_indirect_births_size_impl(vdev_indirect_births_t *vib) +{ + return (vib->vib_phys->vib_count * sizeof (*vib->vib_entries)); +} + +void +vdev_indirect_births_close(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + + if (vib->vib_phys->vib_count > 0) { + uint64_t births_size = vdev_indirect_births_size_impl(vib); + + kmem_free(vib->vib_entries, births_size); + vib->vib_entries = NULL; + } + + dmu_buf_rele(vib->vib_dbuf, vib); + + vib->vib_objset = NULL; + vib->vib_object = 0; + vib->vib_dbuf = NULL; + vib->vib_phys = NULL; + + kmem_free(vib, sizeof (*vib)); +} + +uint64_t +vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + + return (dmu_object_alloc(os, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (vdev_indirect_birth_phys_t), + tx)); +} + +vdev_indirect_births_t * +vdev_indirect_births_open(objset_t *os, uint64_t births_object) +{ + vdev_indirect_births_t *vib = kmem_zalloc(sizeof (*vib), KM_SLEEP); + + vib->vib_objset = os; + vib->vib_object = births_object; + + VERIFY0(dmu_bonus_hold(os, vib->vib_object, vib, &vib->vib_dbuf)); + vib->vib_phys 
= vib->vib_dbuf->db_data; + + if (vib->vib_phys->vib_count > 0) { + uint64_t births_size = vdev_indirect_births_size_impl(vib); + vib->vib_entries = kmem_alloc(births_size, KM_SLEEP); + VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0, + births_size, vib->vib_entries, DMU_READ_PREFETCH)); + } + + ASSERT(vdev_indirect_births_verify(vib)); + + return (vib); +} + +void +vdev_indirect_births_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + VERIFY0(dmu_object_free(os, object, tx)); +} + +void +vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, + uint64_t max_offset, uint64_t txg, dmu_tx_t *tx) +{ + vdev_indirect_birth_entry_phys_t vibe; + uint64_t old_size; + uint64_t new_size; + vdev_indirect_birth_entry_phys_t *new_entries; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); + ASSERT(vdev_indirect_births_verify(vib)); + + dmu_buf_will_dirty(vib->vib_dbuf, tx); + + vibe.vibe_offset = max_offset; + vibe.vibe_phys_birth_txg = txg; + + old_size = vdev_indirect_births_size_impl(vib); + dmu_write(vib->vib_objset, vib->vib_object, old_size, sizeof (vibe), + &vibe, tx); + vib->vib_phys->vib_count++; + new_size = vdev_indirect_births_size_impl(vib); + + new_entries = kmem_alloc(new_size, KM_SLEEP); + if (old_size > 0) { + bcopy(vib->vib_entries, new_entries, old_size); + kmem_free(vib->vib_entries, old_size); + } + new_entries[vib->vib_phys->vib_count - 1] = vibe; + vib->vib_entries = new_entries; +} + +uint64_t +vdev_indirect_births_last_entry_txg(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + ASSERT(vib->vib_phys->vib_count > 0); + + vdev_indirect_birth_entry_phys_t *last = + &vib->vib_entries[vib->vib_phys->vib_count - 1]; + return (last->vibe_phys_birth_txg); +} + +/* + * Return the txg in which the given range was copied (i.e. its physical + * birth txg). The specified offset+asize must be contiguously mapped + * (i.e. not a split block). 
+ * + * The entries are sorted by increasing phys_birth, and also by increasing + * offset. We find the specified offset by binary search. Note that we + * can not use bsearch() because looking at each entry independently is + * insufficient to find the correct entry. Each entry implicitly relies + * on the previous entry: an entry indicates that the offsets from the + * end of the previous entry to the end of this entry were written in the + * specified txg. + */ +uint64_t +vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, uint64_t offset, + uint64_t asize) +{ + vdev_indirect_birth_entry_phys_t *base; + vdev_indirect_birth_entry_phys_t *last; + + ASSERT(vdev_indirect_births_verify(vib)); + ASSERT(vib->vib_phys->vib_count > 0); + + base = vib->vib_entries; + last = base + vib->vib_phys->vib_count - 1; + + ASSERT3U(offset, <, last->vibe_offset); + + while (last >= base) { + vdev_indirect_birth_entry_phys_t *p = + base + ((last - base) / 2); + if (offset >= p->vibe_offset) { + base = p + 1; + } else if (p == vib->vib_entries || + offset >= (p - 1)->vibe_offset) { + ASSERT3U(offset + asize, <=, p->vibe_offset); + return (p->vibe_phys_birth_txg); + } else { + last = p - 1; + } + } + ASSERT(!"offset not found"); + return (-1); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +EXPORT_SYMBOL(vdev_indirect_births_add_entry); +EXPORT_SYMBOL(vdev_indirect_births_alloc); +EXPORT_SYMBOL(vdev_indirect_births_close); +EXPORT_SYMBOL(vdev_indirect_births_count); +EXPORT_SYMBOL(vdev_indirect_births_free); +EXPORT_SYMBOL(vdev_indirect_births_last_entry_txg); +EXPORT_SYMBOL(vdev_indirect_births_object); +EXPORT_SYMBOL(vdev_indirect_births_open); +EXPORT_SYMBOL(vdev_indirect_births_physbirth); +#endif diff --git a/module/zfs/vdev_indirect_mapping.c b/module/zfs/vdev_indirect_mapping.c new file mode 100644 index 000000000..dbd6a7635 --- /dev/null +++ b/module/zfs/vdev_indirect_mapping.c @@ -0,0 +1,616 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied 
under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. + */ + +#include <sys/dmu_tx.h> +#include <sys/dsl_pool.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_indirect_mapping.h> +#include <sys/zfeature.h> +#include <sys/dmu_objset.h> + +#ifdef ZFS_DEBUG +static boolean_t +vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim) +{ + ASSERT(vim != NULL); + + ASSERT(vim->vim_object != 0); + ASSERT(vim->vim_objset != NULL); + ASSERT(vim->vim_phys != NULL); + ASSERT(vim->vim_dbuf != NULL); + + EQUIV(vim->vim_phys->vimp_num_entries > 0, + vim->vim_entries != NULL); + if (vim->vim_phys->vimp_num_entries > 0) { + ASSERTV(vdev_indirect_mapping_entry_phys_t *last_entry = + &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1]); + ASSERTV(uint64_t offset = + DVA_MAPPING_GET_SRC_OFFSET(last_entry)); + ASSERTV(uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst)); + + ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size); + } + if (vim->vim_havecounts) { + ASSERT(vim->vim_phys->vimp_counts_object != 0); + } + + return (B_TRUE); +} +#endif + +uint64_t +vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_phys->vimp_num_entries); +} + +uint64_t +vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_phys->vimp_max_offset); +} + +uint64_t +vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_object); +} + +uint64_t 
+vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_phys->vimp_bytes_mapped); +} + +/* + * The length (in bytes) of the mapping object array in memory and + * (logically) on disk. + * + * Note that unlike most of our accessor functions, + * we don't assert that the struct is consistent; therefore it can be + * called while there may be concurrent changes, if we don't care about + * the value being immediately stale (e.g. from spa_removal_get_stats()). + */ +uint64_t +vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim) +{ + return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries)); +} + +/* + * Compare an offset with an indirect mapping entry; there are three + * possible scenarios: + * + * 1. The offset is "less than" the mapping entry; meaning the + * offset is less than the source offset of the mapping entry. In + * this case, there is no overlap between the offset and the + * mapping entry and -1 will be returned. + * + * 2. The offset is "greater than" the mapping entry; meaning the + * offset is greater than the mapping entry's source offset plus + * the entry's size. In this case, there is no overlap between + * the offset and the mapping entry and 1 will be returned. + * + * NOTE: If the offset is actually equal to the entry's offset + * plus size, this is considered to be "greater" than the entry, + * and this case applies (i.e. 1 will be returned). Thus, the + * entry's "range" can be considered to be inclusive at its + * start, but exclusive at its end: e.g. [src, src + size). + * + * 3. The last case to consider is if the offset actually falls + * within the mapping entry's range. If this is the case, the + * offset is considered to be "equal to" the mapping entry and + * 0 will be returned. + * + * NOTE: If the offset is equal to the entry's source offset, + * this case applies and 0 will be returned. 
If the offset is + * equal to the entry's source plus its size, this case does + * *not* apply (see "NOTE" above for scenario 2), and 1 will be + * returned. + */ +static int +dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem) +{ + const uint64_t * const key = v_key; + const vdev_indirect_mapping_entry_phys_t * const array_elem = + v_array_elem; + uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem); + + if (*key < src_offset) { + return (-1); + } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) { + return (0); + } else { + return (1); + } +} + +/* + * Returns the mapping entry for the given offset. + * + * It's possible that the given offset will not be in the mapping table + * (i.e. no mapping entries contain this offset), in which case, the + * return value depends on the "next_if_missing" parameter. + * + * If the offset is not found in the table and "next_if_missing" is + * B_FALSE, then NULL will always be returned. The behavior is intended + * to allow consumers to get the entry corresponding to the offset + * parameter, iff the offset overlaps with an entry in the table. + * + * If the offset is not found in the table and "next_if_missing" is + * B_TRUE, then the entry nearest to the given offset will be returned, + * such that the entry's source offset is greater than the offset + * passed in (i.e. the "next" mapping entry in the table is returned, if + * the offset is missing from the table). If there are no entries whose + * source offset is greater than the passed in offset, NULL is returned. 
+ */ +static vdev_indirect_mapping_entry_phys_t * +vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim, + uint64_t offset, boolean_t next_if_missing) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + ASSERT(vim->vim_phys->vimp_num_entries > 0); + + vdev_indirect_mapping_entry_phys_t *entry = NULL; + + uint64_t last = vim->vim_phys->vimp_num_entries - 1; + uint64_t base = 0; + + /* + * We don't define these inside of the while loop because we use + * their value in the case that offset isn't in the mapping. + */ + uint64_t mid; + int result; + + while (last >= base) { + mid = base + ((last - base) >> 1); + + result = dva_mapping_overlap_compare(&offset, + &vim->vim_entries[mid]); + + if (result == 0) { + entry = &vim->vim_entries[mid]; + break; + } else if (result < 0) { + /* + * "mid - 1" wraps to UINT64_MAX when mid is 0 + * (the offset precedes every entry). That keeps + * "base == last + 1" true for the checks below, + * but "last >= base" would never become false, + * so leave the loop explicitly instead of + * probing far outside the entries array. + */ + last = mid - 1; + if (mid == 0) + break; + } else { + base = mid + 1; + } + } + + if (entry == NULL && next_if_missing) { + ASSERT3U(base, ==, last + 1); + ASSERT(mid == base || mid == last); + ASSERT3S(result, !=, 0); + + /* + * The offset we're looking for isn't actually contained + * in the mapping table, thus we need to return the + * closest mapping entry that is greater than the + * offset. We reuse the result of the last comparison, + * comparing the mapping entry at index "mid" and the + * offset. The offset is guaranteed to lie between + * indices one less than "mid", and one greater than + * "mid"; we just need to determine if offset is greater + * than, or less than the mapping entry contained at + * index "mid". + */ + + uint64_t index; + if (result < 0) + index = mid; + else + index = mid + 1; + + ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries); + + if (index == vim->vim_phys->vimp_num_entries) { + /* + * If "index" is past the end of the entries + * array, then not only is the offset not in the + * mapping table, but it's actually greater than + * all entries in the table. 
In this case, we + * can't return a mapping entry greater than the + * offset (since none exist), so we return NULL. + */ + + ASSERT3S(dva_mapping_overlap_compare(&offset, + &vim->vim_entries[index - 1]), >, 0); + + return (NULL); + } else { + /* + * Just to be safe, we verify the offset falls + * in between the mapping entries at index and + * one less than index. Since we know the offset + * doesn't overlap an entry, and we're supposed + * to return the entry just greater than the + * offset, both of the following tests must be + * true. + */ + ASSERT3S(dva_mapping_overlap_compare(&offset, + &vim->vim_entries[index]), <, 0); + IMPLY(index >= 1, dva_mapping_overlap_compare(&offset, + &vim->vim_entries[index - 1]) > 0); + + return (&vim->vim_entries[index]); + } + } else { + return (entry); + } +} + +vdev_indirect_mapping_entry_phys_t * +vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, + uint64_t offset) +{ + return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, + B_FALSE)); +} + +vdev_indirect_mapping_entry_phys_t * +vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, + uint64_t offset) +{ + return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, + B_TRUE)); +} + + +void +vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + if (vim->vim_phys->vimp_num_entries > 0) { + uint64_t map_size = vdev_indirect_mapping_size(vim); + vmem_free(vim->vim_entries, map_size); + vim->vim_entries = NULL; + } + + dmu_buf_rele(vim->vim_dbuf, vim); + + vim->vim_objset = NULL; + vim->vim_object = 0; + vim->vim_dbuf = NULL; + vim->vim_phys = NULL; + + kmem_free(vim, sizeof (*vim)); +} + +uint64_t +vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx) +{ + uint64_t object; + ASSERT(dmu_tx_is_syncing(tx)); + uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0; + + if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + bonus_size = sizeof 
(vdev_indirect_mapping_phys_t); + } + + object = dmu_object_alloc(os, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, bonus_size, + tx); + + if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + dmu_buf_t *dbuf; + vdev_indirect_mapping_phys_t *vimp; + + VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + vimp = dbuf->db_data; + vimp->vimp_counts_object = dmu_object_alloc(os, + DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OT_NONE, 0, tx); + spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + dmu_buf_rele(dbuf, FTAG); + } + + return (object); +} + + +vdev_indirect_mapping_t * +vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object) +{ + vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP); + dmu_object_info_t doi; + VERIFY0(dmu_object_info(os, mapping_object, &doi)); + + vim->vim_objset = os; + vim->vim_object = mapping_object; + + VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim, + &vim->vim_dbuf)); + vim->vim_phys = vim->vim_dbuf->db_data; + + vim->vim_havecounts = + (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0); + + if (vim->vim_phys->vimp_num_entries > 0) { + uint64_t map_size = vdev_indirect_mapping_size(vim); + vim->vim_entries = vmem_alloc(map_size, KM_SLEEP); + VERIFY0(dmu_read(os, vim->vim_object, 0, map_size, + vim->vim_entries, DMU_READ_PREFETCH)); + } + + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim); +} + +void +vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object); + if (vim->vim_havecounts) { + VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object, + tx)); + spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + } + vdev_indirect_mapping_close(vim); + + VERIFY0(dmu_object_free(os, object, tx)); +} + +/* + * Append the list of vdev_indirect_mapping_entry_t's to the on-disk + * mapping object. 
Also remove the entries from the list and free them. + * This also implicitly extends the max_offset of the mapping (to the end + * of the last entry). + */ +void +vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, + list_t *list, dmu_tx_t *tx) +{ + vdev_indirect_mapping_entry_phys_t *mapbuf; + uint64_t old_size; + uint32_t *countbuf = NULL; + vdev_indirect_mapping_entry_phys_t *old_entries; + uint64_t old_count; + uint64_t entries_written = 0; + + ASSERT(vdev_indirect_mapping_verify(vim)); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); + ASSERT(!list_is_empty(list)); + + old_size = vdev_indirect_mapping_size(vim); + old_entries = vim->vim_entries; + old_count = vim->vim_phys->vimp_num_entries; + + dmu_buf_will_dirty(vim->vim_dbuf, tx); + + mapbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP); + if (vim->vim_havecounts) { + countbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP); + ASSERT(spa_feature_is_active(vim->vim_objset->os_spa, + SPA_FEATURE_OBSOLETE_COUNTS)); + } + while (!list_is_empty(list)) { + uint64_t i; + /* + * Write entries from the list to the + * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE. + */ + for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) { + vdev_indirect_mapping_entry_t *entry = + list_remove_head(list); + if (entry == NULL) + break; + + uint64_t size = + DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst); + uint64_t src_offset = + DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping); + + /* + * We shouldn't be adding an entry which is fully + * obsolete. 
+ */ + ASSERT3U(entry->vime_obsolete_count, <, size); + IMPLY(entry->vime_obsolete_count != 0, + vim->vim_havecounts); + + mapbuf[i] = entry->vime_mapping; + if (vim->vim_havecounts) + countbuf[i] = entry->vime_obsolete_count; + + vim->vim_phys->vimp_bytes_mapped += size; + ASSERT3U(src_offset, >=, + vim->vim_phys->vimp_max_offset); + vim->vim_phys->vimp_max_offset = src_offset + size; + + entries_written++; + + vmem_free(entry, sizeof (*entry)); + } + dmu_write(vim->vim_objset, vim->vim_object, + vim->vim_phys->vimp_num_entries * sizeof (*mapbuf), + i * sizeof (*mapbuf), + mapbuf, tx); + if (vim->vim_havecounts) { + dmu_write(vim->vim_objset, + vim->vim_phys->vimp_counts_object, + vim->vim_phys->vimp_num_entries * + sizeof (*countbuf), + i * sizeof (*countbuf), countbuf, tx); + } + vim->vim_phys->vimp_num_entries += i; + } + vmem_free(mapbuf, SPA_OLD_MAXBLOCKSIZE); + if (vim->vim_havecounts) + vmem_free(countbuf, SPA_OLD_MAXBLOCKSIZE); + + /* + * Update the entry array to reflect the new entries. First, copy + * over any old entries then read back the new entries we just wrote. + */ + uint64_t new_size = vdev_indirect_mapping_size(vim); + ASSERT3U(new_size, >, old_size); + ASSERT3U(new_size - old_size, ==, + entries_written * sizeof (vdev_indirect_mapping_entry_phys_t)); + vim->vim_entries = vmem_alloc(new_size, KM_SLEEP); + if (old_size > 0) { + bcopy(old_entries, vim->vim_entries, old_size); + vmem_free(old_entries, old_size); + } + VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size, + new_size - old_size, &vim->vim_entries[old_count], + DMU_READ_PREFETCH)); + + zfs_dbgmsg("txg %llu: wrote %llu entries to " + "indirect mapping obj %llu; max offset=0x%llx", + (u_longlong_t)dmu_tx_get_txg(tx), + (u_longlong_t)entries_written, + (u_longlong_t)vim->vim_object, + (u_longlong_t)vim->vim_phys->vimp_max_offset); +} + +/* + * Increment the relevant counts for the specified offset and length. 
+ * The counts array must be obtained from + * vdev_indirect_mapping_load_obsolete_counts(). + */ +void +vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim, + uint64_t offset, uint64_t length, uint32_t *counts) +{ + vdev_indirect_mapping_entry_phys_t *mapping; + uint64_t index; + + mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); + + ASSERT(length > 0); + ASSERT3P(mapping, !=, NULL); + + index = mapping - vim->vim_entries; + + while (length > 0) { + ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim)); + + uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst); + uint64_t inner_offset = offset - + DVA_MAPPING_GET_SRC_OFFSET(mapping); + VERIFY3U(inner_offset, <, size); + uint64_t inner_size = MIN(length, size - inner_offset); + + VERIFY3U(counts[index] + inner_size, <=, size); + counts[index] += inner_size; + + offset += inner_size; + length -= inner_size; + mapping++; + index++; + } +} + +typedef struct load_obsolete_space_map_arg { + vdev_indirect_mapping_t *losma_vim; + uint32_t *losma_counts; +} load_obsolete_space_map_arg_t; + +static int +load_obsolete_sm_callback(maptype_t type, uint64_t offset, uint64_t size, + void *arg) +{ + load_obsolete_space_map_arg_t *losma = arg; + ASSERT3S(type, ==, SM_ALLOC); + + vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim, + offset, size, losma->losma_counts); + + return (0); +} + +/* + * Modify the counts (increment them) based on the spacemap. + */ +void +vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim, + uint32_t *counts, space_map_t *obsolete_space_sm) +{ + load_obsolete_space_map_arg_t losma; + losma.losma_counts = counts; + losma.losma_vim = vim; + VERIFY0(space_map_iterate(obsolete_space_sm, + load_obsolete_sm_callback, &losma)); +} + +/* + * Read the obsolete counts from disk, returning them in an array. 
+ */ +uint32_t * +vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + uint64_t counts_size = + vim->vim_phys->vimp_num_entries * sizeof (uint32_t); + uint32_t *counts = vmem_alloc(counts_size, KM_SLEEP); + if (vim->vim_havecounts) { + VERIFY0(dmu_read(vim->vim_objset, + vim->vim_phys->vimp_counts_object, + 0, counts_size, + counts, DMU_READ_PREFETCH)); + } else { + bzero(counts, counts_size); + } + return (counts); +} + +extern void +vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim, + uint32_t *counts) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + vmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t)); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +EXPORT_SYMBOL(vdev_indirect_mapping_add_entries); +EXPORT_SYMBOL(vdev_indirect_mapping_alloc); +EXPORT_SYMBOL(vdev_indirect_mapping_bytes_mapped); +EXPORT_SYMBOL(vdev_indirect_mapping_close); +EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset); +EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset_or_next); +EXPORT_SYMBOL(vdev_indirect_mapping_free); +EXPORT_SYMBOL(vdev_indirect_mapping_free_obsolete_counts); +EXPORT_SYMBOL(vdev_indirect_mapping_increment_obsolete_count); +EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_counts); +EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_spacemap); +EXPORT_SYMBOL(vdev_indirect_mapping_max_offset); +EXPORT_SYMBOL(vdev_indirect_mapping_num_entries); +EXPORT_SYMBOL(vdev_indirect_mapping_object); +EXPORT_SYMBOL(vdev_indirect_mapping_open); +EXPORT_SYMBOL(vdev_indirect_mapping_size); +#endif diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 4fee4bc7a..70d68d903 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -143,6 +143,7 @@ #include <sys/vdev_impl.h> #include <sys/uberblock_impl.h> #include <sys/metaslab.h> +#include <sys/metaslab_impl.h> #include <sys/zio.h> #include <sys/dsl_scan.h> #include <sys/abd.h> @@ -359,6 
+360,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags) { nvlist_t *nv = NULL; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + nv = fnvlist_alloc(); fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); @@ -425,9 +428,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); - if (vd->vdev_removing) + if (vd->vdev_removing) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); + } } if (vd->vdev_dtl_sm != NULL) { @@ -435,6 +439,21 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, space_map_object(vd->vdev_dtl_sm)); } + if (vic->vic_mapping_object != 0) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, + vic->vic_mapping_object); + } + + if (vic->vic_births_object != 0) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, + vic->vic_births_object); + } + + if (vic->vic_prev_indirect_vdev != UINT64_MAX) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, + vic->vic_prev_indirect_vdev); + } + if (vd->vdev_crtxg) fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); @@ -453,16 +472,70 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, } if (getstats) { - pool_scan_stat_t ps; - vdev_config_generate_stats(vd, nv); /* provide either current or previous scan information */ + pool_scan_stat_t ps; if (spa_scan_get_stats(spa, &ps) == 0) { fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, sizeof (pool_scan_stat_t) / sizeof (uint64_t)); } + + pool_removal_stat_t prs; + if (spa_removal_get_stats(spa, &prs) == 0) { + fnvlist_add_uint64_array(nv, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, + sizeof (prs) / sizeof (uint64_t)); + } + + /* + * Note: this can be called from open context + * (spa_get_stats()), so we need the rwlock to prevent + * the mapping from being changed 
by condensing. + */ + rw_enter(&vd->vdev_indirect_rwlock, RW_READER); + if (vd->vdev_indirect_mapping != NULL) { + ASSERT(vd->vdev_indirect_births != NULL); + vdev_indirect_mapping_t *vim = + vd->vdev_indirect_mapping; + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, + vdev_indirect_mapping_size(vim)); + } + rw_exit(&vd->vdev_indirect_rwlock); + if (vd->vdev_mg != NULL && + vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) { + /* + * Compute approximately how much memory would be used + * for the indirect mapping if this device were to + * be removed. + * + * Note: If the frag metric is invalid, then not + * enough metaslabs have been converted to have + * histograms. + */ + uint64_t seg_count = 0; + + /* + * There are the same number of allocated segments + * as free segments, so we will have at least one + * entry per free segment. + */ + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + seg_count += vd->vdev_mg->mg_histogram[i]; + } + + /* + * The maximum length of a mapping is SPA_MAXBLOCKSIZE, + * so we need at least one entry per SPA_MAXBLOCKSIZE + * of allocated data. 
+ */ + seg_count += vd->vdev_stat.vs_alloc / SPA_MAXBLOCKSIZE; + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, + seg_count * + sizeof (vdev_indirect_mapping_entry_phys_t)); + } } if (!vd->vdev_ops->vdev_op_leaf) { @@ -567,8 +640,9 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config) for (c = 0, idx = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - if (tvd->vdev_ishole) + if (tvd->vdev_ishole) { array[idx++] = c; + } } if (idx) { @@ -1263,8 +1337,11 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) */ zio = zio_root(spa, NULL, NULL, flags); - for (int v = 0; v < svdcount; v++) - zio_flush(zio, svd[v]); + for (int v = 0; v < svdcount; v++) { + if (vdev_writeable(svd[v])) { + zio_flush(zio, svd[v]); + } + } (void) zio_wait(zio); diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 1d5adce17..b56f955eb 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -679,6 +679,7 @@ vdev_ops_t vdev_mirror_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_MIRROR, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -693,6 +694,7 @@ vdev_ops_t vdev_replacing_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_REPLACING, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -707,6 +709,7 @@ vdev_ops_t vdev_spare_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_SPARE, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c index d7d017fb8..b1c039f16 100644 --- a/module/zfs/vdev_missing.c +++ b/module/zfs/vdev_missing.c @@ -89,6 +89,7 @@ vdev_ops_t vdev_missing_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_MISSING, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -103,6 +104,7 @@ vdev_ops_t vdev_hole_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_HOLE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 5d2c98013..3ac31a872 100644 
--- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -152,6 +152,8 @@ uint32_t zfs_vdev_async_write_min_active = 2; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 2; +uint32_t zfs_vdev_removal_min_active = 1; +uint32_t zfs_vdev_removal_max_active = 2; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent @@ -248,6 +250,8 @@ vdev_queue_class_min_active(zio_priority_t p) return (zfs_vdev_async_write_min_active); case ZIO_PRIORITY_SCRUB: return (zfs_vdev_scrub_min_active); + case ZIO_PRIORITY_REMOVAL: + return (zfs_vdev_removal_min_active); default: panic("invalid priority %u", p); return (0); @@ -316,6 +320,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) return (vdev_queue_max_async_writes(spa)); case ZIO_PRIORITY_SCRUB: return (zfs_vdev_scrub_max_active); + case ZIO_PRIORITY_REMOVAL: + return (zfs_vdev_removal_max_active); default: panic("invalid priority %u", p); return (0); @@ -560,7 +566,8 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) while ((dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= limit && - IO_GAP(dio, first) <= maxgap) { + IO_GAP(dio, first) <= maxgap && + dio->io_type == zio->io_type) { first = dio; if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = first; @@ -586,7 +593,8 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) (IO_SPAN(first, dio) <= limit || (dio->io_flags & ZIO_FLAG_OPTIONAL)) && IO_SPAN(first, dio) <= maxblocksize && - IO_GAP(last, dio) <= maxgap) { + IO_GAP(last, dio) <= maxgap && + dio->io_type == zio->io_type) { last = dio; if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = last; @@ -757,12 +765,14 @@ vdev_queue_io(zio_t *zio) if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && zio->io_priority != ZIO_PRIORITY_ASYNC_READ && - zio->io_priority != ZIO_PRIORITY_SCRUB) + 
zio->io_priority != ZIO_PRIORITY_SCRUB && + zio->io_priority != ZIO_PRIORITY_REMOVAL) zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } else { ASSERT(zio->io_type == ZIO_TYPE_WRITE); if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && - zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE) + zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && + zio->io_priority != ZIO_PRIORITY_REMOVAL) zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index ef81af6f7..a21baf9c2 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2333,6 +2333,7 @@ vdev_ops_t vdev_raidz_ops = { vdev_raidz_need_resilver, NULL, NULL, + NULL, VDEV_TYPE_RAIDZ, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c new file mode 100644 index 000000000..6e81bf014 --- /dev/null +++ b/module/zfs/vdev_removal.c @@ -0,0 +1,1925 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/zap.h> +#include <sys/vdev_impl.h> +#include <sys/metaslab.h> +#include <sys/metaslab_impl.h> +#include <sys/uberblock_impl.h> +#include <sys/txg.h> +#include <sys/avl.h> +#include <sys/bpobj.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_dir.h> +#include <sys/arc.h> +#include <sys/zfeature.h> +#include <sys/vdev_indirect_births.h> +#include <sys/vdev_indirect_mapping.h> +#include <sys/abd.h> +#include <sys/trace_vdev.h> + +/* + * This file contains the necessary logic to remove vdevs from a + * storage pool. Currently, the only devices that can be removed + * are log, cache, and spare devices; and top level vdevs from a pool + * w/o raidz or mirrors. (Note that members of a mirror can be removed + * by the detach operation.) + * + * Log vdevs are removed by evacuating them and then turning the vdev + * into a hole vdev while holding spa config locks. + * + * Top level vdevs are removed and converted into an indirect vdev via + * a multi-step process: + * + * - Disable allocations from this device (spa_vdev_remove_top). + * + * - From a new thread (spa_vdev_remove_thread), copy data from + * the removing vdev to a different vdev. The copy happens in open + * context (spa_vdev_copy_impl) and issues a sync task + * (vdev_mapping_sync) so the sync thread can update the partial + * indirect mappings in core and on disk. + * + * - If a free happens during a removal, it is freed from the + * removing vdev, and if it has already been copied, from the new + * location as well (free_from_removing_vdev). + * + * - After the removal is completed, the copy thread converts the vdev + * into an indirect vdev (vdev_remove_complete) before instructing + * the sync thread to destroy the space maps and finish the removal + * (spa_finish_removal). 
+ */ + +typedef struct vdev_copy_arg { + metaslab_t *vca_msp; + uint64_t vca_outstanding_bytes; + kcondvar_t vca_cv; + kmutex_t vca_lock; +} vdev_copy_arg_t; + +typedef struct vdev_copy_seg_arg { + vdev_copy_arg_t *vcsa_copy_arg; + uint64_t vcsa_txg; + dva_t *vcsa_dest_dva; + blkptr_t *vcsa_dest_bp; +} vdev_copy_seg_arg_t; + +/* + * The maximum amount of data we're allowed to copy from a device + * at a time when removing it. + */ +int zfs_remove_max_copy_bytes = 8 * 1024 * 1024; + +/* + * The largest contiguous segment that we will attempt to allocate when + * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If + * there is a performance problem with attempting to allocate large blocks, + * consider decreasing this. + */ +int zfs_remove_max_segment = SPA_MAXBLOCKSIZE; + +#define VDEV_REMOVAL_ZAP_OBJS "lzap" + +static void spa_vdev_remove_thread(void *arg); + +static void +spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) +{ + VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_REMOVING, sizeof (uint64_t), + sizeof (spa->spa_removing_phys) / sizeof (uint64_t), + &spa->spa_removing_phys, tx)); +} + +static nvlist_t * +spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) +{ + for (int i = 0; i < count; i++) { + uint64_t guid = + fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID); + + if (guid == target_guid) + return (nvpp[i]); + } + + return (NULL); +} + +static void +spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, + nvlist_t *dev_to_remove) +{ + nvlist_t **newdev = NULL; + + if (count > 1) + newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); + + for (int i = 0, j = 0; i < count; i++) { + if (dev[i] == dev_to_remove) + continue; + VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); + } + + VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); + VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); + + for 
(int i = 0; i < count - 1; i++) + nvlist_free(newdev[i]); + + if (count > 1) + kmem_free(newdev, (count - 1) * sizeof (void *)); +} + +static spa_vdev_removal_t * +spa_vdev_removal_create(vdev_t *vd) +{ + spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); + mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); + svr->svr_allocd_segs = range_tree_create(NULL, NULL); + svr->svr_vdev = vd; + + for (int i = 0; i < TXG_SIZE; i++) { + svr->svr_frees[i] = range_tree_create(NULL, NULL); + list_create(&svr->svr_new_segments[i], + sizeof (vdev_indirect_mapping_entry_t), + offsetof(vdev_indirect_mapping_entry_t, vime_node)); + } + + return (svr); +} + +void +spa_vdev_removal_destroy(spa_vdev_removal_t *svr) +{ + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(svr->svr_bytes_done[i]); + ASSERT0(svr->svr_max_offset_to_sync[i]); + range_tree_destroy(svr->svr_frees[i]); + list_destroy(&svr->svr_new_segments[i]); + } + + range_tree_destroy(svr->svr_allocd_segs); + mutex_destroy(&svr->svr_lock); + cv_destroy(&svr->svr_cv); + kmem_free(svr, sizeof (*svr)); +} + +/* + * This is called as a synctask in the txg in which we will mark this vdev + * as removing (in the config stored in the MOS). 
+ * + * It begins the evacuation of a toplevel vdev by: + * - initializing the spa_removing_phys which tracks this removal + * - computing the amount of space to remove for accounting purposes + * - dirtying all dbufs in the spa_config_object + * - creating the spa_vdev_removal + * - starting the spa_vdev_remove_thread + */ +static void +vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *vd = arg; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; + spa_vdev_removal_t *svr = NULL; + ASSERTV(uint64_t txg = dmu_tx_get_txg(tx)); + + ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + svr = spa_vdev_removal_create(vd); + + ASSERT(vd->vdev_removing); + ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); + + spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); + if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + /* + * By activating the OBSOLETE_COUNTS feature, we prevent + * the pool from being downgraded and ensure that the + * refcounts are precise. 
+ */ + spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + uint64_t one = 1; + VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1, + &one, tx)); + ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0); + } + + vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx); + vd->vdev_indirect_mapping = + vdev_indirect_mapping_open(mos, vic->vic_mapping_object); + vic->vic_births_object = vdev_indirect_births_alloc(mos, tx); + vd->vdev_indirect_births = + vdev_indirect_births_open(mos, vic->vic_births_object); + spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id; + spa->spa_removing_phys.sr_start_time = gethrestime_sec(); + spa->spa_removing_phys.sr_end_time = 0; + spa->spa_removing_phys.sr_state = DSS_SCANNING; + spa->spa_removing_phys.sr_to_copy = 0; + spa->spa_removing_phys.sr_copied = 0; + + /* + * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because + * there may be space in the defer tree, which is free, but still + * counted in vs_alloc. + */ + for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { + metaslab_t *ms = vd->vdev_ms[i]; + if (ms->ms_sm == NULL) + continue; + + /* + * Sync tasks happen before metaslab_sync(), therefore + * smp_alloc and sm_alloc must be the same. + */ + ASSERT3U(space_map_allocated(ms->ms_sm), ==, + ms->ms_sm->sm_phys->smp_alloc); + + spa->spa_removing_phys.sr_to_copy += + space_map_allocated(ms->ms_sm); + + /* + * Space which we are freeing this txg does not need to + * be copied. + */ + spa->spa_removing_phys.sr_to_copy -= + range_tree_space(ms->ms_freeingtree); + + ASSERT0(range_tree_space(ms->ms_freedtree)); + for (int t = 0; t < TXG_SIZE; t++) + ASSERT0(range_tree_space(ms->ms_alloctree[t])); + } + + /* + * Sync tasks are called before metaslab_sync(), so there should + * be no already-synced metaslabs in the TXG_CLEAN list. 
+ */ + ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL); + + spa_sync_removing_state(spa, tx); + + /* + * All blocks that we need to read the most recent mapping must be + * stored on concrete vdevs. Therefore, we must dirty anything that + * is read before spa_remove_init(). Specifically, the + * spa_config_object. (Note that although we already modified the + * spa_config_object in spa_sync_removing_state, that may not have + * modified all blocks of the object.) + */ + dmu_object_info_t doi; + VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi)); + for (uint64_t offset = 0; offset < doi.doi_max_offset; ) { + dmu_buf_t *dbuf; + VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT, + offset, FTAG, &dbuf, 0)); + dmu_buf_will_dirty(dbuf, tx); + offset += dbuf->db_size; + dmu_buf_rele(dbuf, FTAG); + } + + /* + * Now that we've allocated the im_object, dirty the vdev to ensure + * that the object gets written to the config on disk. + */ + vdev_config_dirty(vd); + + zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu " + "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx), + vic->vic_mapping_object); + + spa_history_log_internal(spa, "vdev remove started", tx, + "%s vdev %llu %s", spa_name(spa), vd->vdev_id, + (vd->vdev_path != NULL) ? vd->vdev_path : "-"); + /* + * Setting spa_vdev_removal causes subsequent frees to call + * free_from_removing_vdev(). Note that we don't need any locking + * because we are the sync thread, and metaslab_free_impl() is only + * called from syncing context (potentially from a zio taskq thread, + * but in any case only when there are outstanding free i/os, which + * there are not). 
+ */ + ASSERT3P(spa->spa_vdev_removal, ==, NULL); + spa->spa_vdev_removal = svr; + svr->svr_thread = thread_create(NULL, 0, + spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri); +} + +/* + * When we are opening a pool, we must read the mapping for each + * indirect vdev in order from most recently removed to least + * recently removed. We do this because the blocks for the mapping + * of older indirect vdevs may be stored on more recently removed vdevs. + * In order to read each indirect mapping object, we must have + * initialized all more recently removed vdevs. + */ +int +spa_remove_init(spa_t *spa) +{ + int error; + + error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_REMOVING, sizeof (uint64_t), + sizeof (spa->spa_removing_phys) / sizeof (uint64_t), + &spa->spa_removing_phys); + + if (error == ENOENT) { + spa->spa_removing_phys.sr_state = DSS_NONE; + spa->spa_removing_phys.sr_removing_vdev = -1; + spa->spa_removing_phys.sr_prev_indirect_vdev = -1; + return (0); + } else if (error != 0) { + return (error); + } + + if (spa->spa_removing_phys.sr_state == DSS_SCANNING) { + /* + * We are currently removing a vdev. Create and + * initialize a spa_vdev_removal_t from the bonus + * buffer of the removing vdevs vdev_im_object, and + * initialize its partial mapping. 
+ */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *vd = vdev_lookup_top(spa, + spa->spa_removing_phys.sr_removing_vdev); + spa_config_exit(spa, SCL_STATE, FTAG); + + if (vd == NULL) + return (EINVAL); + + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + ASSERT(vdev_is_concrete(vd)); + spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); + ASSERT(svr->svr_vdev->vdev_removing); + + vd->vdev_indirect_mapping = vdev_indirect_mapping_open( + spa->spa_meta_objset, vic->vic_mapping_object); + vd->vdev_indirect_births = vdev_indirect_births_open( + spa->spa_meta_objset, vic->vic_births_object); + + spa->spa_vdev_removal = svr; + } + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + uint64_t indirect_vdev_id = + spa->spa_removing_phys.sr_prev_indirect_vdev; + while (indirect_vdev_id != UINT64_MAX) { + vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + vd->vdev_indirect_mapping = vdev_indirect_mapping_open( + spa->spa_meta_objset, vic->vic_mapping_object); + vd->vdev_indirect_births = vdev_indirect_births_open( + spa->spa_meta_objset, vic->vic_births_object); + + indirect_vdev_id = vic->vic_prev_indirect_vdev; + } + spa_config_exit(spa, SCL_STATE, FTAG); + + /* + * Now that we've loaded all the indirect mappings, we can allow + * reads from other blocks (e.g. via predictive prefetch). + */ + spa->spa_indirect_vdevs_loaded = B_TRUE; + return (0); +} + +void +spa_restart_removal(spa_t *spa) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + + if (svr == NULL) + return; + + /* + * In general when this function is called there is no + * removal thread running. The only scenario where this + * is not true is during spa_import() where this function + * is called twice [once from spa_import_impl() and + * spa_async_resume()]. 
Thus, in the scenario where we + * import a pool that has an ongoing removal we don't + * want to spawn a second thread. + */ + if (svr->svr_thread != NULL) + return; + + if (!spa_writeable(spa)) + return; + + vdev_t *vd = svr->svr_vdev; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + ASSERT3P(vd, !=, NULL); + ASSERT(vd->vdev_removing); + + zfs_dbgmsg("restarting removal of %llu at count=%llu", + vd->vdev_id, vdev_indirect_mapping_num_entries(vim)); + svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd, + 0, &p0, TS_RUN, minclsyspri); +} + +/* + * Process freeing from a device which is in the middle of being removed. + * We must handle this carefully so that we attempt to copy freed data, + * and we correctly free already-copied data. + */ +void +free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size, + uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t max_offset_yet = 0; + + ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); + ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, + vdev_indirect_mapping_object(vim)); + ASSERT3P(vd, ==, svr->svr_vdev); + ASSERT3U(spa_syncing_txg(spa), ==, txg); + + mutex_enter(&svr->svr_lock); + + /* + * Remove the segment from the removing vdev's spacemap. This + * ensures that we will not attempt to copy this space (if the + * removal thread has not yet visited it), and also ensures + * that we know what is actually allocated on the new vdevs + * (needed if we cancel the removal). + * + * Note: we must do the metaslab_free_concrete() with the svr_lock + * held, so that the remove_thread can not load this metaslab and then + * visit this offset between the time that we metaslab_free_concrete() + * and when we check to see if it has been visited. 
+ */ + metaslab_free_concrete(vd, offset, size, txg); + + uint64_t synced_size = 0; + uint64_t synced_offset = 0; + uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim); + if (offset < max_offset_synced) { + /* + * The mapping for this offset is already on disk. + * Free from the new location. + * + * Note that we use svr_max_synced_offset because it is + * updated atomically with respect to the in-core mapping. + * By contrast, vim_max_offset is not. + * + * This block may be split between a synced entry and an + * in-flight or unvisited entry. Only process the synced + * portion of it here. + */ + synced_size = MIN(size, max_offset_synced - offset); + synced_offset = offset; + + ASSERT3U(max_offset_yet, <=, max_offset_synced); + max_offset_yet = max_offset_synced; + + DTRACE_PROBE3(remove__free__synced, + spa_t *, spa, + uint64_t, offset, + uint64_t, synced_size); + + size -= synced_size; + offset += synced_size; + } + + /* + * Look at all in-flight txgs starting from the currently syncing one + * and see if a section of this free is being copied. By starting from + * this txg and iterating forward, we might find that this region + * was copied in two different txgs and handle it appropriately. + */ + for (int i = 0; i < TXG_CONCURRENT_STATES; i++) { + int txgoff = (txg + i) & TXG_MASK; + if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) { + /* + * The mapping for this offset is in flight, and + * will be synced in txg+i. + */ + uint64_t inflight_size = MIN(size, + svr->svr_max_offset_to_sync[txgoff] - offset); + + DTRACE_PROBE4(remove__free__inflight, + spa_t *, spa, + uint64_t, offset, + uint64_t, inflight_size, + uint64_t, txg + i); + + /* + * We copy data in order of increasing offset. + * Therefore the max_offset_to_sync[] must increase + * (or be zero, indicating that nothing is being + * copied in that txg). 
+ */ + if (svr->svr_max_offset_to_sync[txgoff] != 0) { + ASSERT3U(svr->svr_max_offset_to_sync[txgoff], + >=, max_offset_yet); + max_offset_yet = + svr->svr_max_offset_to_sync[txgoff]; + } + + /* + * We've already committed to copying this segment: + * we have allocated space elsewhere in the pool for + * it and have an IO outstanding to copy the data. We + * cannot free the space before the copy has + * completed, or else the copy IO might overwrite any + * new data. To free that space, we record the + * segment in the appropriate svr_frees tree and free + * the mapped space later, in the txg where we have + * completed the copy and synced the mapping (see + * vdev_mapping_sync). + */ + range_tree_add(svr->svr_frees[txgoff], + offset, inflight_size); + size -= inflight_size; + offset += inflight_size; + + /* + * This space is already accounted for as being + * done, because it is being copied in txg+i. + * However, if i!=0, then it is being copied in + * a future txg. If we crash after this txg + * syncs but before txg+i syncs, then the space + * will be free. Therefore we must account + * for the space being done in *this* txg + * (when it is freed) rather than the future txg + * (when it will be copied). + */ + ASSERT3U(svr->svr_bytes_done[txgoff], >=, + inflight_size); + svr->svr_bytes_done[txgoff] -= inflight_size; + svr->svr_bytes_done[txg & TXG_MASK] += inflight_size; + } + } + ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]); + + if (size > 0) { + /* + * The copy thread has not yet visited this offset. Ensure + * that it doesn't. + */ + + DTRACE_PROBE3(remove__free__unvisited, + spa_t *, spa, + uint64_t, offset, + uint64_t, size); + + if (svr->svr_allocd_segs != NULL) + range_tree_clear(svr->svr_allocd_segs, offset, size); + + /* + * Since we now do not need to copy this data, for + * accounting purposes we have done our job and can count + * it as completed. 
+ */ + svr->svr_bytes_done[txg & TXG_MASK] += size; + } + mutex_exit(&svr->svr_lock); + + /* + * Now that we have dropped svr_lock, process the synced portion + * of this free. + */ + if (synced_size > 0) { + vdev_indirect_mark_obsolete(vd, synced_offset, synced_size, + txg); + /* + * Note: this can only be called from syncing context, + * and the vdev_indirect_mapping is only changed from the + * sync thread, so we don't need svr_lock while doing + * metaslab_free_impl_cb. + */ + vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, + metaslab_free_impl_cb, &txg); + } +} + +/* + * Stop an active removal and update the spa_removing phys. + */ +static void +spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa)); + + /* Ensure the removal thread has completed before we free the svr. */ + spa_vdev_remove_suspend(spa); + + ASSERT(state == DSS_FINISHED || state == DSS_CANCELED); + + if (state == DSS_FINISHED) { + spa_removing_phys_t *srp = &spa->spa_removing_phys; + vdev_t *vd = svr->svr_vdev; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + if (srp->sr_prev_indirect_vdev != UINT64_MAX) { + vdev_t *pvd; + pvd = vdev_lookup_top(spa, + srp->sr_prev_indirect_vdev); + ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops); + } + + vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev; + srp->sr_prev_indirect_vdev = vd->vdev_id; + } + spa->spa_removing_phys.sr_state = state; + spa->spa_removing_phys.sr_end_time = gethrestime_sec(); + + spa->spa_vdev_removal = NULL; + spa_vdev_removal_destroy(svr); + + spa_sync_removing_state(spa, tx); + + vdev_config_dirty(spa->spa_root_vdev); +} + +static void +free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) +{ + vdev_t *vd = arg; + vdev_indirect_mark_obsolete(vd, offset, size, + vd->vdev_spa->spa_syncing_txg); + vdev_indirect_ops.vdev_op_remap(vd, offset, size, + 
metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg); +} + +/* + * On behalf of the removal thread, syncs an incremental bit more of + * the indirect mapping to disk and updates the in-memory mapping. + * Called as a sync task in every txg that the removal thread makes progress. + */ +static void +vdev_mapping_sync(void *arg, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = svr->svr_vdev; + ASSERTV(vdev_indirect_config_t *vic = &vd->vdev_indirect_config); + uint64_t txg = dmu_tx_get_txg(tx); + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + ASSERT(vic->vic_mapping_object != 0); + ASSERT3U(txg, ==, spa_syncing_txg(spa)); + + vdev_indirect_mapping_add_entries(vim, + &svr->svr_new_segments[txg & TXG_MASK], tx); + vdev_indirect_births_add_entry(vd->vdev_indirect_births, + vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx); + + /* + * Free the copied data for anything that was freed while the + * mapping entries were in flight. 
+ */ + mutex_enter(&svr->svr_lock); + range_tree_vacate(svr->svr_frees[txg & TXG_MASK], + free_mapped_segment_cb, vd); + ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, + vdev_indirect_mapping_max_offset(vim)); + svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0; + mutex_exit(&svr->svr_lock); + + spa_sync_removing_state(spa, tx); +} + +static void +spa_vdev_copy_segment_write_done(zio_t *zio) +{ + vdev_copy_seg_arg_t *vcsa = zio->io_private; + vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg; + spa_config_exit(zio->io_spa, SCL_STATE, FTAG); + abd_free(zio->io_abd); + + mutex_enter(&vca->vca_lock); + vca->vca_outstanding_bytes -= zio->io_size; + cv_signal(&vca->vca_cv); + mutex_exit(&vca->vca_lock); + + ASSERT0(zio->io_error); + kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t)); + kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t)); +} + +static void +spa_vdev_copy_segment_read_done(zio_t *zio) +{ + vdev_copy_seg_arg_t *vcsa = zio->io_private; + dva_t *dest_dva = vcsa->vcsa_dest_dva; + uint64_t txg = vcsa->vcsa_txg; + spa_t *spa = zio->io_spa; + ASSERTV(vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva))); + blkptr_t *bp = NULL; + dva_t *dva = NULL; + uint64_t size = zio->io_size; + + ASSERT3P(dest_vd, !=, NULL); + ASSERT0(zio->io_error); + + vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); + bp = vcsa->vcsa_dest_bp; + dva = bp->blk_dva; + + BP_ZERO(bp); + + /* initialize with dest_dva */ + bcopy(dest_dva, dva, sizeof (dva_t)); + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + + BP_SET_LSIZE(bp, size); + BP_SET_PSIZE(bp, size); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa, + txg, bp, zio->io_abd, size, + spa_vdev_copy_segment_write_done, vcsa, + ZIO_PRIORITY_REMOVAL, 0, NULL)); +} + +static int +spa_vdev_copy_segment(vdev_t *vd, 
uint64_t start, uint64_t size, uint64_t txg, + vdev_copy_arg_t *vca, zio_alloc_list_t *zal) +{ + metaslab_group_t *mg = vd->vdev_mg; + spa_t *spa = vd->vdev_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_indirect_mapping_entry_t *entry; + vdev_copy_seg_arg_t *private; + dva_t dst = {{ 0 }}; + blkptr_t blk, *bp = &blk; + dva_t *dva = bp->blk_dva; + + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + + int error = metaslab_alloc_dva(spa, mg->mg_class, size, + &dst, 0, NULL, txg, 0, zal); + if (error != 0) + return (error); + + /* + * We can't have any padding of the allocated size, otherwise we will + * misunderstand what's allocated, and the size of the mapping. + * The caller ensures this will be true by passing in a size that is + * aligned to the worst (highest) ashift in the pool. + */ + ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); + + mutex_enter(&vca->vca_lock); + vca->vca_outstanding_bytes += size; + mutex_exit(&vca->vca_lock); + + entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); + DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); + entry->vime_mapping.vimep_dst = dst; + + private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP); + private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; + private->vcsa_txg = txg; + private->vcsa_copy_arg = vca; + + /* + * This lock is eventually released by the donefunc for the + * zio_write_phys that finishes copying the data. + */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + /* + * Do logical I/O, letting the redundancy vdevs (like mirror) + * handle their own I/O instead of duplicating that code here. 
+ */ + BP_ZERO(bp); + + DVA_SET_VDEV(&dva[0], vd->vdev_id); + DVA_SET_OFFSET(&dva[0], start); + DVA_SET_GANG(&dva[0], 0); + DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size)); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + + BP_SET_LSIZE(bp, size); + BP_SET_PSIZE(bp, size); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, + bp, abd_alloc_for_io(size, B_FALSE), size, + spa_vdev_copy_segment_read_done, private, + ZIO_PRIORITY_REMOVAL, 0, NULL)); + + list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); + ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); + vdev_dirty(vd, 0, NULL, txg); + + return (0); +} + +/* + * Complete the removal of a toplevel vdev. This is called as a + * synctask in the same txg that we will sync out the new config (to the + * MOS object) which indicates that this vdev is indirect. 
+ */
+static void
+vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
+{
+	spa_vdev_removal_t *svr = arg;
+	vdev_t *vd = svr->svr_vdev;
+	spa_t *spa = vd->vdev_spa;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+	for (int i = 0; i < TXG_SIZE; i++) {
+		ASSERT0(svr->svr_bytes_done[i]);
+	}
+
+	ASSERT3U(spa->spa_removing_phys.sr_copied, ==,
+	    spa->spa_removing_phys.sr_to_copy);
+
+	vdev_destroy_spacemaps(vd, tx);
+
+	/* destroy the leaf zaps collected in svr_zaplist, if any */
+	ASSERT3P(svr->svr_zaplist, !=, NULL);
+	for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL);
+	    pair != NULL;
+	    pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) {
+		vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx);
+	}
+	fnvlist_free(svr->svr_zaplist);
+
+	spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx);
+	/* vd->vdev_path is not available here */
+	spa_history_log_internal(spa, "vdev remove completed", tx,
+	    "%s vdev %llu", spa_name(spa), vd->vdev_id);
+}
+/*
+ * Move the indirect state of vd (the vdev that was removed) to ivd (the
+ * indirect vdev taking its place): the indirect config is copied, and
+ * ownership of the open indirect mapping, indirect births and, if
+ * present, the obsolete space map is handed over by moving the pointers
+ * and clearing them in vd. Both obsolete_segments trees must be empty.
+ */
+static void
+vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd)
+{
+	ivd->vdev_indirect_config = vd->vdev_indirect_config;
+
+	ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL);
+	ASSERT(vd->vdev_indirect_mapping != NULL);
+	ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping;
+	vd->vdev_indirect_mapping = NULL;
+
+	ASSERT3P(ivd->vdev_indirect_births, ==, NULL);
+	ASSERT(vd->vdev_indirect_births != NULL);
+	ivd->vdev_indirect_births = vd->vdev_indirect_births;
+	vd->vdev_indirect_births = NULL;
+
+	ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
+	ASSERT0(range_tree_space(ivd->vdev_obsolete_segments));
+
+	if (vd->vdev_obsolete_sm != NULL) {
+		ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize);
+
+		/*
+		 * We cannot use space_map_{open,close} because we hold all
+		 * the config locks as writer. Hand the already-open space
+		 * map over by pointer instead.
+		 */
+		ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL);
+		ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm;
+		vd->vdev_obsolete_sm = NULL;
+	}
+}
+/*
+ * Recursively add the leaf ZAP object of vd and of every descendant to
+ * zlist. Each entry is keyed by VDEV_REMOVAL_ZAP_OBJS, a dash and the
+ * object number, with the object number as its value. vdevs whose
+ * vdev_leaf_zap is 0 contribute nothing. raidz vdevs are asserted
+ * against.
+ */
+static void
+vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
+{
+	ASSERT3P(zlist, !=, NULL);
+	ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
+
+	if (vd->vdev_leaf_zap != 0) {
+		char zkey[32];
+		(void) snprintf(zkey, sizeof (zkey), "%s-%llu",
+		    VDEV_REMOVAL_ZAP_OBJS, (u_longlong_t)vd->vdev_leaf_zap);
+		fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap);
+	}
+
+	for (uint64_t id = 0; id < vd->vdev_children; id++) {
+		vdev_remove_enlist_zaps(vd->vdev_child[id], zlist);
+	}
+}
+/*
+ * Swap the fully evacuated vdev vd for a new indirect vdev: collect the
+ * leaf ZAPs to destroy, insert an indirect parent above vd, detach vd
+ * from it, move the indirect state over, point svr_vdev at the indirect
+ * vdev, and queue vdev_remove_complete_sync() in txg. Finally clears
+ * svr_thread and wakes waiters on svr_cv, which marks the removal
+ * thread as exited.
+ */
+static void
+vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
+{
+	vdev_t *ivd;
+	dmu_tx_t *tx;
+	spa_t *spa = vd->vdev_spa;
+	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+
+	/*
+	 * First, build a list of leaf zaps to be destroyed.
+	 * This is passed to the sync context thread,
+	 * which does the actual unlinking.
+	 */
+	svr->svr_zaplist = fnvlist_alloc();
+	vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
+
+	ivd = vdev_add_parent(vd, &vdev_indirect_ops);
+
+	vd->vdev_leaf_zap = 0;
+
+	vdev_remove_child(ivd, vd);
+	vdev_compact_children(ivd);
+
+	vdev_indirect_state_transfer(ivd, vd);
+
+	svr->svr_vdev = ivd;
+
+	ASSERT(!ivd->vdev_removing);
+	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
+
+	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+	dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
+	    0, ZFS_SPACE_CHECK_NONE, tx);
+	dmu_tx_commit(tx);
+
+	/*
+	 * Indicate that this thread has exited.
+	 * After this, we can not use svr.
+	 */
+	mutex_enter(&svr->svr_lock);
+	svr->svr_thread = NULL;
+	cv_broadcast(&svr->svr_cv);
+	mutex_exit(&svr->svr_lock);
+}
+
+/*
+ * Complete the removal of a toplevel vdev. This is called in open
+ * context by the removal thread after we have copied all vdev's data.
+ */
+static void
+vdev_remove_complete(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	uint64_t txg;
+
+	/*
+	 * Wait for any deferred frees to be synced before we call
+	 * vdev_metaslab_fini()
+	 */
+	txg_wait_synced(spa->spa_dsl_pool, 0);
+
+	txg = spa_vdev_enter(spa);
+	zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
+	    vd->vdev_id, txg);
+
+	/*
+	 * Discard allocation state: tear down the metaslabs and the
+	 * metaslab group, after which the vdev must account for no space.
+	 */
+	if (vd->vdev_mg != NULL) {
+		vdev_metaslab_fini(vd);
+		metaslab_group_destroy(vd->vdev_mg);
+		vd->vdev_mg = NULL;
+	}
+	ASSERT0(vd->vdev_stat.vs_space);
+	ASSERT0(vd->vdev_stat.vs_dspace);
+
+	vdev_remove_replace_with_indirect(vd, txg);
+
+	/*
+	 * We now release the locks, allowing spa_sync to run and finish the
+	 * removal via vdev_remove_complete_sync in syncing context.
+	 * NULL is passed as the vdev here; vd itself is handed to the
+	 * second spa_vdev_exit() below.
+	 */
+	(void) spa_vdev_exit(spa, NULL, txg, 0);
+
+	/*
+	 * Top ZAP should have been transferred to the indirect vdev in
+	 * vdev_remove_replace_with_indirect.
+	 * NOTE(review): that function does not touch vdev_top_zap
+	 * directly; presumably vdev_add_parent() moves it. Confirm.
+	 */
+	ASSERT0(vd->vdev_top_zap);
+
+	/*
+	 * Leaf ZAP should have been zeroed in
+	 * vdev_remove_replace_with_indirect, after being queued for
+	 * destruction in svr_zaplist.
+	 */
+	ASSERT0(vd->vdev_leaf_zap);
+
+	txg = spa_vdev_enter(spa);
+	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+	/*
+	 * Request to update the config and the config cachefile.
+	 */
+	vdev_config_dirty(spa->spa_root_vdev);
+	(void) spa_vdev_exit(spa, vd, txg, 0);
+}
+
+/*
+ * Evacuates a segment of size at most max_alloc from the vdev
+ * via repeated calls to spa_vdev_copy_segment. If an allocation
+ * fails, the pool is probably too fragmented to handle such a
+ * large size, so decrease max_alloc so that the caller will not try
+ * this size again this txg.
+ */
+static void
+spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
+    uint64_t *max_alloc, dmu_tx_t *tx)
+{
+	uint64_t txg = dmu_tx_get_txg(tx);
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+	mutex_enter(&svr->svr_lock);
+
+	range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
+	if (rs == NULL) {
+		mutex_exit(&svr->svr_lock);
+		return;
+	}
+	uint64_t offset = rs->rs_start;
+	uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc);
+
+	range_tree_remove(svr->svr_allocd_segs, offset, length);
+
+	if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
+		dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
+		    svr, 0, ZFS_SPACE_CHECK_NONE, tx);
+	}
+
+	svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length;
+
+	/*
+	 * Note: this is the amount of *allocated* space
+	 * that we are taking care of each txg.
+	 */
+	svr->svr_bytes_done[txg & TXG_MASK] += length;
+
+	mutex_exit(&svr->svr_lock);
+
+	zio_alloc_list_t zal;
+	metaslab_trace_init(&zal);
+	uint64_t thismax = *max_alloc;
+	while (length > 0) {
+		uint64_t mylen = MIN(length, thismax);
+
+		int error = spa_vdev_copy_segment(svr->svr_vdev,
+		    offset, mylen, txg, vca, &zal);
+
+		if (error == ENOSPC) {
+			/*
+			 * Cut our segment in half, and don't try this
+			 * segment size again this txg. Note that the
+			 * allocation size must be aligned to the highest
+			 * ashift in the pool, so that the allocation will
+			 * not be padded out to a multiple of the ashift,
+			 * which could cause us to think that this mapping
+			 * is larger than we intended.
+			 */
+			ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
+			ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
+			thismax = P2ROUNDUP(mylen / 2,
+			    1 << spa->spa_max_ashift);
+			ASSERT3U(thismax, <, mylen);
+			/*
+			 * The minimum-size allocation can not fail.
+			 */
+			ASSERT3U(mylen, >, 1 << spa->spa_max_ashift);
+			*max_alloc = mylen - (1 << spa->spa_max_ashift);
+		} else {
+			ASSERT0(error);
+			length -= mylen;
+			offset += mylen;
+
+			/*
+			 * We've performed an allocation, so reset the
+			 * alloc trace list.
+			 */
+			metaslab_trace_fini(&zal);
+			metaslab_trace_init(&zal);
+		}
+	}
+	metaslab_trace_fini(&zal);
+}
+
+/*
+ * The removal thread operates in open context. It iterates over all
+ * allocated space in the vdev, by loading each metaslab's spacemap.
+ * For each contiguous segment of allocated space (capped per call at
+ * max_alloc, which starts at zfs_remove_max_segment;
+ * spa_vdev_copy_segment asserts each copy is at most
+ * SPA_MAXBLOCKSIZE), we:
+ *    - Allocate space for it on another vdev.
+ *    - Create a new mapping from the old location to the new location
+ *      (as a record in svr_new_segments).
+ *    - Initiate a read zio (zio_read on a synthesized block pointer)
+ *      to get the data off the removing disk.
+ *    - In the read zio's done callback, initiate a zio_rewrite to
+ *      write it to the new location.
+ * Note that all of this will take effect when a particular TXG syncs.
+ * The sync thread ensures that all the reads and writes for the syncing
+ * TXG have completed (see spa_txg_zio) and writes the new mappings to disk
+ * (see vdev_mapping_sync()).
+ *
+ * The thread ends in one of two ways: if svr_thread_exit was set it
+ * empties svr_allocd_segs, clears svr_thread and broadcasts svr_cv;
+ * otherwise it calls vdev_remove_complete().
+ */
+static void
+spa_vdev_remove_thread(void *arg)
+{
+	vdev_t *vd = arg;
+	spa_t *spa = vd->vdev_spa;
+	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+	vdev_copy_arg_t vca;
+	uint64_t max_alloc = zfs_remove_max_segment;
+	uint64_t last_txg = 0;
+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+	uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
+
+	ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
+	ASSERT(vdev_is_concrete(vd));
+	ASSERT(vd->vdev_removing);
+	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
+	ASSERT3P(svr->svr_vdev, ==, vd);
+	ASSERT(vim != NULL);
+
+	mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
+	vca.vca_outstanding_bytes = 0;
+
+	mutex_enter(&svr->svr_lock);
+
+	/*
+	 * Start from the mapping's max offset (start_offset) so we pick
+	 * up where we left off if we are restarting the removal after
+	 * opening the pool.
+	 */
+	uint64_t msi;
+	for (msi = start_offset >> vd->vdev_ms_shift;
+	    msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) {
+		metaslab_t *msp = vd->vdev_ms[msi];
+		ASSERT3U(msi, <=, vd->vdev_ms_count);
+
+		ASSERT0(range_tree_space(svr->svr_allocd_segs));
+
+		mutex_enter(&msp->ms_sync_lock);
+		mutex_enter(&msp->ms_lock);
+
+		/*
+		 * Assert nothing in flight -- ms_*tree is empty.
+		 */
+		for (int i = 0; i < TXG_SIZE; i++) {
+			ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+		}
+
+		/*
+		 * If the metaslab has ever been allocated from (ms_sm!=NULL),
+		 * read the allocated segments from the space map object
+		 * into svr_allocd_segs. Since we do this while holding
+		 * svr_lock and ms_sync_lock, concurrent frees (which
+		 * would have modified the space map) will wait for us
+		 * to finish loading the spacemap, and then take the
+		 * appropriate action (see free_from_removing_vdev()).
+		 */
+		if (msp->ms_sm != NULL) {
+			space_map_t *sm = NULL;
+
+			/*
+			 * We have to open a new space map here, because
+			 * ms_sm's sm_length and sm_alloc may not reflect
+			 * what's in the object contents, if we are in between
+			 * metaslab_sync() and metaslab_sync_done().
+			 */
+			VERIFY0(space_map_open(&sm,
+			    spa->spa_dsl_pool->dp_meta_objset,
+			    msp->ms_sm->sm_object, msp->ms_sm->sm_start,
+			    msp->ms_sm->sm_size, msp->ms_sm->sm_shift));
+			space_map_update(sm);
+			VERIFY0(space_map_load(sm, svr->svr_allocd_segs,
+			    SM_ALLOC));
+			space_map_close(sm);
+
+			range_tree_walk(msp->ms_freeingtree,
+			    range_tree_remove, svr->svr_allocd_segs);
+
+			/*
+			 * When we are resuming from a paused removal (i.e.
+			 * when importing a pool with a removal in progress),
+			 * discard any state that we have already processed.
+			 */
+			range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
+		}
+		mutex_exit(&msp->ms_lock);
+		mutex_exit(&msp->ms_sync_lock);
+
+		vca.vca_msp = msp;
+		zfs_dbgmsg("copying %llu segments for metaslab %llu",
+		    avl_numnodes(&svr->svr_allocd_segs->rt_root),
+		    msp->ms_id);
+
+		while (!svr->svr_thread_exit &&
+		    range_tree_space(svr->svr_allocd_segs) != 0) {
+
+			mutex_exit(&svr->svr_lock);
+
+			mutex_enter(&vca.vca_lock);
+			while (vca.vca_outstanding_bytes >
+			    zfs_remove_max_copy_bytes) {
+				cv_wait(&vca.vca_cv, &vca.vca_lock);
+			}
+			mutex_exit(&vca.vca_lock);
+
+			dmu_tx_t *tx =
+			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+			dmu_tx_hold_space(tx, SPA_MAXBLOCKSIZE);
+			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+			uint64_t txg = dmu_tx_get_txg(tx);
+
+			if (txg != last_txg)
+				max_alloc = zfs_remove_max_segment;
+			last_txg = txg;
+
+			spa_vdev_copy_impl(svr, &vca, &max_alloc, tx);
+
+			dmu_tx_commit(tx);
+			mutex_enter(&svr->svr_lock);
+		}
+	}
+
+	mutex_exit(&svr->svr_lock);
+	/*
+	 * Wait for all copies to finish before cleaning up the vca.
+	 */
+	txg_wait_synced(spa->spa_dsl_pool, 0);
+	ASSERT0(vca.vca_outstanding_bytes);
+
+	mutex_destroy(&vca.vca_lock);
+	cv_destroy(&vca.vca_cv);
+
+	if (svr->svr_thread_exit) {
+		mutex_enter(&svr->svr_lock);
+		range_tree_vacate(svr->svr_allocd_segs, NULL, NULL);
+		svr->svr_thread = NULL;
+		cv_broadcast(&svr->svr_cv);
+		mutex_exit(&svr->svr_lock);
+	} else {
+		ASSERT0(range_tree_space(svr->svr_allocd_segs));
+		vdev_remove_complete(vd);
+	}
+}
+/*
+ * Ask the removal thread, if any, to stop and wait until it has: sets
+ * svr_thread_exit, sleeps on svr_cv until svr_thread is NULL, then
+ * clears svr_thread_exit again. Does nothing when no removal is
+ * recorded in the spa.
+ */
+void
+spa_vdev_remove_suspend(spa_t *spa)
+{
+	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+
+	if (svr == NULL)
+		return;
+
+	mutex_enter(&svr->svr_lock);
+	svr->svr_thread_exit = B_TRUE;
+	while (svr->svr_thread != NULL)
+		cv_wait(&svr->svr_cv, &svr->svr_lock);
+	svr->svr_thread_exit = B_FALSE;
+	mutex_exit(&svr->svr_lock);
+}
+/*
+ * Check function paired with spa_vdev_remove_cancel_sync(): returns
+ * ENOTACTIVE when the pool has no removal in progress, 0 otherwise.
+ * arg is unused.
+ */
+/* ARGSUSED */
+static int
+spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+	if (spa->spa_vdev_removal == NULL)
+		return (ENOTACTIVE);
+	return (0);
+}
+
+/*
+ * Cancel a removal by freeing all entries from the partial mapping
+ * and marking the vdev as no longer being removing.
+ */ +/* ARGSUSED */ +static void +spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_t *vd = svr->svr_vdev; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + objset_t *mos = spa->spa_meta_objset; + + ASSERT3P(svr->svr_thread, ==, NULL); + + spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); + if (vdev_obsolete_counts_are_precise(vd)) { + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx)); + } + + if (vdev_obsolete_sm_object(vd) != 0) { + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT3U(vdev_obsolete_sm_object(vd), ==, + space_map_object(vd->vdev_obsolete_sm)); + + space_map_free(vd->vdev_obsolete_sm, tx); + VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); + space_map_close(vd->vdev_obsolete_sm); + vd->vdev_obsolete_sm = NULL; + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + } + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT(list_is_empty(&svr->svr_new_segments[i])); + ASSERT3U(svr->svr_max_offset_to_sync[i], <=, + vdev_indirect_mapping_max_offset(vim)); + } + + for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { + metaslab_t *msp = vd->vdev_ms[msi]; + + if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) + break; + + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + + mutex_enter(&msp->ms_lock); + + /* + * Assert nothing in flight -- ms_*tree is empty. 
+ */ + for (int i = 0; i < TXG_SIZE; i++) + ASSERT0(range_tree_space(msp->ms_alloctree[i])); + for (int i = 0; i < TXG_DEFER_SIZE; i++) + ASSERT0(range_tree_space(msp->ms_defertree[i])); + ASSERT0(range_tree_space(msp->ms_freedtree)); + + if (msp->ms_sm != NULL) { + /* + * Assert that the in-core spacemap has the same + * length as the on-disk one, so we can use the + * existing in-core spacemap to load it from disk. + */ + ASSERT3U(msp->ms_sm->sm_alloc, ==, + msp->ms_sm->sm_phys->smp_alloc); + ASSERT3U(msp->ms_sm->sm_length, ==, + msp->ms_sm->sm_phys->smp_objsize); + + mutex_enter(&svr->svr_lock); + VERIFY0(space_map_load(msp->ms_sm, + svr->svr_allocd_segs, SM_ALLOC)); + range_tree_walk(msp->ms_freeingtree, + range_tree_remove, svr->svr_allocd_segs); + + /* + * Clear everything past what has been synced, + * because we have not allocated mappings for it yet. + */ + uint64_t syncd = vdev_indirect_mapping_max_offset(vim); + range_tree_clear(svr->svr_allocd_segs, syncd, + msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd); + + mutex_exit(&svr->svr_lock); + } + mutex_exit(&msp->ms_lock); + + mutex_enter(&svr->svr_lock); + range_tree_vacate(svr->svr_allocd_segs, + free_mapped_segment_cb, vd); + mutex_exit(&svr->svr_lock); + } + + /* + * Note: this must happen after we invoke free_mapped_segment_cb, + * because it adds to the obsolete_segments. 
+ */ + range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); + + ASSERT3U(vic->vic_mapping_object, ==, + vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); + vdev_indirect_mapping_close(vd->vdev_indirect_mapping); + vd->vdev_indirect_mapping = NULL; + vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); + vic->vic_mapping_object = 0; + + ASSERT3U(vic->vic_births_object, ==, + vdev_indirect_births_object(vd->vdev_indirect_births)); + vdev_indirect_births_close(vd->vdev_indirect_births); + vd->vdev_indirect_births = NULL; + vdev_indirect_births_free(mos, vic->vic_births_object, tx); + vic->vic_births_object = 0; + + /* + * We may have processed some frees from the removing vdev in this + * txg, thus increasing svr_bytes_done; discard that here to + * satisfy the assertions in spa_vdev_removal_destroy(). + * Note that future txg's can not have any bytes_done, because + * future TXG's are only modified from open context, and we have + * already shut down the copying thread. + */ + svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0; + spa_finish_removal(spa, DSS_CANCELED, tx); + + vd->vdev_removing = B_FALSE; + vdev_config_dirty(vd); + + zfs_dbgmsg("canceled device removal for vdev %llu in %llu", + vd->vdev_id, dmu_tx_get_txg(tx)); + spa_history_log_internal(spa, "vdev remove canceled", tx, + "%s vdev %llu %s", spa_name(spa), + vd->vdev_id, (vd->vdev_path != NULL) ? 
vd->vdev_path : "-"); +} + +int +spa_vdev_remove_cancel(spa_t *spa) +{ + spa_vdev_remove_suspend(spa); + + if (spa->spa_vdev_removal == NULL) + return (ENOTACTIVE); + + uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id; + + int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, + spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE); + + if (error == 0) { + spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); + vdev_t *vd = vdev_lookup_top(spa, vdid); + metaslab_group_activate(vd->vdev_mg); + spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); + } + + return (error); +} + +/* + * Called every sync pass of every txg if there's a svr. + */ +void +svr_sync(spa_t *spa, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + /* + * This check is necessary so that we do not dirty the + * DIRECTORY_OBJECT via spa_sync_removing_state() when there + * is nothing to do. Dirtying it every time would prevent us + * from syncing-to-convergence. + */ + if (svr->svr_bytes_done[txgoff] == 0) + return; + + /* + * Update progress accounting. + */ + spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff]; + svr->svr_bytes_done[txgoff] = 0; + + spa_sync_removing_state(spa, tx); +} + +static void +vdev_remove_make_hole_and_free(vdev_t *vd) +{ + uint64_t id = vd->vdev_id; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + boolean_t last_vdev = (id == (rvd->vdev_children - 1)); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + vdev_free(vd); + + if (last_vdev) { + vdev_compact_children(rvd); + } else { + vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); + vdev_add_child(rvd, vd); + } + vdev_config_dirty(rvd); + + /* + * Reassess the health of our root vdev. + */ + vdev_reopen(rvd); +} + +/* + * Remove a log device. The config lock is held for the specified TXG. 
+ */ +static int +spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) +{ + metaslab_group_t *mg = vd->vdev_mg; + spa_t *spa = vd->vdev_spa; + int error = 0; + + ASSERT(vd->vdev_islog); + ASSERT(vd == vd->vdev_top); + + /* + * Stop allocating from this vdev. + */ + metaslab_group_passivate(mg); + + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + + /* + * Evacuate the device. We don't hold the config lock as writer + * since we need to do I/O but we do keep the + * spa_namespace_lock held. Once this completes the device + * should no longer have any blocks allocated on it. + */ + if (vd->vdev_islog) { + if (vd->vdev_stat.vs_alloc != 0) + error = spa_reset_logs(spa); + } + + *txg = spa_vdev_config_enter(spa); + + if (error != 0) { + metaslab_group_activate(mg); + return (error); + } + ASSERT0(vd->vdev_stat.vs_alloc); + + /* + * The evacuation succeeded. Remove any remaining MOS metadata + * associated with this vdev, and wait for these changes to sync. + */ + vd->vdev_removing = B_TRUE; + + vdev_dirty_leaves(vd, VDD_DTL, *txg); + vdev_config_dirty(vd); + + spa_history_log_internal(spa, "vdev remove", NULL, + "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id, + (vd->vdev_path != NULL) ? vd->vdev_path : "-"); + + spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); + + *txg = spa_vdev_config_enter(spa); + + sysevent_t *ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_DEV); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + /* The top ZAP should have been destroyed by vdev_remove_empty. */ + ASSERT0(vd->vdev_top_zap); + /* The leaf ZAP should have been destroyed by vdev_dtl_sync. 
*/ + ASSERT0(vd->vdev_leaf_zap); + + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + + if (list_link_active(&vd->vdev_state_dirty_node)) + vdev_state_clean(vd); + if (list_link_active(&vd->vdev_config_dirty_node)) + vdev_config_clean(vd); + + /* + * Clean up the vdev namespace. + */ + vdev_remove_make_hole_and_free(vd); + + if (ev != NULL) + spa_event_post(ev); + + return (0); +} + +static int +spa_vdev_remove_top_check(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + if (vd != vd->vdev_top) + return (SET_ERROR(ENOTSUP)); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) + return (SET_ERROR(ENOTSUP)); + + /* + * There has to be enough free space to remove the + * device and leave double the "slop" space (i.e. we + * must leave at least 3% of the pool free, in addition to + * the normal slop space). + */ + if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir, + NULL, 0, B_TRUE) < + vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { + return (SET_ERROR(ENOSPC)); + } + + /* + * There can not be a removal in progress. + */ + if (spa->spa_removing_phys.sr_state == DSS_SCANNING) + return (SET_ERROR(EBUSY)); + + /* + * The device must have all its data. + */ + if (!vdev_dtl_empty(vd, DTL_MISSING) || + !vdev_dtl_empty(vd, DTL_OUTAGE)) + return (SET_ERROR(EBUSY)); + + /* + * The device must be healthy. + */ + if (!vdev_readable(vd)) + return (SET_ERROR(EIO)); + + /* + * All vdevs in normal class must have the same ashift. + */ + if (spa->spa_max_ashift != spa->spa_min_ashift) { + return (SET_ERROR(EINVAL)); + } + + /* + * All vdevs in normal class must have the same ashift + * and not be raidz. 
+ */ + vdev_t *rvd = spa->spa_root_vdev; + int num_indirect = 0; + for (uint64_t id = 0; id < rvd->vdev_children; id++) { + vdev_t *cvd = rvd->vdev_child[id]; + if (cvd->vdev_ashift != 0 && !cvd->vdev_islog) + ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); + if (cvd->vdev_ops == &vdev_indirect_ops) + num_indirect++; + if (!vdev_is_concrete(cvd)) + continue; + if (cvd->vdev_ops == &vdev_raidz_ops) + return (SET_ERROR(EINVAL)); + /* + * Need the mirror to be mirror of leaf vdevs only + */ + if (cvd->vdev_ops == &vdev_mirror_ops) { + for (uint64_t cid = 0; + cid < cvd->vdev_children; cid++) { + if (!cvd->vdev_child[cid]->vdev_ops-> + vdev_op_leaf) + return (SET_ERROR(EINVAL)); + } + } + } + + return (0); +} + +/* + * Initiate removal of a top-level vdev, reducing the total space in the pool. + * The config lock is held for the specified TXG. Once initiated, + * evacuation of all allocated space (copying it to other vdevs) happens + * in the background (see spa_vdev_remove_thread()), and can be canceled + * (see spa_vdev_remove_cancel()). If successful, the vdev will + * be transformed to an indirect vdev (see spa_vdev_remove_complete()). + */ +static int +spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) +{ + spa_t *spa = vd->vdev_spa; + int error; + + /* + * Check for errors up-front, so that we don't waste time + * passivating the metaslab group and clearing the ZIL if there + * are errors. + */ + error = spa_vdev_remove_top_check(vd); + if (error != 0) + return (error); + + /* + * Stop allocating from this vdev. Note that we must check + * that this is not the only device in the pool before + * passivating, otherwise we will not be able to make + * progress because we can't allocate from any vdevs. + * The above check for sufficient free space serves this + * purpose. 
+ */ + metaslab_group_t *mg = vd->vdev_mg; + metaslab_group_passivate(mg); + + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + + /* + * We must ensure that no "stubby" log blocks are allocated + * on the device to be removed. These blocks could be + * written at any time, including while we are in the middle + * of copying them. + */ + error = spa_reset_logs(spa); + + *txg = spa_vdev_config_enter(spa); + + /* + * Things might have changed while the config lock was dropped + * (e.g. space usage). Check for errors again. + */ + if (error == 0) + error = spa_vdev_remove_top_check(vd); + + if (error != 0) { + metaslab_group_activate(mg); + return (error); + } + + vd->vdev_removing = B_TRUE; + + vdev_dirty_leaves(vd, VDD_DTL, *txg); + vdev_config_dirty(vd); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, + vdev_remove_initiate_sync, + vd, 0, ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + + return (0); +} + +/* + * Remove a device from the pool. + * + * Removing a device from the vdev namespace requires several steps + * and can take a significant amount of time. As a result we use + * the spa_vdev_config_[enter/exit] functions which allow us to + * grab and release the spa_config_lock while still holding the namespace + * lock. During each step the configuration is synced out. 
+ */ +int +spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) +{ + vdev_t *vd; + nvlist_t **spares, **l2cache, *nv; + uint64_t txg = 0; + uint_t nspares, nl2cache; + int error = 0; + boolean_t locked = MUTEX_HELD(&spa_namespace_lock); + sysevent_t *ev = NULL; + + ASSERT(spa_writeable(spa)); + + if (!locked) + txg = spa_vdev_enter(spa); + + vd = spa_lookup_by_guid(spa, guid, B_FALSE); + + if (spa->spa_spares.sav_vdevs != NULL && + nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && + (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { + /* + * Only remove the hot spare if it's not currently in use + * in this pool. + */ + if (vd == NULL || unspare) { + if (vd == NULL) + vd = spa_lookup_by_guid(spa, guid, B_TRUE); + ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_AUX); + + char *nvstr = fnvlist_lookup_string(nv, + ZPOOL_CONFIG_PATH); + spa_history_log_internal(spa, "vdev remove", NULL, + "%s vdev (%s) %s", spa_name(spa), + VDEV_TYPE_SPARE, nvstr); + spa_vdev_remove_aux(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares, nv); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } else { + error = SET_ERROR(EBUSY); + } + } else if (spa->spa_l2cache.sav_vdevs != NULL && + nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && + (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { + char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); + spa_history_log_internal(spa, "vdev remove", NULL, + "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr); + /* + * Cache devices can always be removed. 
+ */ + vd = spa_lookup_by_guid(spa, guid, B_TRUE); + ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); + spa_vdev_remove_aux(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); + spa_load_l2cache(spa); + spa->spa_l2cache.sav_sync = B_TRUE; + } else if (vd != NULL && vd->vdev_islog) { + ASSERT(!locked); + error = spa_vdev_remove_log(vd, &txg); + } else if (vd != NULL) { + ASSERT(!locked); + error = spa_vdev_remove_top(vd, &txg); + } else { + /* + * There is no vdev of any kind with the specified guid. + */ + error = SET_ERROR(ENOENT); + } + + if (!locked) + error = spa_vdev_exit(spa, NULL, txg, error); + + if (ev != NULL) + spa_event_post(ev); + + return (error); +} + +int +spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) +{ + prs->prs_state = spa->spa_removing_phys.sr_state; + + if (prs->prs_state == DSS_NONE) + return (SET_ERROR(ENOENT)); + + prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev; + prs->prs_start_time = spa->spa_removing_phys.sr_start_time; + prs->prs_end_time = spa->spa_removing_phys.sr_end_time; + prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy; + prs->prs_copied = spa->spa_removing_phys.sr_copied; + + if (spa->spa_vdev_removal != NULL) { + for (int i = 0; i < TXG_SIZE; i++) { + prs->prs_copied += + spa->spa_vdev_removal->svr_bytes_done[i]; + } + } + + prs->prs_mapping_memory = 0; + uint64_t indirect_vdev_id = + spa->spa_removing_phys.sr_prev_indirect_vdev; + while (indirect_vdev_id != -1) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id]; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + prs->prs_mapping_memory += vdev_indirect_mapping_size(vim); + indirect_vdev_id = vic->vic_prev_indirect_vdev; + } + + return (0); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(zfs_remove_max_segment, int, 0644); 
+MODULE_PARM_DESC(zfs_remove_max_segment, + "Largest contiguous segment to allocate when removing device"); + +EXPORT_SYMBOL(free_from_removing_vdev); +EXPORT_SYMBOL(spa_removal_get_stats); +EXPORT_SYMBOL(spa_remove_init); +EXPORT_SYMBOL(spa_restart_removal); +EXPORT_SYMBOL(spa_vdev_removal_destroy); +EXPORT_SYMBOL(spa_vdev_remove); +EXPORT_SYMBOL(spa_vdev_remove_cancel); +EXPORT_SYMBOL(spa_vdev_remove_suspend); +EXPORT_SYMBOL(svr_sync); +#endif diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index 5db157d74..8ac9ce187 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -118,6 +118,7 @@ vdev_ops_t vdev_root_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_ROOT, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 1e987dc88..b1ac149b3 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -195,6 +195,7 @@ #include <sys/zfeature.h> #include <sys/zcp.h> #include <sys/zio_checksum.h> +#include <sys/vdev_removal.h> #include <linux/miscdevice.h> #include <linux/slab.h> @@ -1050,6 +1051,14 @@ zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) /* ARGSUSED */ static int +zfs_secpolicy_remap(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_REMAP, cr)); +} + +/* ARGSUSED */ +static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair, *nextpair; @@ -1920,8 +1929,8 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) /* * inputs: * zc_name name of the pool - * zc_nvlist_conf nvlist of devices to remove - * zc_cookie to stop the remove? 
+ * zc_guid guid of vdev to remove + * zc_cookie cancel removal */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) @@ -1932,7 +1941,11 @@ zfs_ioc_vdev_remove(zfs_cmd_t *zc) error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); - error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); + if (zc->zc_cookie != 0) { + error = spa_vdev_remove_cancel(spa); + } else { + error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); + } spa_close(spa, FTAG); return (error); } @@ -2920,7 +2933,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { @@ -3395,6 +3408,17 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) return (error); } +/* ARGSUSED */ +static int +zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + if (strchr(fsname, '@') || + strchr(fsname, '%')) + return (SET_ERROR(EINVAL)); + + return (dmu_objset_remap_indirects(fsname)); +} + /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } @@ -6339,6 +6363,10 @@ zfs_ioctl_init(void) zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + zfs_ioctl_register("remap", ZFS_IOC_REMAP, + zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE); + zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index fd8debdcf..f7f2f6167 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1111,6 +1111,18 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) if (error == EALREADY) { 
lr->lr_common.lrc_txtype = TX_WRITE2; + /* + * TX_WRITE2 relies on the data previously + * written by the TX_WRITE that caused + * EALREADY. We zero out the BP because + * it is the old, currently-on-disk BP, + * so there's no need to zio_flush() its + * vdevs (flushing would needlessly hurt + * performance, and doesn't work on + * indirect vdevs). + */ + zgd->zgd_bp = NULL; + BP_ZERO(bp); error = 0; } } diff --git a/module/zfs/zil.c b/module/zfs/zil.c index d9ae1f413..d0a100252 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -3396,7 +3396,7 @@ zil_replaying(zilog_t *zilog, dmu_tx_t *tx) /* ARGSUSED */ int -zil_vdev_offline(const char *osname, void *arg) +zil_reset(const char *osname, void *arg) { int error; @@ -3419,7 +3419,6 @@ EXPORT_SYMBOL(zil_itx_create); EXPORT_SYMBOL(zil_itx_destroy); EXPORT_SYMBOL(zil_itx_assign); EXPORT_SYMBOL(zil_commit); -EXPORT_SYMBOL(zil_vdev_offline); EXPORT_SYMBOL(zil_claim); EXPORT_SYMBOL(zil_check_log_chain); EXPORT_SYMBOL(zil_sync); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index c6379bfd4..5259a0c6f 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1018,6 +1018,8 @@ void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { + zfs_blkptr_verify(spa, bp); + /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than @@ -1081,7 +1083,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, { zio_t *zio; - dprintf_bp(bp, "claiming in txg %llu", txg); + zfs_blkptr_verify(spa, bp); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); @@ -1200,8 +1202,26 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; - ASSERT(vd->vdev_parent == - (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); + /* + * vdev child I/Os do not propagate their error to the parent. 
+ * Therefore, for correct operation the caller *must* check for + * and handle the error in the child i/o's done callback. + * The only exceptions are i/os that we don't care about + * (OPTIONAL or REPAIR). + */ + ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || + done != NULL); + + /* + * In the common case, where the parent zio was to a normal vdev, + * the child zio must be to a child vdev of that vdev. Otherwise, + * the child zio must be to a top-level vdev. + */ + if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) { + ASSERT3P(vd->vdev_parent, ==, pio->io_vd); + } else { + ASSERT3P(vd, ==, vd->vdev_top); + } if (type == ZIO_TYPE_READ && bp != NULL) { /* @@ -1214,10 +1234,12 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } - if (vd->vdev_children == 0) + if (vd->vdev_ops->vdev_op_leaf) { + ASSERT0(vd->vdev_children); offset += VDEV_LABEL_START_SIZE; + } - flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; + flags |= ZIO_VDEV_CHILD_FLAGS(pio); /* * If we've decided to do a repair, the write is not speculative -- @@ -1318,6 +1340,8 @@ zio_read_bp_init(zio_t *zio) uint64_t psize = BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); + ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { @@ -1341,6 +1365,7 @@ zio_read_bp_init(zio_t *zio) abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); } if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) @@ -1614,6 +1639,8 @@ zio_free_bp_init(zio_t *zio) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } + ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); + return (ZIO_PIPELINE_CONTINUE); } @@ -3447,6 +3474,15 @@ zio_vdev_io_start(zio_t *zio) } ASSERT3P(zio->io_logical, !=, zio); + if (zio->io_type == ZIO_TYPE_WRITE && zio->io_vd->vdev_removing) { + /* + * Note: the code can handle other kinds of writes, + * but we don't expect them. + */ + ASSERT(zio->io_flags & + (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | + ZIO_FLAG_INDUCE_DAMAGE)); + } align = 1ULL << vd->vdev_top->vdev_ashift; |