diff options
Diffstat (limited to 'module')
39 files changed, 1094 insertions, 604 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c index d59dfe2d9..dd9f914b3 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -3491,6 +3491,7 @@ arc_fini(void) mutex_destroy(&arc_mru_ghost->arcs_mtx); mutex_destroy(&arc_mfu->arcs_mtx); mutex_destroy(&arc_mfu_ghost->arcs_mtx); + mutex_destroy(&arc_l2c_only->arcs_mtx); mutex_destroy(&zfs_write_limit_lock); @@ -4457,7 +4458,7 @@ l2arc_fini(void) void l2arc_start(void) { - if (!(spa_mode & FWRITE)) + if (!(spa_mode_global & FWRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, @@ -4467,7 +4468,7 @@ l2arc_start(void) void l2arc_stop(void) { - if (!(spa_mode & FWRITE)) + if (!(spa_mode_global & FWRITE)) return; mutex_enter(&l2arc_feed_thr_lock); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 512401470..197284e1e 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -119,7 +119,7 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh) * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). 
*/ - if (claim_txg == 0 && (spa_mode & FWRITE)) + if (claim_txg == 0 && spa_writeable(td->td_spa)) return; zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index e77834d60..8686ab983 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -56,6 +56,8 @@ dnode_cons(void *arg, void *unused, int kmflag) rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); + refcount_create(&dn->dn_holds); refcount_create(&dn->dn_tx_holds); @@ -84,6 +86,7 @@ dnode_dest(void *arg, void *unused) rw_destroy(&dn->dn_struct_rwlock); mutex_destroy(&dn->dn_mtx); mutex_destroy(&dn->dn_dbufs_mtx); + cv_destroy(&dn->dn_notxholds); refcount_destroy(&dn->dn_holds); refcount_destroy(&dn->dn_tx_holds); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 93ea8aa11..e488b2bdd 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -1948,6 +1948,9 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) if (ds->ds_phys->ds_next_snap_obj) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; + } else { + stat->dds_is_snapshot = B_FALSE; + stat->dds_num_clones = 0; } /* clone origin is really a dsl_dir thing... 
*/ @@ -1959,6 +1962,8 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); dsl_dataset_name(ods, stat->dds_origin); dsl_dataset_drop_ref(ods, FTAG); + } else { + stat->dds_origin[0] = '\0'; } rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } diff --git a/module/zfs/dsl_scrub.c b/module/zfs/dsl_scrub.c index 950a91f78..dbdfe8c75 100644 --- a/module/zfs/dsl_scrub.c +++ b/module/zfs/dsl_scrub.c @@ -391,7 +391,7 @@ traverse_zil(dsl_pool_t *dp, zil_header_t *zh) * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). */ - if (claim_txg == 0 && (spa_mode & FWRITE)) + if (claim_txg == 0 && spa_writeable(dp->dp_spa)) return; zilog = zil_alloc(dp->dp_meta_objset, zh); @@ -409,9 +409,6 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, int err; arc_buf_t *buf = NULL; - if (bp->blk_birth == 0) - return; - if (bp->blk_birth <= dp->dp_scrub_min_txg) return; @@ -740,6 +737,7 @@ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) { + spa_t *spa = dp->dp_spa; zap_cursor_t zc; zap_attribute_t za; boolean_t complete = B_TRUE; @@ -747,8 +745,10 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (dp->dp_scrub_func == SCRUB_FUNC_NONE) return; - /* If the spa is not fully loaded, don't bother. */ - if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE) + /* + * If the pool is not loaded, or is trying to unload, leave it alone. 
+ */ + if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa)) return; if (dp->dp_scrub_restart) { @@ -757,13 +757,13 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); } - if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { + if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { /* * We must have resumed after rebooting; reset the vdev * stats to know that we're doing a scrub (although it * will think we're just starting now). */ - vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, + vdev_scrub_stat_update(spa->spa_root_vdev, dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : POOL_SCRUB_EVERYTHING, B_FALSE); } @@ -771,7 +771,7 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) dp->dp_scrub_pausing = B_FALSE; dp->dp_scrub_start_time = lbolt64; dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); - dp->dp_spa->spa_scrub_active = B_TRUE; + spa->spa_scrub_active = B_TRUE; if (dp->dp_scrub_bookmark.zb_objset == 0) { /* First do the MOS & ORIGIN */ @@ -779,8 +779,8 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (dp->dp_scrub_pausing) goto out; - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { - VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, + if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { + VERIFY(0 == dmu_objset_find_spa(spa, NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); } else { scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); @@ -830,15 +830,13 @@ out: VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &dp->dp_spa->spa_scrub_errors, tx)); + &spa->spa_scrub_errors, tx)); /* XXX this is scrub-clean specific */ - mutex_enter(&dp->dp_spa->spa_scrub_lock); - while (dp->dp_spa->spa_scrub_inflight > 0) { - cv_wait(&dp->dp_spa->spa_scrub_io_cv, - &dp->dp_spa->spa_scrub_lock); - } - mutex_exit(&dp->dp_spa->spa_scrub_lock); + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > 0) + 
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + mutex_exit(&spa->spa_scrub_lock); } void @@ -920,13 +918,17 @@ static int dsl_pool_scrub_clean_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_t *zb) { - size_t size = BP_GET_LSIZE(bp); - int d; + size_t size = BP_GET_PSIZE(bp); spa_t *spa = dp->dp_spa; boolean_t needs_io; - int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; + int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; int zio_priority; + ASSERT(bp->blk_birth > dp->dp_scrub_min_txg); + + if (bp->blk_birth >= dp->dp_scrub_max_txg) + return (0); + count_block(dp->dp_blkstats, bp); if (dp->dp_scrub_isresilver == 0) { @@ -945,7 +947,7 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp, if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) zio_flags |= ZIO_FLAG_SPECULATIVE; - for (d = 0; d < BP_GET_NDVAS(bp); d++) { + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[d])); @@ -963,16 +965,17 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp, if (DVA_GET_GANG(&bp->blk_dva[d])) { /* * Gang members may be spread across multiple - * vdevs, so the best we can do is look at the - * pool-wide DTL. + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. * XXX -- it would be better to change our - * allocation policy to ensure that this can't - * happen. + * allocation policy to ensure that all + * gang members reside on the same vdev. 
*/ - vd = spa->spa_root_vdev; + needs_io = B_TRUE; + } else { + needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, + bp->blk_birth, 1); } - needs_io = vdev_dtl_contains(&vd->vdev_dtl_map, - bp->blk_birth, 1); } } diff --git a/module/zfs/include/sys/spa.h b/module/zfs/include/sys/spa.h index 24b3ca447..519b1d0c0 100644 --- a/module/zfs/include/sys/spa.h +++ b/module/zfs/include/sys/spa.h @@ -332,7 +332,8 @@ extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props); extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); -extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force); +extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, + boolean_t hardforce); extern int spa_reset(char *pool); extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag); @@ -351,7 +352,8 @@ extern void spa_inject_delref(spa_t *spa); extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing); -extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done); +extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, + int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); @@ -475,6 +477,8 @@ extern boolean_t spa_has_spare(spa_t *, uint64_t guid); extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); +extern boolean_t spa_writeable(spa_t *spa); +extern int spa_mode(spa_t *spa); /* history logging */ typedef enum history_log_type { @@ -545,7 +549,7 @@ _NOTE(CONSTCOND) } while (0) #define dprintf_bp(bp, fmt, ...) #endif -extern int spa_mode; /* mode, e.g. 
FREAD | FWRITE */ +extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ #ifdef __cplusplus } diff --git a/module/zfs/include/sys/spa_impl.h b/module/zfs/include/sys/spa_impl.h index 8aeb414fe..588b4f5a9 100644 --- a/module/zfs/include/sys/spa_impl.h +++ b/module/zfs/include/sys/spa_impl.h @@ -170,6 +170,7 @@ struct spa { boolean_t spa_import_faulted; /* allow faulted vdevs */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ + int spa_mode; /* FREAD | FWRITE */ spa_log_state_t spa_log_state; /* log state */ /* * spa_refcnt & spa_config_lock must be the last elements diff --git a/module/zfs/include/sys/space_map.h b/module/zfs/include/sys/space_map.h index db9daef1f..8d7860660 100644 --- a/module/zfs/include/sys/space_map.h +++ b/module/zfs/include/sys/space_map.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_SPACE_MAP_H #define _SYS_SPACE_MAP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/avl.h> #include <sys/dmu.h> @@ -58,6 +56,12 @@ typedef struct space_seg { uint64_t ss_end; /* ending offset (non-inclusive) */ } space_seg_t; +typedef struct space_ref { + avl_node_t sr_node; /* AVL node */ + uint64_t sr_offset; /* offset (start or end) */ + int64_t sr_refcnt; /* associated reference count */ +} space_ref_t; + typedef struct space_map_obj { uint64_t smo_object; /* on-disk space map object */ uint64_t smo_objsize; /* size of the object */ @@ -133,13 +137,12 @@ extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size, extern void space_map_destroy(space_map_t *sm); extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); -extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size); +extern boolean_t space_map_contains(space_map_t *sm, + uint64_t start, uint64_t size); extern void space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest); extern void space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest); -extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size); -extern void space_map_union(space_map_t *smd, space_map_t *sms); extern void space_map_load_wait(space_map_t *sm); extern int space_map_load(space_map_t *sm, space_map_ops_t *ops, @@ -155,6 +158,15 @@ extern void space_map_sync(space_map_t *sm, uint8_t maptype, extern void space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); +extern void space_map_ref_create(avl_tree_t *t); +extern void space_map_ref_destroy(avl_tree_t *t); +extern void space_map_ref_add_seg(avl_tree_t *t, + uint64_t start, uint64_t end, int64_t refcnt); +extern void space_map_ref_add_map(avl_tree_t *t, + space_map_t *sm, int64_t refcnt); +extern void space_map_ref_generate_map(avl_tree_t *t, + space_map_t *sm, 
int64_t minref); + #ifdef __cplusplus } #endif diff --git a/module/zfs/include/sys/uberblock_impl.h b/module/zfs/include/sys/uberblock_impl.h index 55a0dd5ae..b49df8ae0 100644 --- a/module/zfs/include/sys/uberblock_impl.h +++ b/module/zfs/include/sys/uberblock_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_UBERBLOCK_IMPL_H #define _SYS_UBERBLOCK_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/uberblock.h> #ifdef __cplusplus @@ -35,6 +33,11 @@ extern "C" { #endif /* + * For zdb use and debugging purposes only + */ +extern uint64_t ub_max_txg; + +/* * The uberblock version is incremented whenever an incompatible on-disk * format change is made to the SPA, DMU, or ZAP. * diff --git a/module/zfs/include/sys/vdev.h b/module/zfs/include/sys/vdev.h index c070d6f3d..b8313a920 100644 --- a/module/zfs/include/sys/vdev.h +++ b/module/zfs/include/sys/vdev.h @@ -36,6 +36,14 @@ extern "C" { #endif +typedef enum vdev_dtl_type { + DTL_MISSING, /* 0% replication: no copies of the data */ + DTL_PARTIAL, /* less than 100% replication: some copies missing */ + DTL_SCRUB, /* unable to fully repair during scrub/resilver */ + DTL_OUTAGE, /* temporarily missing (used to attempt detach) */ + DTL_TYPES +} vdev_dtl_type_t; + extern boolean_t zfs_nocacheflush; extern int vdev_open(vdev_t *); @@ -50,10 +58,14 @@ extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); extern boolean_t vdev_is_bootable(vdev_t *vd); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); -extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size); -extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size); +extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, + uint64_t txg, uint64_t size); +extern boolean_t 
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, + uint64_t txg, uint64_t size); +extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done); +extern boolean_t vdev_dtl_required(vdev_t *vd); extern boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp); diff --git a/module/zfs/include/sys/vdev_impl.h b/module/zfs/include/sys/vdev_impl.h index 26904d089..652349341 100644 --- a/module/zfs/include/sys/vdev_impl.h +++ b/module/zfs/include/sys/vdev_impl.h @@ -123,8 +123,7 @@ struct vdev { vdev_t *vdev_parent; /* parent vdev */ vdev_t **vdev_child; /* array of children */ uint64_t vdev_children; /* number of children */ - space_map_t vdev_dtl_map; /* dirty time log in-core state */ - space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */ + space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */ vdev_stat_t vdev_stat; /* virtual device statistics */ /* @@ -149,7 +148,7 @@ struct vdev { * Leaf vdev state. 
*/ uint64_t vdev_psize; /* physical device capacity */ - space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ + space_map_obj_t vdev_dtl_smo; /* dirty time log space map obj */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ uint64_t vdev_offline; /* persistent offline state */ diff --git a/module/zfs/include/sys/zfs_vfsops.h b/module/zfs/include/sys/zfs_vfsops.h index 87b75e6e7..7e0440be4 100644 --- a/module/zfs/include/sys/zfs_vfsops.h +++ b/module/zfs/include/sys/zfs_vfsops.h @@ -26,8 +26,6 @@ #ifndef _SYS_FS_ZFS_VFSOPS_H #define _SYS_FS_ZFS_VFSOPS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/isa_defs.h> #include <sys/types32.h> #include <sys/list.h> @@ -49,7 +47,6 @@ struct zfsvfs { uint64_t z_root; /* id of root znode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ - uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */ uint64_t z_fuid_obj; /* fuid table object number */ uint64_t z_fuid_size; /* fuid table size */ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ @@ -74,6 +71,7 @@ struct zfsvfs { boolean_t z_issnap; /* true if this is a snapshot */ boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ + boolean_t z_replay; /* set during ZIL replay */ kmutex_t z_online_recv_lock; /* recv in prog grabs as WRITER */ uint64_t z_version; /* ZPL version */ #define ZFS_OBJ_MTX_SZ 64 diff --git a/module/zfs/include/sys/zil.h b/module/zfs/include/sys/zil.h index 4d02d14f7..b69323cfa 100644 --- a/module/zfs/include/sys/zil.h +++ b/module/zfs/include/sys/zil.h @@ -335,7 +335,6 @@ typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, uint64_t txg); typedef int zil_replay_func_t(); -typedef void zil_replay_cleaner_t(); typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, 
zio_t *zio); extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, @@ -350,9 +349,8 @@ extern void zil_free(zilog_t *zilog); extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); extern void zil_close(zilog_t *zilog); -extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE], - zil_replay_cleaner_t *replay_cleaner); +extern void zil_replay(objset_t *os, void *arg, + zil_replay_func_t *replay_func[TX_MAX_TYPE]); extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); diff --git a/module/zfs/include/sys/zil_impl.h b/module/zfs/include/sys/zil_impl.h index 0fc800b96..3f2582931 100644 --- a/module/zfs/include/sys/zil_impl.h +++ b/module/zfs/include/sys/zil_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_ZIL_IMPL_H #define _SYS_ZIL_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zil.h> #include <sys/dmu_objset.h> @@ -74,13 +72,14 @@ struct zilog { uint64_t zl_commit_seq; /* committed upto this number */ uint64_t zl_lr_seq; /* log record sequence number */ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ - uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */ + uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */ + uint64_t zl_replaying_seq; /* current replay seq number */ uint32_t zl_suspend; /* log suspend count */ kcondvar_t zl_cv_writer; /* log writer thread completion */ kcondvar_t zl_cv_suspend; /* log suspend completion */ uint8_t zl_suspending; /* log is currently suspending */ uint8_t zl_keep_first; /* keep first log block in destroy */ - uint8_t zl_stop_replay; /* don't replay any further */ + uint8_t zl_replay; /* replaying records while set */ uint8_t zl_stop_sync; /* for debugging */ uint8_t zl_writer; /* boolean: write setup in progress */ uint8_t zl_log_error; /* boolean: log write error */ diff --git a/module/zfs/include/sys/zio.h b/module/zfs/include/sys/zio.h index 4de78dfee..21b0fbc6b 100644 --- a/module/zfs/include/sys/zio.h +++ b/module/zfs/include/sys/zio.h @@ -132,12 +132,14 @@ enum zio_compress { #define ZIO_FLAG_IO_RETRY 0x00400 #define ZIO_FLAG_IO_REWRITE 0x00800 -#define ZIO_FLAG_PROBE 0x01000 +#define ZIO_FLAG_SELF_HEAL 0x01000 #define ZIO_FLAG_RESILVER 0x02000 #define ZIO_FLAG_SCRUB 0x04000 #define ZIO_FLAG_SCRUB_THREAD 0x08000 -#define ZIO_FLAG_GANG_CHILD 0x10000 +#define ZIO_FLAG_PROBE 0x10000 +#define ZIO_FLAG_GANG_CHILD 0x20000 +#define ZIO_FLAG_RAW 0x40000 #define ZIO_FLAG_GANG_INHERIT \ (ZIO_FLAG_CANFAIL | \ @@ -146,6 +148,7 @@ enum zio_compress { ZIO_FLAG_DONT_RETRY | \ ZIO_FLAG_DONT_CACHE | \ ZIO_FLAG_DONT_AGGREGATE | \ + ZIO_FLAG_SELF_HEAL | \ ZIO_FLAG_RESILVER | \ ZIO_FLAG_SCRUB | \ ZIO_FLAG_SCRUB_THREAD) @@ -156,6 +159,14 @@ enum zio_compress { ZIO_FLAG_IO_RETRY | \ 
ZIO_FLAG_PROBE) +#define ZIO_FLAG_AGG_INHERIT \ + (ZIO_FLAG_DONT_AGGREGATE | \ + ZIO_FLAG_IO_REPAIR | \ + ZIO_FLAG_SELF_HEAL | \ + ZIO_FLAG_RESILVER | \ + ZIO_FLAG_SCRUB | \ + ZIO_FLAG_SCRUB_THREAD) + #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 87727fac2..412832968 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -720,6 +720,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, vdev_t *vd; int dshift = 3; int all_zero; + int zio_lock = B_FALSE; + boolean_t allocatable; uint64_t offset = -1ULL; uint64_t asize; uint64_t distance; @@ -778,11 +780,20 @@ top: all_zero = B_TRUE; do { vd = mg->mg_vd; + /* * Don't allocate from faulted devices. */ - if (!vdev_allocatable(vd)) + if (zio_lock) { + spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); + allocatable = vdev_allocatable(vd); + spa_config_exit(spa, SCL_ZIO, FTAG); + } else { + allocatable = vdev_allocatable(vd); + } + if (!allocatable) goto next; + /* * Avoid writing single-copy data to a failing vdev */ @@ -858,6 +869,12 @@ next: goto top; } + if (!zio_lock) { + dshift = 3; + zio_lock = B_TRUE; + goto top; + } + bzero(&dva[d], sizeof (dva_t)); return (ENOSPC); @@ -946,7 +963,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) space_map_claim(&msp->ms_map, offset, size); - if (spa_mode & FWRITE) { /* don't dirty if we're zdb(1M) */ + if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index fb1b96f8b..ef04b7c94 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -486,11 +486,12 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) * Activate an uninitialized pool. 
*/ static void -spa_activate(spa_t *spa) +spa_activate(spa_t *spa, int mode) { ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; + spa->spa_mode = mode; spa->spa_normal_class = metaslab_class_create(); spa->spa_log_class = metaslab_class_create(); @@ -640,11 +641,6 @@ spa_unload(spa_t *spa) mutex_exit(&spa->spa_async_root_lock); /* - * Drop and purge level 2 cache - */ - spa_l2cache_drop(spa); - - /* * Close the dsl pool. */ if (spa->spa_dsl_pool) { @@ -652,6 +648,13 @@ spa_unload(spa_t *spa) spa->spa_dsl_pool = NULL; } + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * Drop and purge level 2 cache + */ + spa_l2cache_drop(spa); + /* * Close all vdevs. */ @@ -686,6 +689,8 @@ spa_unload(spa_t *spa) spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; + + spa_config_exit(spa, SCL_ALL, FTAG); } /* @@ -897,12 +902,9 @@ spa_load_l2cache(spa_t *spa) vd = oldvdevs[i]; if (vd != NULL) { - if ((spa_mode & FWRITE) && - spa_l2cache_exists(vd->vdev_guid, &pool) && - pool != 0ULL && - l2arc_vdev_present(vd)) { + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); - } (void) vdev_close(vd); spa_l2cache_remove(vd); } @@ -1018,8 +1020,16 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) uint64_t pool_guid; uint64_t version; uint64_t autoreplace = 0; + int orig_mode = spa->spa_mode; char *ereport = FM_EREPORT_ZFS_POOL; + /* + * If this is an untrusted config, access the pool in read-only mode. + * This prevents things like resilvering recently removed devices. + */ + if (!mosconfig) + spa->spa_mode = FREAD; + ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa->spa_load_state = state; @@ -1077,12 +1087,13 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * Validate the labels for all leaf vdevs. We need to grab the config * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER. 
*/ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - - if (error != 0) - goto out; + if (mosconfig) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_validate(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error != 0) + goto out; + } if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { error = ENXIO; @@ -1184,7 +1195,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) spa_config_set(spa, newconfig); spa_unload(spa); spa_deactivate(spa); - spa_activate(spa); + spa_activate(spa, orig_mode); return (spa_load(spa, newconfig, state, B_TRUE)); } @@ -1376,10 +1387,11 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) goto out; } - if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { + if (spa_writeable(spa)) { dmu_tx_t *tx; int need_update = B_FALSE; - int c; + + ASSERT(state != SPA_LOAD_TRYIMPORT); /* * Claim log blocks that haven't been committed yet. @@ -1407,7 +1419,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) state == SPA_LOAD_IMPORT) need_update = B_TRUE; - for (c = 0; c < rvd->vdev_children; c++) + for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; @@ -1417,6 +1429,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) */ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + + /* + * Check all DTLs to see if anything needs resilvering. 
+ */ + if (vdev_resilver_needed(rvd, NULL, NULL)) + spa_async_request(spa, SPA_ASYNC_RESILVER); } error = 0; @@ -1469,7 +1487,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { - spa_activate(spa); + spa_activate(spa, spa_mode_global); error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); @@ -1873,11 +1891,9 @@ spa_l2cache_drop(spa_t *spa) vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); - if ((spa_mode & FWRITE) && - spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && - l2arc_vdev_present(vd)) { + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); - } if (vd->vdev_isl2cache) spa_l2cache_remove(vd); vdev_clear_stats(vd); @@ -1918,7 +1934,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(pool, altroot); - spa_activate(spa); + spa_activate(spa, spa_mode_global); spa->spa_uberblock.ub_txg = txg - 1; @@ -2121,7 +2137,7 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(pool, altroot); - spa_activate(spa); + spa_activate(spa, spa_mode_global); if (allowfaulted) spa->spa_import_faulted = B_TRUE; @@ -2160,7 +2176,8 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, VDEV_ALLOC_L2CACHE); spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { + if (error != 0 || (props && spa_writeable(spa) && + (error = spa_prop_set(spa, props)))) { if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { /* * If we failed to load the pool, but 'allowfaulted' is @@ -2219,7 +2236,7 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, spa->spa_l2cache.sav_sync = B_TRUE; } - if (spa_mode & FWRITE) { + if 
(spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. */ @@ -2367,11 +2384,11 @@ spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) char *cdevid, *cpath; uint64_t tmptxg; + cpath = NULL; + cdevid = NULL; if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, - &cpath) != 0) - return (EINVAL); - if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID, - &cdevid) != 0) + &cpath) != 0 && nvlist_lookup_string(child[c], + ZPOOL_CONFIG_DEVID, &cdevid) != 0) return (EINVAL); if ((spa_check_rootconf(cpath, cdevid, NULL, &tmptxg) == 0) && (tmptxg > txg)) { @@ -2489,7 +2506,7 @@ spa_tryimport(nvlist_t *tryconfig) */ mutex_enter(&spa_namespace_lock); spa = spa_add(TRYIMPORT_NAME, NULL); - spa_activate(spa); + spa_activate(spa, FREAD); /* * Pass off the heavy lifting to spa_load(). @@ -2563,18 +2580,19 @@ spa_tryimport(nvlist_t *tryconfig) * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the - * configuration from the cache afterwards. + * configuration from the cache afterwards. If the 'hardforce' flag is set, then + * we don't sync the labels or remove the configuration cache. */ static int spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, - boolean_t force) + boolean_t force, boolean_t hardforce) { spa_t *spa; if (oldconfig) *oldconfig = NULL; - if (!(spa_mode & FWRITE)) + if (!(spa_mode_global & FWRITE)) return (EROFS); mutex_enter(&spa_namespace_lock); @@ -2635,7 +2653,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. 
*/ - if (new_state != POOL_STATE_UNINITIALIZED) { + if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; spa->spa_final_txg = spa_last_synced_txg(spa) + 1; @@ -2655,7 +2673,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { - spa_config_sync(spa, B_TRUE, B_TRUE); + if (!hardforce) + spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); } mutex_exit(&spa_namespace_lock); @@ -2669,16 +2688,19 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, int spa_destroy(char *pool) { - return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE)); + return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, + B_FALSE, B_FALSE)); } /* * Export a storage pool. */ int -spa_export(char *pool, nvlist_t **oldconfig, boolean_t force) +spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, + boolean_t hardforce) { - return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force)); + return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, + force, hardforce)); } /* @@ -2689,7 +2711,7 @@ int spa_reset(char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, - B_FALSE)); + B_FALSE, B_FALSE)); } /* @@ -2705,7 +2727,7 @@ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { uint64_t txg; - int c, error; + int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; @@ -2744,7 +2766,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) /* * Transfer each new top-level vdev from vd to rvd. 
*/ - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); tvd->vdev_id = rvd->vdev_children; @@ -2952,10 +2974,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ open_txg = txg + TXG_CONCURRENT_STATES - 1; - mutex_enter(&newvd->vdev_dtl_lock); - space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, - open_txg - TXG_INITIAL + 1); - mutex_exit(&newvd->vdev_dtl_lock); + vdev_dtl_dirty(newvd, DTL_MISSING, + TXG_INITIAL, open_txg - TXG_INITIAL + 1); if (newvd->vdev_isspare) spa_spare_activate(newvd); @@ -2999,10 +3019,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) * is a replacing vdev. */ int -spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) +spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; - int c, t, error; + int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; @@ -3022,6 +3042,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) pvd = vd->vdev_parent; /* + * If the parent/child relationship is not as expected, don't do it. + * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing + * vdev that's replacing B with C. The user's intent in replacing + * is to go from M(A,B) to M(A,C). If the user decides to cancel + * the replace by detaching C, the expected behavior is to end up + * M(A,B). But suppose that right after deciding to detach C, + * the replacement of B completes. We would have M(A,C), and then + * ask to detach C, which would leave us with just A -- not what + * the user wanted. To prevent this, we make sure that the + * parent/child relationship hasn't changed -- in this example, + * that C's parent is still the replacing vdev R. 
+ */ + if (pvd->vdev_guid != pguid && pguid != 0) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + + /* * If replace_done is specified, only remove this device if it's * the first child of a replacing vdev. For the 'spare' vdev, either * disk can be removed. @@ -3047,36 +3083,13 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* - * If there's only one replica, you can't detach it. + * If this device has the only valid copy of some data, + * we cannot safely detach it. */ - if (pvd->vdev_children <= 1) + if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - /* - * If all siblings have non-empty DTLs, this device may have the only - * valid copy of the data, which means we cannot safely detach it. - * - * XXX -- as in the vdev_offline() case, we really want a more - * precise DTL check. - */ - for (c = 0; c < pvd->vdev_children; c++) { - uint64_t dirty; - - cvd = pvd->vdev_child[c]; - if (cvd == vd) - continue; - if (vdev_is_dead(cvd)) - continue; - mutex_enter(&cvd->vdev_dtl_lock); - dirty = cvd->vdev_dtl_map.sm_space | - cvd->vdev_dtl_scrub.sm_space; - mutex_exit(&cvd->vdev_dtl_lock); - if (!dirty) - break; - } - - if (c == pvd->vdev_children) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then @@ -3102,7 +3115,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0) + vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) unspare = B_TRUE; /* @@ -3128,14 +3141,18 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) /* * If we need to remove the remaining child from the list of hot spares, - * do it now, marking the vdev as no longer a spare in the process. 
We - * must do this before vdev_remove_parent(), because that can change the - * GUID if it creates a new toplevel GUID. + * do it now, marking the vdev as no longer a spare in the process. + * We must do this before vdev_remove_parent(), because that can + * change the GUID if it creates a new toplevel GUID. For a similar + * reason, we must remove the spare now, in the same txg as the detach; + * otherwise someone could attach a new sibling, change the GUID, and + * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; + (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); } /* @@ -3173,7 +3190,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ - for (t = 0; t < TXG_SIZE; t++) + for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); @@ -3188,11 +3205,14 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) * list of every other pool. 
*/ if (unspare) { + spa_t *myspa = spa; spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa->spa_state != POOL_STATE_ACTIVE) continue; + if (spa == myspa) + continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); @@ -3256,10 +3276,12 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) vdev_t *vd; nvlist_t **spares, **l2cache, *nv; uint_t nspares, nl2cache; - uint64_t txg; + uint64_t txg = 0; int error = 0; + boolean_t locked = MUTEX_HELD(&spa_namespace_lock); - txg = spa_vdev_enter(spa); + if (!locked) + txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -3302,7 +3324,10 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) error = ENOENT; } - return (spa_vdev_exit(spa, NULL, txg, error)); + if (!locked) + return (spa_vdev_exit(spa, NULL, txg, error)); + + return (error); } /* @@ -3328,13 +3353,9 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) oldvd = vd->vdev_child[0]; newvd = vd->vdev_child[1]; - mutex_enter(&newvd->vdev_dtl_lock); - if (newvd->vdev_dtl_map.sm_space == 0 && - newvd->vdev_dtl_scrub.sm_space == 0) { - mutex_exit(&newvd->vdev_dtl_lock); + if (vdev_dtl_empty(newvd, DTL_MISSING) && + !vdev_dtl_required(oldvd)) return (oldvd); - } - mutex_exit(&newvd->vdev_dtl_lock); } /* @@ -3344,15 +3365,12 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) newvd = vd->vdev_child[0]; oldvd = vd->vdev_child[1]; - mutex_enter(&newvd->vdev_dtl_lock); if (newvd->vdev_unspare && - newvd->vdev_dtl_map.sm_space == 0 && - newvd->vdev_dtl_scrub.sm_space == 0) { + vdev_dtl_empty(newvd, DTL_MISSING) && + !vdev_dtl_required(oldvd)) { newvd->vdev_unspare = 0; - mutex_exit(&newvd->vdev_dtl_lock); return (oldvd); } - mutex_exit(&newvd->vdev_dtl_lock); } return (NULL); @@ -3361,36 +3379,37 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) static void spa_vdev_resilver_done(spa_t *spa) { - vdev_t *vd; - vdev_t *pvd; - uint64_t guid; - uint64_t pguid = 
0; + vdev_t *vd, *pvd, *ppvd; + uint64_t guid, sguid, pguid, ppguid; - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { + pvd = vd->vdev_parent; + ppvd = pvd->vdev_parent; guid = vd->vdev_guid; + pguid = pvd->vdev_guid; + ppguid = ppvd->vdev_guid; + sguid = 0; /* * If we have just finished replacing a hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ - pvd = vd->vdev_parent; - if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && - pvd->vdev_id == 0) { + if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); - ASSERT(pvd->vdev_parent->vdev_children == 2); - pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; + ASSERT(ppvd->vdev_children == 2); + sguid = ppvd->vdev_child[1]->vdev_guid; } - spa_config_exit(spa, SCL_CONFIG, FTAG); - if (spa_vdev_detach(spa, guid, B_TRUE) != 0) + spa_config_exit(spa, SCL_ALL, FTAG); + if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; - if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) + if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } - spa_config_exit(spa, SCL_CONFIG, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); } /* @@ -3925,9 +3944,22 @@ spa_sync(spa_t *spa, uint64_t txg) * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { - vdev_state_clean(vd); - vdev_config_dirty(vd); + while (list_head(&spa->spa_state_dirty_list) != NULL) { + /* + * We need the write lock here because, for aux vdevs, + * calling vdev_config_dirty() modifies sav_config. 
+ * This is ugly and will become unnecessary when we + * eliminate the aux vdev wart by integrating all vdevs + * into the root vdev tree. + */ + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { + vdev_state_clean(vd); + vdev_config_dirty(vd); + } + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index ee425a916..252869d69 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -208,6 +208,9 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (rootdir == NULL) + return; + /* * Iterate over all cachefiles for the pool, past or present. When the * cachefile is changed, the new one is pushed onto this list, allowing diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 36046e6df..485e83fce 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -230,7 +230,7 @@ static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; -int spa_mode; +int spa_mode_global; #ifdef ZFS_DEBUG /* Everything except dprintf is on by default in debug builds */ @@ -880,8 +880,10 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) txg_wait_synced(spa->spa_dsl_pool, txg); if (vd != NULL) { - ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0); + ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); vdev_free(vd); + spa_config_exit(spa, SCL_ALL, spa); } /* @@ -912,6 +914,15 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) spa_config_exit(spa, SCL_STATE_ALL, spa); + /* + * If anything changed, wait for it to sync. 
This ensures that, + * from the system administrator's perspective, zpool(1M) commands + * are synchronous. This is important for things like zpool offline: + * when the command completes, you expect no further I/O from ZFS. + */ + if (vd != NULL) + txg_wait_synced(spa->spa_dsl_pool, 0); + return (error); } @@ -1351,7 +1362,7 @@ spa_init(int mode) avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); - spa_mode = mode; + spa_mode_global = mode; refcount_init(); unique_init(); @@ -1408,3 +1419,15 @@ spa_is_root(spa_t *spa) { return (spa->spa_is_root); } + +boolean_t +spa_writeable(spa_t *spa) +{ + return (!!(spa->spa_mode & FWRITE)); +} + +int +spa_mode(spa_t *spa) +{ + return (spa->spa_mode); +} diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index 0a1fd59ea..1cdacc81d 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/dmu.h> @@ -60,6 +58,8 @@ space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift, { bzero(sm, sizeof (*sm)); + cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL); + avl_create(&sm->sm_root, space_map_seg_compare, sizeof (space_seg_t), offsetof(struct space_seg, ss_node)); @@ -75,6 +75,7 @@ space_map_destroy(space_map_t *sm) ASSERT(!sm->sm_loaded && !sm->sm_loading); VERIFY3U(sm->sm_space, ==, 0); avl_destroy(&sm->sm_root); + cv_destroy(&sm->sm_load_cv); } void @@ -180,7 +181,7 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) sm->sm_space -= size; } -int +boolean_t space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) { avl_index_t where; @@ -220,59 +221,10 @@ space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) { space_seg_t *ss; - for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) - func(mdest, ss->ss_start, ss->ss_end - 
ss->ss_start); -} - -void -space_map_excise(space_map_t *sm, uint64_t start, uint64_t size) -{ - avl_tree_t *t = &sm->sm_root; - avl_index_t where; - space_seg_t *ss, search; - uint64_t end = start + size; - uint64_t rm_start, rm_end; - ASSERT(MUTEX_HELD(sm->sm_lock)); - search.ss_start = start; - search.ss_end = start; - - for (;;) { - ss = avl_find(t, &search, &where); - - if (ss == NULL) - ss = avl_nearest(t, where, AVL_AFTER); - - if (ss == NULL || ss->ss_start >= end) - break; - - rm_start = MAX(ss->ss_start, start); - rm_end = MIN(ss->ss_end, end); - - space_map_remove(sm, rm_start, rm_end - rm_start); - } -} - -/* - * Replace smd with the union of smd and sms. - */ -void -space_map_union(space_map_t *smd, space_map_t *sms) -{ - avl_tree_t *t = &sms->sm_root; - space_seg_t *ss; - - ASSERT(MUTEX_HELD(smd->sm_lock)); - - /* - * For each source segment, remove any intersections with the - * destination, then add the source segment to the destination. - */ - for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) { - space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start); - space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start); - } + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); } /* @@ -504,3 +456,131 @@ space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) smo->smo_objsize = 0; smo->smo_alloc = 0; } + +/* + * Space map reference trees. + * + * A space map is a collection of integers. Every integer is either + * in the map, or it's not. A space map reference tree generalizes + * the idea: it allows its members to have arbitrary reference counts, + * as opposed to the implicit reference count of 0 or 1 in a space map. + * This representation comes in handy when computing the union or + * intersection of multiple space maps. For example, the union of + * N space maps is the subset of the reference tree with refcnt >= 1. 
+ * The intersection of N space maps is the subset with refcnt >= N. + * + * [It's very much like a Fourier transform. Unions and intersections + * are hard to perform in the 'space map domain', so we convert the maps + * into the 'reference count domain', where it's trivial, then invert.] + * + * vdev_dtl_reassess() uses computations of this form to determine + * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev + * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev + * has an outage wherever refcnt >= vdev_children. + */ +static int +space_map_ref_compare(const void *x1, const void *x2) +{ + const space_ref_t *sr1 = x1; + const space_ref_t *sr2 = x2; + + if (sr1->sr_offset < sr2->sr_offset) + return (-1); + if (sr1->sr_offset > sr2->sr_offset) + return (1); + + if (sr1 < sr2) + return (-1); + if (sr1 > sr2) + return (1); + + return (0); +} + +void +space_map_ref_create(avl_tree_t *t) +{ + avl_create(t, space_map_ref_compare, + sizeof (space_ref_t), offsetof(space_ref_t, sr_node)); +} + +void +space_map_ref_destroy(avl_tree_t *t) +{ + space_ref_t *sr; + void *cookie = NULL; + + while ((sr = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(sr, sizeof (*sr)); + + avl_destroy(t); +} + +static void +space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt) +{ + space_ref_t *sr; + + sr = kmem_alloc(sizeof (*sr), KM_SLEEP); + sr->sr_offset = offset; + sr->sr_refcnt = refcnt; + + avl_add(t, sr); +} + +void +space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, + int64_t refcnt) +{ + space_map_ref_add_node(t, start, refcnt); + space_map_ref_add_node(t, end, -refcnt); +} + +/* + * Convert (or add) a space map into a reference tree. 
+ */ +void +space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt) +{ + space_seg_t *ss; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt); +} + +/* + * Convert a reference tree into a space map. The space map will contain + * all members of the reference tree for which refcnt >= minref. + */ +void +space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref) +{ + uint64_t start = -1ULL; + int64_t refcnt = 0; + space_ref_t *sr; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + space_map_vacate(sm, NULL, NULL); + + for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { + refcnt += sr->sr_refcnt; + if (refcnt >= minref) { + if (start == -1ULL) { + start = sr->sr_offset; + } + } else { + if (start != -1ULL) { + uint64_t end = sr->sr_offset; + ASSERT(start <= end); + if (end > start) + space_map_add(sm, start, end - start); + start = -1ULL; + } + } + } + ASSERT(refcnt == 0); + ASSERT(start == -1ULL); +} diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 2bbf2f086..e3c0e2a13 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -63,6 +63,12 @@ txg_init(dsl_pool_t *dp, uint64_t txg) rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL); mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); + tx->tx_open_txg = txg; } @@ -80,6 +86,12 @@ txg_fini(dsl_pool_t *dp) rw_destroy(&tx->tx_suspend); mutex_destroy(&tx->tx_sync_lock); + cv_destroy(&tx->tx_sync_more_cv); + cv_destroy(&tx->tx_sync_done_cv); + cv_destroy(&tx->tx_quiesce_more_cv); + cv_destroy(&tx->tx_quiesce_done_cv); + cv_destroy(&tx->tx_exit_cv); + for (c = 0; c < max_ncpus; c++) { int 
i; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 16a27e514..d9689e803 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -316,8 +316,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); - space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); - space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0, + &vd->vdev_dtl_lock); + } txg_list_create(&vd->vdev_ms_list, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, @@ -474,7 +476,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, - &vd->vdev_dtl.smo_object); + &vd->vdev_dtl_smo.smo_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } @@ -566,12 +568,14 @@ vdev_free(vdev_t *vd) txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); + mutex_enter(&vd->vdev_dtl_lock); - space_map_unload(&vd->vdev_dtl_map); - space_map_destroy(&vd->vdev_dtl_map); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - space_map_destroy(&vd->vdev_dtl_scrub); + for (int t = 0; t < DTL_TYPES; t++) { + space_map_unload(&vd->vdev_dtl[t]); + space_map_destroy(&vd->vdev_dtl[t]); + } mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); @@ -709,14 +713,18 @@ vdev_remove_parent(vdev_t *cvd) vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); + /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. 
* Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ - if (mvd->vdev_top == mvd) - cvd->vdev_guid = cvd->vdev_guid_sum = mvd->vdev_guid; + if (mvd->vdev_top == mvd) { + uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; + cvd->vdev_guid += guid_delta; + cvd->vdev_guid_sum += guid_delta; + } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); @@ -815,6 +823,7 @@ typedef struct vdev_probe_stats { static void vdev_probe_done(zio_t *zio) { + spa_t *spa = zio->io_spa; vdev_probe_stats_t *vps = zio->io_private; vdev_t *vd = vps->vps_vd; @@ -822,7 +831,7 @@ vdev_probe_done(zio_t *zio) ASSERT(zio->io_vd == vd); if (zio->io_error == 0) vps->vps_readable = 1; - if (zio->io_error == 0 && (spa_mode & FWRITE)) { + if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vps->vps_root, vd, zio->io_offset, zio->io_size, zio->io_data, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, @@ -843,12 +852,12 @@ vdev_probe_done(zio_t *zio) vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && - (vdev_writeable(vd) || !(spa_mode & FWRITE))) { + (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, - zio->io_spa, vd, NULL, 0, 0); + spa, vd, NULL, 0, 0); zio->io_error = ENXIO; } kmem_free(vps, sizeof (*vps)); @@ -916,12 +925,15 @@ vdev_probe(vdev_t *vd, zio_t *pio) int vdev_open(vdev_t *vd) { + spa_t *spa = vd->vdev_spa; int error; int c; uint64_t osize = 0; uint64_t asize, psize; uint64_t ashift = 0; + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); @@ -1055,16 +1067,12 @@ vdev_open(vdev_t *vd) /* * If a leaf vdev has a DTL, and 
seems healthy, then kick off a - * resilver. But don't do this if we are doing a reopen for a - * scrub, since this would just restart the scrub we are already - * doing. + * resilver. But don't do this if we are doing a reopen for a scrub, + * since this would just restart the scrub we are already doing. */ - if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) { - mutex_enter(&vd->vdev_dtl_lock); - if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) - spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER); - mutex_exit(&vd->vdev_dtl_lock); - } + if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && + vdev_resilver_needed(vd, NULL, NULL)) + spa_async_request(spa, SPA_ASYNC_RESILVER); return (0); } @@ -1165,6 +1173,10 @@ vdev_validate(vdev_t *vd) void vdev_close(vdev_t *vd) { + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); @@ -1283,34 +1295,88 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } +/* + * DTLs. + * + * A vdev's DTL (dirty time log) is the set of transaction groups for which + * the vdev has less than perfect replication. There are four kinds of DTL: + * + * DTL_MISSING: txgs for which the vdev has no valid copies of the data + * + * DTL_PARTIAL: txgs for which data is available, but not fully replicated + * + * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon + * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of + * txgs that was scrubbed. + * + * DTL_OUTAGE: txgs which cannot currently be read, whether due to + * persistent errors or just some device being offline. + * Unlike the other three, the DTL_OUTAGE map is not generally + * maintained; it's only computed when needed, typically to + * determine whether a device can be detached.
+ * + * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device + * either has the data or it doesn't. + * + * For interior vdevs such as mirror and RAID-Z the picture is more complex. + * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because + * if any child is less than fully replicated, then so is its parent. + * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, + * comprising only those txgs which appear in more than 'maxfaults' children; + * those are the txgs we don't have enough replication to read. For example, + * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); + * thus, its DTL_MISSING consists of the set of txgs that appear in more than + * two child DTL_MISSING maps. + * + * It should be clear from the above that to compute the DTLs and outage maps + * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. + * Therefore, that is all we keep on disk. When loading the pool, or after + * a configuration change, we generate all other DTLs from first principles. + */ void -vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size) +vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { + space_map_t *sm = &vd->vdev_dtl[t]; + + ASSERT(t < DTL_TYPES); + ASSERT(vd != vd->vdev_spa->spa_root_vdev); + mutex_enter(sm->sm_lock); if (!space_map_contains(sm, txg, size)) space_map_add(sm, txg, size); mutex_exit(sm->sm_lock); } -int -vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) +boolean_t +vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { - int dirty; + space_map_t *sm = &vd->vdev_dtl[t]; + boolean_t dirty = B_FALSE; - /* - * Quick test without the lock -- covers the common case that - * there are no dirty time segments.
- */ - if (sm->sm_space == 0) - return (0); + ASSERT(t < DTL_TYPES); + ASSERT(vd != vd->vdev_spa->spa_root_vdev); mutex_enter(sm->sm_lock); - dirty = space_map_contains(sm, txg, size); + if (sm->sm_space != 0) + dirty = space_map_contains(sm, txg, size); mutex_exit(sm->sm_lock); return (dirty); } +boolean_t +vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) +{ + space_map_t *sm = &vd->vdev_dtl[t]; + boolean_t empty; + + mutex_enter(sm->sm_lock); + empty = (sm->sm_space == 0); + mutex_exit(sm->sm_lock); + + return (empty); +} + /* * Reassess DTLs after a config change or scrub completion. */ @@ -1318,11 +1384,19 @@ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) { spa_t *spa = vd->vdev_spa; - int c; + avl_tree_t reftree; + int minref; - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - if (vd->vdev_children == 0) { + for (int c = 0; c < vd->vdev_children; c++) + vdev_dtl_reassess(vd->vdev_child[c], txg, + scrub_txg, scrub_done); + + if (vd == spa->spa_root_vdev) + return; + + if (vd->vdev_ops->vdev_op_leaf) { mutex_enter(&vd->vdev_dtl_lock); if (scrub_txg != 0 && (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { @@ -1333,12 +1407,38 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) * will be valid, so excise the old region and * fold in the scrub dtl. Otherwise, leave the * dtl as-is if there was an error. + * + * There's a little trick here: to excise the beginning + * of the DTL_MISSING map, we put it into a reference + * tree and then add a segment with refcnt -1 that + * covers the range [0, scrub_txg). This means + * that each txg in that range has refcnt -1 or 0. + * We then add DTL_SCRUB with a refcnt of 2, so that + * entries in the range [0, scrub_txg) will have a + * positive refcnt -- either 1 or 2. We then convert + * the reference tree into the new DTL_MISSING map.
*/ - space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); - space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); + space_map_ref_create(&reftree); + space_map_ref_add_map(&reftree, + &vd->vdev_dtl[DTL_MISSING], 1); + space_map_ref_add_seg(&reftree, 0, scrub_txg, -1); + space_map_ref_add_map(&reftree, + &vd->vdev_dtl[DTL_SCRUB], 2); + space_map_ref_generate_map(&reftree, + &vd->vdev_dtl[DTL_MISSING], 1); + space_map_ref_destroy(&reftree); } + space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); + space_map_walk(&vd->vdev_dtl[DTL_MISSING], + space_map_add, &vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL); + space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); + if (!vdev_readable(vd)) + space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); + else + space_map_walk(&vd->vdev_dtl[DTL_MISSING], + space_map_add, &vd->vdev_dtl[DTL_OUTAGE]); mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) @@ -1346,35 +1446,34 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) return; } - /* - * Make sure the DTLs are always correct under the scrub lock. - */ - if (vd == spa->spa_root_vdev) - mutex_enter(&spa->spa_scrub_lock); - mutex_enter(&vd->vdev_dtl_lock); - space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - mutex_exit(&vd->vdev_dtl_lock); - - for (c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done); - mutex_enter(&vd->vdev_dtl_lock); - space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map); - space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); - mutex_exit(&vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + if (t == DTL_SCRUB) + continue; /* leaf vdevs only */ + if (t == DTL_PARTIAL) + minref = 1; /* i.e. 
non-zero */ + else if (vd->vdev_nparity != 0) + minref = vd->vdev_nparity + 1; /* RAID-Z */ + else + minref = vd->vdev_children; /* any kind of mirror */ + space_map_ref_create(&reftree); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + mutex_enter(&cvd->vdev_dtl_lock); + space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1); + mutex_exit(&cvd->vdev_dtl_lock); + } + space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref); + space_map_ref_destroy(&reftree); } - - if (vd == spa->spa_root_vdev) - mutex_exit(&spa->spa_scrub_lock); + mutex_exit(&vd->vdev_dtl_lock); } static int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - space_map_obj_t *smo = &vd->vdev_dtl; + space_map_obj_t *smo = &vd->vdev_dtl_smo; objset_t *mos = spa->spa_meta_objset; dmu_buf_t *db; int error; @@ -1392,7 +1491,8 @@ vdev_dtl_load(vdev_t *vd) dmu_buf_rele(db, FTAG); mutex_enter(&vd->vdev_dtl_lock); - error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos); + error = space_map_load(&vd->vdev_dtl[DTL_MISSING], + NULL, SM_ALLOC, smo, mos); mutex_exit(&vd->vdev_dtl_lock); return (error); @@ -1402,8 +1502,8 @@ void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; - space_map_obj_t *smo = &vd->vdev_dtl; - space_map_t *sm = &vd->vdev_dtl_map; + space_map_obj_t *smo = &vd->vdev_dtl_smo; + space_map_t *sm = &vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; space_map_t smsync; kmutex_t smlock; @@ -1461,6 +1561,37 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) } /* + * Determine whether the specified vdev can be offlined/detached/removed + * without losing data. 
+ */ +boolean_t +vdev_dtl_required(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *tvd = vd->vdev_top; + uint8_t cant_read = vd->vdev_cant_read; + boolean_t required; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + if (vd == spa->spa_root_vdev || vd == tvd) + return (B_TRUE); + + /* + * Temporarily mark the device as unreadable, and then determine + * whether this results in any DTL outages in the top-level vdev. + * If not, we can safely offline/detach/remove the device. + */ + vd->vdev_cant_read = B_TRUE; + vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + required = !vdev_dtl_empty(tvd, DTL_OUTAGE); + vd->vdev_cant_read = cant_read; + vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + + return (required); +} + +/* * Determine if resilver is needed, and if so the txg range. */ boolean_t @@ -1472,19 +1603,19 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); - if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) { + if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 && + vdev_writeable(vd)) { space_seg_t *ss; - ss = avl_first(&vd->vdev_dtl_map.sm_root); + ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root); thismin = ss->ss_start - 1; - ss = avl_last(&vd->vdev_dtl_map.sm_root); + ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root); thismax = ss->ss_end; needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { - int c; - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; @@ -1506,12 +1637,10 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) void vdev_load(vdev_t *vd) { - int c; - /* * Recursively load all children. 
*/ - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_load(vd->vdev_child[c]); /* @@ -1731,11 +1860,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; - (void) spa_vdev_state_exit(spa, vd, 0); - - VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); - - return (0); + return (spa_vdev_state_exit(spa, vd, 0)); } int @@ -1756,13 +1881,10 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) */ if (!vd->vdev_offline) { /* - * If this device's top-level vdev has a non-empty DTL, - * don't allow the device to be offlined. - * - * XXX -- make this more precise by allowing the offline - * as long as the remaining devices don't have any DTL holes. + * If this device has the only valid copy of some data, + * don't allow it to be offlined. */ - if (vd->vdev_top->vdev_dtl_map.sm_space != 0) + if (vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* @@ -1772,7 +1894,7 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) */ vd->vdev_offline = B_TRUE; vdev_reopen(vd->vdev_top); - if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { + if (vd->vdev_aux == NULL && vdev_is_dead(vd->vdev_top)) { vd->vdev_offline = B_FALSE; vdev_reopen(vd->vdev_top); return (spa_vdev_state_exit(spa, NULL, EBUSY)); @@ -1852,13 +1974,17 @@ vdev_writeable(vdev_t *vd) boolean_t vdev_allocatable(vdev_t *vd) { + uint64_t state = vd->vdev_state; + /* - * We currently allow allocations from vdevs which maybe in the + * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding - * the proper locks. + * the proper locks. Note that we have to get the vdev state + * in a local variable because although it changes atomically, + * we're asking two separate questions about it. 
*/ - return (!(vdev_is_dead(vd) && vd->vdev_state != VDEV_STATE_CLOSED) && + return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write); } @@ -1928,7 +2054,8 @@ vdev_clear_stats(vdev_t *vd) void vdev_stat_update(zio_t *zio, uint64_t psize) { - vdev_t *rvd = zio->io_spa->spa_root_vdev; + spa_t *spa = zio->io_spa; + vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; @@ -1961,21 +2088,23 @@ vdev_stat_update(zio_t *zio, uint64_t psize) return; ASSERT(vd == zio->io_vd); - if (!(flags & ZIO_FLAG_IO_BYPASS)) { - mutex_enter(&vd->vdev_stat_lock); - vs->vs_ops[type]++; - vs->vs_bytes[type] += psize; - mutex_exit(&vd->vdev_stat_lock); - } + + if (flags & ZIO_FLAG_IO_BYPASS) + return; + + mutex_enter(&vd->vdev_stat_lock); + if (flags & ZIO_FLAG_IO_REPAIR) { - ASSERT(zio->io_delegate_list == NULL); - mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_SCRUB_THREAD) vs->vs_scrub_repaired += psize; - else + if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; - mutex_exit(&vd->vdev_stat_lock); } + + vs->vs_ops[type]++; + vs->vs_bytes[type] += psize; + + mutex_exit(&vd->vdev_stat_lock); return; } @@ -1993,19 +2122,39 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_WRITE && txg != 0 && vd->vdev_children == 0) { - if (flags & ZIO_FLAG_SCRUB_THREAD) { - ASSERT(flags & ZIO_FLAG_IO_REPAIR); - for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) - vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); - } - if (!(flags & ZIO_FLAG_IO_REPAIR)) { - if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1)) + if (type == ZIO_TYPE_WRITE && txg != 0 && + (!(flags & ZIO_FLAG_IO_REPAIR) || + (flags & ZIO_FLAG_SCRUB_THREAD))) { + /* + * This is either a normal write (not a repair), or it's a + * repair induced by the scrub thread. In the normal case, + * we commit the DTL change in the same txg as the block + * was born. 
In the scrub-induced repair case, we know that + * scrubs run in first-pass syncing context, so we commit + * the DTL change in spa->spa_syncing_txg. + * + * We currently do not make DTL entries for failed spontaneous + * self-healing writes triggered by normal (non-scrubbing) + * reads, because we have no transactional context in which to + * do so -- and it's not clear that it'd be desirable anyway. + */ + if (vd->vdev_ops->vdev_op_leaf) { + uint64_t commit_txg = txg; + if (flags & ZIO_FLAG_SCRUB_THREAD) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + ASSERT(spa_sync_pass(spa) == 1); + vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); + commit_txg = spa->spa_syncing_txg; + } + ASSERT(commit_txg >= spa->spa_syncing_txg); + if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; - vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) - vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1); + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); + vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } + if (vd != rvd) + vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } @@ -2218,7 +2367,8 @@ vdev_state_clean(vdev_t *vd) void vdev_propagate_state(vdev_t *vd) { - vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; int c; @@ -2229,7 +2379,7 @@ vdev_propagate_state(vdev_t *vd) child = vd->vdev_child[c]; if (!vdev_readable(child) || - (!vdev_writeable(child) && (spa_mode & FWRITE))) { + (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index dc0e920bf..f91dddbe5 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -61,7 +61,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) */ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); error 
= vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, - spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); + spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; @@ -105,7 +105,8 @@ vdev_file_close(vdev_t *vd) if (vf->vf_vnode != NULL) { (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); - (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL); + (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, + kcred, NULL); VN_RELE(vf->vf_vnode); } diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index bf930466f..f8f90196b 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -277,9 +277,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_islog) == 0); } - if (vd->vdev_dtl.smo_object != 0) + if (vd->vdev_dtl_smo.smo_object != 0) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, - vd->vdev_dtl.smo_object) == 0); + vd->vdev_dtl_smo.smo_object) == 0); if (getstats) { vdev_stat_t vs; @@ -520,9 +520,6 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) return (EBUSY); - ASSERT(reason != VDEV_LABEL_REMOVE || - vdev_inuse(vd, crtxg, reason, NULL, NULL)); - /* * If this is a request to add or replace a spare or l2cache device * that is in use elsewhere on the system, then we must update the @@ -705,6 +702,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) */ /* + * For use by zdb and debugging purposes only + */ +uint64_t ub_max_txg = UINT64_MAX; + +/* * Consider the following situation: txg is safely synced to disk. We've * written the first uberblock for txg + 1, and then we lose power. 
When we * come back up, we fail to see the uberblock for txg + 1 because, say, @@ -741,7 +743,8 @@ vdev_uberblock_load_done(zio_t *zio) if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); - if (vdev_uberblock_compare(ub, ubbest) > 0) + if (ub->ub_txg <= ub_max_txg && + vdev_uberblock_compare(ub, ubbest) > 0) *ubbest = *ub; mutex_exit(&rio->io_lock); } diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index c4629ff45..184da82ab 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -225,7 +225,7 @@ vdev_mirror_child_select(zio_t *zio) mc->mc_skipped = 1; continue; } - if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1)) + if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) return (c); mc->mc_error = ESTALE; mc->mc_skipped = 1; @@ -282,20 +282,10 @@ vdev_mirror_io_start(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); /* - * If this is a resilvering I/O to a replacing vdev, - * only the last child should be written -- unless the - * first child happens to have a DTL entry here as well. - * All other writes go to all children. + * Writes go to all children. 
*/ - if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing && - !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map, - zio->io_txg, 1)) { - c = mm->mm_children - 1; - children = 1; - } else { - c = 0; - children = mm->mm_children; - } + c = 0; + children = mm->mm_children; } while (children--) { @@ -398,7 +388,7 @@ vdev_mirror_io_done(zio_t *zio) ASSERT(zio->io_error != 0); } - if (good_copies && (spa_mode & FWRITE) && + if (good_copies && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER) || ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { @@ -419,7 +409,7 @@ vdev_mirror_io_done(zio_t *zio) if (mc->mc_tried) continue; if (!(zio->io_flags & ZIO_FLAG_SCRUB) && - !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, + !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; mc->mc_error = ESTALE; @@ -429,7 +419,8 @@ vdev_mirror_io_done(zio_t *zio) mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR, NULL, NULL)); + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 46fca0e3b..137afdd42 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -176,6 +176,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) zio_t *fio, *lio, *aio, *dio; avl_tree_t *tree; uint64_t size; + int flags; ASSERT(MUTEX_HELD(&vq->vq_lock)); @@ -187,21 +188,32 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) tree = fio->io_vdev_tree; size = fio->io_size; - - while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && - !((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && - size + dio->io_size <= zfs_vdev_aggregation_limit) { - dio->io_delegate_next = fio; - fio = dio; - size += dio->io_size; - } - - while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && - !((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && - size + dio->io_size <= zfs_vdev_aggregation_limit) { - lio->io_delegate_next = dio; - lio = dio; - size += dio->io_size; + flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; + + if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { + /* + * We can aggregate I/Os that are adjacent and of the + * same flavor, as expressed by the AGG_INHERIT flags. + * The latter is necessary so that certain attributes + * of the I/O, such as whether it's a normal I/O or a + * scrub/resilver, can be preserved in the aggregate. 
+ */ + while ((dio = AVL_PREV(tree, fio)) != NULL && + IS_ADJACENT(dio, fio) && + (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && + size + dio->io_size <= zfs_vdev_aggregation_limit) { + dio->io_delegate_next = fio; + fio = dio; + size += dio->io_size; + } + while ((dio = AVL_NEXT(tree, lio)) != NULL && + IS_ADJACENT(lio, dio) && + (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && + size + dio->io_size <= zfs_vdev_aggregation_limit) { + lio->io_delegate_next = dio; + lio = dio; + size += dio->io_size; + } } if (fio != lio) { @@ -212,7 +224,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, buf, size, fio->io_type, ZIO_PRIORITY_NOW, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, + flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_delegate_list = fio; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 69e314468..ad997f528 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -687,7 +687,7 @@ vdev_raidz_io_start(zio_t *zio) rc->rc_skipped = 1; continue; } - if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { + if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else @@ -1165,7 +1165,7 @@ vdev_raidz_io_done(zio_t *zio) done: zio_checksum_verified(zio); - if (zio->io_error == 0 && (spa_mode & FWRITE) && + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { /* * Use the good data we have in hand to repair damaged children. @@ -1180,7 +1180,8 @@ done: zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR, NULL, NULL)); + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c index 341dc4dfe..fdf92a156 100644 --- a/module/zfs/zfs_acl.c +++ b/module/zfs/zfs_acl.c @@ -2148,12 +2148,12 @@ top: } } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_lock); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -2208,7 +2208,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, *check_privs = B_TRUE; - if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ + if (zfsvfs->z_replay) { *working_mode = 0; return (0); } diff --git a/module/zfs/zfs_byteswap.c b/module/zfs/zfs_byteswap.c index ab97f83eb..cd36696f9 100644 --- a/module/zfs/zfs_byteswap.c +++ b/module/zfs/zfs_byteswap.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/vfs.h> #include <sys/fs/zfs.h> @@ -63,6 +61,20 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) while (ptr < end) { if (zfs_layout) { + /* + * Avoid overrun. Embedded aces can have one + * of several sizes. We don't know exactly + * how many our present, only the size of the + * buffer containing them. That size may be + * larger than needed to hold the aces + * present. As long as we do not do any + * swapping beyond the end of our block we are + * okay. It it safe to swap any non-ace data + * within the block since it is just zeros. 
+ */ + if (ptr + sizeof (zfs_ace_hdr_t) > end) { + break; + } zacep = (zfs_ace_t *)ptr; zacep->z_hdr.z_access_mask = BSWAP_32(zacep->z_hdr.z_access_mask); @@ -71,6 +83,10 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) BSWAP_16(zacep->z_hdr.z_type); entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS; } else { + /* Overrun avoidance */ + if (ptr + sizeof (ace_t) > end) { + break; + } acep = (ace_t *)ptr; acep->a_access_mask = BSWAP_32(acep->a_access_mask); acep->a_flags = BSWAP_16(acep->a_flags); @@ -87,8 +103,14 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) break; case ACE_IDENTIFIER_GROUP: default: + /* Overrun avoidance */ if (zfs_layout) { - zacep->z_fuid = BSWAP_64(zacep->z_fuid); + if (ptr + sizeof (zfs_ace_t) <= end) { + zacep->z_fuid = BSWAP_64(zacep->z_fuid); + } else { + entry_size = sizeof (zfs_ace_t); + break; + } } switch (ace_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: @@ -169,7 +191,8 @@ zfs_znode_byteswap(void *buf, size_t size) if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) { zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0], ZFS_ACE_SPACE); - } else + } else { zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT); + } } diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c index 1ec493264..9353d0199 100644 --- a/module/zfs/zfs_dir.c +++ b/module/zfs/zfs_dir.c @@ -562,24 +562,6 @@ zfs_rmnode(znode_t *zp) ASSERT(zp->z_phys->zp_links == 0); /* - * If this is a ZIL replay then leave the object in the unlinked set. - * Otherwise we can get a deadlock, because the delete can be - * quite large and span multiple tx's and txgs, but each replay - * creates a tx to atomically run the replay function and mark the - * replay record as complete. We deadlock trying to start a tx in - * a new txg to further the deletion but can't because the replay - * tx hasn't finished. 
- * - * We actually delete the object if we get a failure to create an - * object in zil_replay_log_record(), or after calling zil_replay(). - */ - if (zfsvfs->z_assign >= TXG_INITIAL) { - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - return; - } - - /* * If this is an attribute directory, purge its contents. */ if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) { @@ -845,9 +827,9 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) FUID_SIZE_ESTIMATE(zfsvfs)); } } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) + if (error == ERESTART) dmu_tx_wait(tx); dmu_tx_abort(tx); return (error); @@ -930,7 +912,7 @@ top: error = zfs_make_xattrdir(zp, &va, xvpp, cr); zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { /* NB: we already did dmu_tx_wait() if necessary */ goto top; } @@ -959,7 +941,7 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) uid_t fowner; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ + if (zdp->z_zfsvfs->z_replay) return (0); if ((zdp->z_phys->zp_mode & S_ISVTX) == 0) diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c index 7cb505258..286dafba8 100644 --- a/module/zfs/zfs_fuid.c +++ b/module/zfs/zfs_fuid.c @@ -519,7 +519,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, uint32_t rid; idmap_stat status; uint64_t idx; - boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL); zfs_fuid_t *zfuid = NULL; zfs_fuid_info_t *fuidp; @@ -534,7 +533,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) return (id); - if (is_replay) { + if (zfsvfs->z_replay) { fuidp = zfsvfs->z_fuid_replay; /* @@ -584,7 +583,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, idx = 
zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); - if (!is_replay) + if (!zfsvfs->z_replay) zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type); else if (zfuid != NULL) { list_remove(&fuidp->z_fuids, zfuid); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index b6ad57451..49ee55265 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -856,9 +856,10 @@ zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; + boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); - error = spa_export(zc->zc_name, NULL, force); + error = spa_export(zc->zc_name, NULL, force, hardforce); return (error); } @@ -1162,7 +1163,7 @@ zfs_ioc_vdev_detach(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE); + error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 11cd4c264..84d64b4df 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -45,13 +45,33 @@ #include <sys/spa.h> #include <sys/zfs_fuid.h> #include <sys/ddi.h> +#include <sys/dsl_dataset.h> + +#define ZFS_HANDLE_REPLAY(zilog, tx) \ + if (zilog->zl_replay) { \ + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); \ + zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = \ + zilog->zl_replaying_seq; \ + return; \ + } /* - * All the functions in this file are used to construct the log entries - * to record transactions. They allocate * an intent log transaction - * structure (itx_t) and save within it all the information necessary to - * possibly replay the transaction. The itx is then assigned a sequence - * number and inserted in the in-memory list anchored in the zilog. 
+ * These zfs_log_* functions must be called within a dmu tx, in one + * of 2 contexts depending on zilog->z_replay: + * + * Non replay mode + * --------------- + * We need to record the transaction so that if it is committed to + * the Intent Log then it can be replayed. An intent log transaction + * structure (itx_t) is allocated and all the information necessary to + * possibly replay the transaction is saved in it. The itx is then assigned + * a sequence number and inserted in the in-memory list anchored in the zilog. + * + * Replay mode + * ----------- + * We need to mark the intent log record as replayed in the log header. + * This is done in the same transaction as the replay so that they + * commit atomically. */ int @@ -231,6 +251,8 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, if (zilog == NULL) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + /* * If we have FUIDs present then add in space for * domains and ACE fuid's if any. @@ -334,6 +356,8 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, if (zilog == NULL) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); lr = (lr_remove_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; @@ -358,6 +382,8 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, if (zilog == NULL) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); lr = (lr_link_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; @@ -385,6 +411,8 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, if (zilog == NULL) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; @@ -419,6 +447,8 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, if (zilog == NULL) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + itx 
= zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); lr = (lr_rename_t *)&itx->itx_lr; lr->lr_sdoid = sdzp->z_id; @@ -451,6 +481,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, if (zilog == NULL || zp->z_unlinked) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + /* * Writes are handled in three different ways: * @@ -549,6 +581,8 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, if (zilog == NULL || zp->z_unlinked) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_truncate_t *)&itx->itx_lr; lr->lr_foid = zp->z_id; @@ -578,6 +612,8 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, if (zilog == NULL || zp->z_unlinked) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + /* * If XVATTR set, then log record size needs to allow * for lr_attr_t + xvattr mask, mapsize and create time @@ -644,6 +680,8 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, if (zilog == NULL || zp->z_unlinked) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ? TX_ACL_V0 : TX_ACL; diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index 06b4dee46..1bf1bc527 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -583,21 +583,50 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) * allow replays to succeed. */ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; - zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; - - /* - * Parse and replay the intent log. - */ - zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, - zfs_replay_vector, zfs_unlinked_drain); + if (readonly != 0) + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + else + zfs_unlinked_drain(zfsvfs); - zfs_unlinked_drain(zfsvfs); + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + if (zil_disable) { + zil_destroy(zfsvfs->z_log, 0); + zfsvfs->z_log = NULL; + } else { + /* + * Parse and replay the intent log. 
+ * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + } zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ } - if (!zil_disable) - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); - return (0); } @@ -634,7 +663,6 @@ zfs_domount(vfs_t *vfsp, char *osname) zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); zfsvfs->z_vfs = vfsp; zfsvfs->z_parent = zfsvfs; - zfsvfs->z_assign = TXG_NOWAIT; zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 8e0037e37..f62d3bfa0 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -105,9 +105,7 @@ * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * - * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). 
- * In normal operation, this will be TXG_NOWAIT. During ZIL replay, - * it will be a specific txg. Either way, dmu_tx_assign() never blocks. + * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign(). * This is critical because we don't want to block while holding locks. * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing to @@ -124,6 +122,8 @@ * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. + * During ZIL replay the zfs_log_* functions will update the sequence + * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. @@ -139,12 +139,12 @@ * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify - * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign + * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes - * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + * if (error == ERESTART) { * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; @@ -698,10 +698,9 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); continue; @@ -807,7 +806,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, 
cred_t *cr, caller_context_t *ct) * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ - if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { + if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } @@ -1233,11 +1232,10 @@ top: dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -1449,11 +1447,11 @@ top: /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); VN_RELE(vp); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -1659,10 +1657,10 @@ top: if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -1789,13 +1787,13 @@ top: dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); VN_RELE(vp); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ 
-2342,6 +2340,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; + xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask; int trim_mask = 0; @@ -2396,6 +2395,8 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, */ xoap = xva_getxoptattr(xvap); + xva_init(&tmpxvattr); + /* * Immutable files can only alter immutable bit and atime */ @@ -2518,28 +2519,78 @@ top: oldva.va_mode = pzp->zp_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { - if ((need_policy == FALSE) && - (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) && - xoap->xoa_appendonly != - ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) && - xoap->xoa_nounlink != - ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) && - xoap->xoa_immutable != - ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_NODUMP) && - xoap->xoa_nodump != - ((pzp->zp_flags & ZFS_NODUMP) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) && - xoap->xoa_av_modified != - ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) || - ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) && - ((vp->v_type != VREG && xoap->xoa_av_quarantined) || - xoap->xoa_av_quarantined != - ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) || - (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || - (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. 
+ */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((pzp->zp_flags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((vp->v_type != VREG && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } @@ -2649,7 +2700,7 @@ top: dmu_tx_hold_bonus(tx, attrzp->z_id); } - err = dmu_tx_assign(tx, zfsvfs->z_assign); + err = dmu_tx_assign(tx, TXG_NOWAIT); if (err) { if (attrzp) VN_RELE(ZTOV(attrzp)); @@ -2659,7 +2710,7 @@ top: aclp = NULL; } - if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (err == ERESTART) { dmu_tx_wait(tx); 
dmu_tx_abort(tx); goto top; @@ -2732,6 +2783,31 @@ top: */ if (xoap && (mask & AT_XVATTR)) { + + /* + * restore trimmed off masks + * so that return masks can be set for caller. + */ + + if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { size_t len; dmu_object_info_t doi; @@ -3104,7 +3180,7 @@ top: if (tzp) dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); @@ -3113,7 +3189,7 @@ top: VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -3242,10 +3318,10 @@ top: FUID_SIZE_ESTIMATE(zfsvfs)); } } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -3462,10 +3538,10 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, szp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { 
dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -3547,7 +3623,7 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, len = PAGESIZE; /* * If our blocksize is bigger than the page size, try to kluster - * muiltiple pages so that we write a full block (thus avoiding + * multiple pages so that we write a full block (thus avoiding * a read-modify-write). */ if (off < filesz && zp->z_blksz > PAGESIZE) { @@ -3589,9 +3665,9 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_bonus(tx, zp->z_id); - err = dmu_tx_assign(tx, zfsvfs->z_assign); + err = dmu_tx_assign(tx, TXG_NOWAIT); if (err != 0) { - if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (err == ERESTART) { zfs_range_unlock(rl); dmu_tx_wait(tx); dmu_tx_abort(tx); diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 25751ae5f..9a7860380 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -734,7 +734,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); - if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ + if (zfsvfs->z_replay) { obj = vap->va_nodeid; flag |= IS_REPLAY; now = vap->va_ctime; /* see zfs_replay_create() */ @@ -1254,9 +1254,9 @@ top: newblksz = 0; } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -1358,9 +1358,9 @@ zfs_trunc(znode_t *zp, uint64_t end) top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -1456,9 +1456,9 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, 
boolean_t log) log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto log; @@ -1562,7 +1562,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) bzero(&zfsvfs, sizeof (zfsvfs_t)); zfsvfs.z_os = os; - zfsvfs.z_assign = TXG_NOWAIT; zfsvfs.z_parent = &zfsvfs; zfsvfs.z_version = version; zfsvfs.z_use_fuids = USE_FUIDS(version, os); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 95101882b..83fef0d87 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -351,14 +351,20 @@ zil_create(zilog_t *zilog) blk = zh->zh_log; /* - * If we don't already have an initial log block, allocate one now. + * If we don't already have an initial log block or we have one + * but it's the wrong endianness then allocate one. */ - if (BP_IS_HOLE(&blk)) { + if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); (void) dmu_tx_assign(tx, TXG_WAIT); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); + if (!BP_IS_HOLE(&blk)) { + zio_free_blk(zilog->zl_spa, &blk, txg); + BP_ZERO(&blk); + } + error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk, NULL, txg); @@ -1214,7 +1220,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) ASSERT(zilog->zl_stop_sync == 0); - zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK]; + zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK]; if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; @@ -1223,7 +1229,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) ASSERT(spa_sync_pass(spa) == 1); bzero(zh, sizeof (zil_header_t)); - bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq)); + bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* @@ -1460,9 +1466,7 @@ zil_resume(zilog_t *zilog) 
typedef struct zil_replay_arg { objset_t *zr_os; zil_replay_func_t **zr_replay; - zil_replay_cleaner_t *zr_replay_cleaner; void *zr_arg; - uint64_t *zr_txgp; boolean_t zr_byteswap; char *zr_lrbuf; } zil_replay_arg_t; @@ -1475,9 +1479,9 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; char *name; - int pass, error, sunk; + int pass, error; - if (zilog->zl_stop_replay) + if (!zilog->zl_replay) /* giving up */ return; if (lr->lrc_txg < claim_txg) /* already committed */ @@ -1489,6 +1493,11 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; + if (txtype == 0 || txtype >= TX_MAX_TYPE) { + error = EINVAL; + goto bad; + } + /* * Make a copy of the data so we can revise and extend it. */ @@ -1539,69 +1548,16 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) } /* - * Replay of large truncates can end up needing additional txs - * and a different txg. If they are nested within the replay tx - * as below then a hang is possible. So we do the truncate here - * and redo the truncate later (a no-op) and update the sequence - * number whilst in the replay tx. Fortunately, it's safe to repeat - * a truncate if we crash and the truncate commits. A create over - * an existing file will also come in as a TX_TRUNCATE record. - * - * Note, remove of large files and renames over large files is - * handled by putting the deleted object on a stable list - * and if necessary force deleting the object outside of the replay - * transaction using the zr_replay_cleaner. 
- */ - if (txtype == TX_TRUNCATE) { - *zr->zr_txgp = TXG_NOWAIT; - error = zr->zr_replay[TX_TRUNCATE](zr->zr_arg, zr->zr_lrbuf, - zr->zr_byteswap); - if (error) - goto bad; - zr->zr_byteswap = 0; /* only byteswap once */ - } - - /* * We must now do two things atomically: replay this log record, - * and update the log header to reflect the fact that we did so. - * We use the DMU's ability to assign into a specific txg to do this. + * and update the log header sequence number to reflect the fact that + * we did so. At the end of each replay function the sequence number + * is updated if we are in replay mode. */ - for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) { - uint64_t replay_txg; - dmu_tx_t *replay_tx; - - replay_tx = dmu_tx_create(zr->zr_os); - error = dmu_tx_assign(replay_tx, TXG_WAIT); - if (error) { - dmu_tx_abort(replay_tx); - break; - } - - replay_txg = dmu_tx_get_txg(replay_tx); - - if (txtype == 0 || txtype >= TX_MAX_TYPE) { - error = EINVAL; - } else { - /* - * On the first pass, arrange for the replay vector - * to fail its dmu_tx_assign(). That's the only way - * to ensure that those code paths remain well tested. - * - * Only byteswap (if needed) on the 1st pass. - */ - *zr->zr_txgp = replay_txg - (pass == 1); - error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, - zr->zr_byteswap && pass == 1); - *zr->zr_txgp = TXG_NOWAIT; - } - - if (error == 0) { - dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx); - zilog->zl_replay_seq[replay_txg & TXG_MASK] = - lr->lrc_seq; - } - - dmu_tx_commit(replay_tx); + for (pass = 1; pass <= 2; pass++) { + zilog->zl_replaying_seq = lr->lrc_seq; + /* Only byteswap (if needed) on the 1st pass. 
*/ + error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, + zr->zr_byteswap && pass == 1); if (!error) return; @@ -1609,37 +1565,22 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with - * EEXIST. So if we receive any error other than ERESTART - * we try syncing out any removes then retrying the - * transaction. + * EEXIST. So if we receive any error we try syncing out + * any removes then retry the transaction. */ - if (error != ERESTART && !sunk) { - if (zr->zr_replay_cleaner) - zr->zr_replay_cleaner(zr->zr_arg); + if (pass == 1) txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); - sunk = B_TRUE; - continue; /* retry */ - } - - if (error != ERESTART) - break; - - if (pass != 1) - txg_wait_open(spa_get_dsl(zilog->zl_spa), - replay_txg + 1); - - dprintf("pass %d, retrying\n", pass); } bad: - ASSERT(error && error != ERESTART); + ASSERT(error); name = kmem_alloc(MAXNAMELEN, KM_SLEEP); dmu_objset_name(zr->zr_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype, (lr->lrc_txtype & TX_CI) ? "CI" : ""); - zilog->zl_stop_replay = 1; + zilog->zl_replay = B_FALSE; kmem_free(name, MAXNAMELEN); } @@ -1654,9 +1595,7 @@ zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) * If this dataset has a non-empty intent log, replay it and destroy it. 
*/ void -zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE], - zil_replay_cleaner_t *replay_cleaner) +zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; @@ -1669,9 +1608,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp, zr.zr_os = os; zr.zr_replay = replay_func; - zr.zr_replay_cleaner = replay_cleaner; zr.zr_arg = arg; - zr.zr_txgp = txgp; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); @@ -1680,7 +1617,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp, */ txg_wait_synced(zilog->zl_dmu_pool, 0); - zilog->zl_stop_replay = 0; + zilog->zl_replay = B_TRUE; zilog->zl_replay_time = lbolt; ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, @@ -1689,6 +1626,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp, zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + zilog->zl_replay = B_FALSE; } /* diff --git a/module/zfs/zio.c b/module/zfs/zio.c index d347920ea..62af799f5 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -767,7 +767,8 @@ zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) { + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && + zio->io_logical == zio && !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t csize = BP_GET_PSIZE(bp); void *cbuf = zio_buf_alloc(csize); @@ -1790,7 +1791,30 @@ zio_vdev_io_start(zio_t *zio) ASSERT(P2PHASE(zio->io_offset, align) == 0); ASSERT(P2PHASE(zio->io_size, align) == 0); - ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); + ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); + + /* + * If this is a repair I/O, and there's no self-healing involved -- + * that is, we're just resilvering what we expect to resilver -- 
+ * then don't do the I/O unless zio's txg is actually in vd's DTL. + * This prevents spurious resilvering with nested replication. + * For example, given a mirror of mirrors, (A+B)+(C+D), if only + * A is out of date, we'll read from C+D, then use the data to + * resilver A+B -- but we don't actually want to resilver B, just A. + * The top-level mirror has no way to know this, so instead we just + * discard unnecessary repairs as we work our way down the vdev tree. + * The same logic applies to any form of nested replication: + * ditto + mirror, RAID-Z + replacing, etc. This covers them all. + */ + if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && + !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && + zio->io_txg != 0 && /* not a delegated i/o */ + !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_delegate_list == NULL); + zio_vdev_io_bypass(zio); + return (ZIO_PIPELINE_CONTINUE); + } if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { @@ -1806,7 +1830,6 @@ zio_vdev_io_start(zio_t *zio) zio_interrupt(zio); return (ZIO_PIPELINE_STOP); } - } return (vd->vdev_ops->vdev_op_io_start(zio)); @@ -2157,6 +2180,7 @@ zio_done(zio_t *zio) if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && zio->io_error == ENXIO && + spa->spa_load_state == SPA_LOAD_NONE && spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 4e993060c..0206dad9e 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -75,6 +75,7 @@ #include <sys/vdev_impl.h> #include <sys/zvol.h> #include <sys/dumphdr.h> +#include <sys/zil_impl.h> #include "zfs_namecheck.h" @@ -113,7 +114,6 @@ typedef struct zvol_state { uint32_t zv_total_opens; /* total open count */ zilog_t *zv_zilog; /* ZIL handle */ list_t zv_extents; /* List of extents for dump */ - uint64_t zv_txg_assign; /* txg to assign during ZIL 
replay */ znode_t zv_znode; /* for range locking */ } zvol_state_t; @@ -381,7 +381,7 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); - error = dmu_tx_assign(tx, zv->zv_txg_assign); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { @@ -558,7 +558,7 @@ zvol_create_minor(const char *name, major_t maj) ASSERT(error == 0); zv->zv_volblocksize = doi.doi_data_block_size; - zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL); + zil_replay(os, zv, zvol_replay_vector); zvol_size_changed(zv, maj); /* XXX this should handle the possible i/o error */ @@ -971,8 +971,16 @@ static void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) { uint32_t blocksize = zv->zv_volblocksize; + zilog_t *zilog = zv->zv_zilog; lr_write_t *lr; + if (zilog->zl_replay) { + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = + zilog->zl_replaying_seq; + return; + } + while (len) { ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr)); @@ -987,7 +995,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t); BP_ZERO(&lr->lr_blkptr); - (void) zil_itx_assign(zv->zv_zilog, itx, tx); + (void) zil_itx_assign(zilog, itx, tx); len -= nbytes; off += nbytes; } |