diff options
Diffstat (limited to 'include/sys')
-rw-r--r-- | include/sys/Makefile.am | 4 | ||||
-rw-r--r-- | include/sys/bpobj.h | 4 | ||||
-rw-r--r-- | include/sys/dbuf.h | 2 | ||||
-rw-r--r-- | include/sys/dmu.h | 11 | ||||
-rw-r--r-- | include/sys/dnode.h | 1 | ||||
-rw-r--r-- | include/sys/dsl_dataset.h | 30 | ||||
-rw-r--r-- | include/sys/dsl_deadlist.h | 2 | ||||
-rw-r--r-- | include/sys/dsl_deleg.h | 3 | ||||
-rw-r--r-- | include/sys/dsl_dir.h | 5 | ||||
-rw-r--r-- | include/sys/dsl_pool.h | 5 | ||||
-rw-r--r-- | include/sys/dsl_scan.h | 3 | ||||
-rw-r--r-- | include/sys/fs/zfs.h | 32 | ||||
-rw-r--r-- | include/sys/metaslab.h | 7 | ||||
-rw-r--r-- | include/sys/metaslab_impl.h | 16 | ||||
-rw-r--r-- | include/sys/range_tree.h | 13 | ||||
-rw-r--r-- | include/sys/spa.h | 15 | ||||
-rw-r--r-- | include/sys/spa_impl.h | 74 | ||||
-rw-r--r-- | include/sys/space_map.h | 13 | ||||
-rw-r--r-- | include/sys/trace_vdev.h | 119 | ||||
-rw-r--r-- | include/sys/vdev.h | 9 | ||||
-rw-r--r-- | include/sys/vdev_impl.h | 95 | ||||
-rw-r--r-- | include/sys/vdev_indirect_births.h | 80 | ||||
-rw-r--r-- | include/sys/vdev_indirect_mapping.h | 141 | ||||
-rw-r--r-- | include/sys/vdev_removal.h | 93 | ||||
-rw-r--r-- | include/sys/zfs_debug.h | 3 | ||||
-rw-r--r-- | include/sys/zil.h | 2 | ||||
-rw-r--r-- | include/sys/zio.h | 5 | ||||
-rw-r--r-- | include/sys/zio_priority.h | 1 |
28 files changed, 753 insertions, 35 deletions
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 8e18a8790..f30c9427e 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -71,6 +71,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/trace_dnode.h \ $(top_srcdir)/include/sys/trace_multilist.h \ $(top_srcdir)/include/sys/trace_txg.h \ + $(top_srcdir)/include/sys/trace_vdev.h \ $(top_srcdir)/include/sys/trace_zil.h \ $(top_srcdir)/include/sys/trace_zio.h \ $(top_srcdir)/include/sys/trace_zrlock.h \ @@ -87,8 +88,11 @@ COMMON_H = \ $(top_srcdir)/include/sys/vdev_file.h \ $(top_srcdir)/include/sys/vdev.h \ $(top_srcdir)/include/sys/vdev_impl.h \ + $(top_srcdir)/include/sys/vdev_indirect_births.h \ + $(top_srcdir)/include/sys/vdev_indirect_mapping.h \ $(top_srcdir)/include/sys/vdev_raidz.h \ $(top_srcdir)/include/sys/vdev_raidz_impl.h \ + $(top_srcdir)/include/sys/vdev_removal.h \ $(top_srcdir)/include/sys/xvattr.h \ $(top_srcdir)/include/sys/zap.h \ $(top_srcdir)/include/sys/zap_impl.h \ diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h index 2a365199c..d425e239f 100644 --- a/include/sys/bpobj.h +++ b/include/sys/bpobj.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
*/ #ifndef _SYS_BPOBJ_H @@ -74,6 +74,7 @@ void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx); int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object); void bpobj_close(bpobj_t *bpo); +boolean_t bpobj_is_open(const bpobj_t *bpo); int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *); @@ -85,6 +86,7 @@ int bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +boolean_t bpobj_is_empty(bpobj_t *bpo); #ifdef __cplusplus } diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 633dfd25a..32f036862 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -318,6 +318,8 @@ void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx); void dbuf_release_bp(dmu_buf_impl_t *db); +boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf); + void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, struct dmu_tx *); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 027d3d9fc..ccc07006d 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -326,6 +326,7 @@ int dmu_objset_find(char *name, int func(const char *, void *), void *arg, void dmu_objset_byteswap(void *buf, size_t size); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); +int dmu_objset_remap_indirects(const char *fsname); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ @@ -362,6 +363,9 @@ typedef struct dmu_buf { #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" #define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt" #define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map" +#define DMU_POOL_REMOVING "com.delphix:removing" +#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" +#define 
DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" /* * Allocate an object from this objset. The range of object numbers @@ -470,6 +474,8 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, int dmu_object_dirty_raw(objset_t *os, uint64_t object, dmu_tx_t *tx); +int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg); + void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); @@ -488,8 +494,8 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus - * data. As with any normal buffer, you must call dmu_buf_read() to - * read db_data, dmu_buf_will_dirty() before modifying it, and the + * data. As with any normal buffer, you must call dmu_buf_will_dirty() + * before modifying it, and the * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release what you hold with dmu_buf_rele(). 
@@ -740,6 +746,7 @@ void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len); +void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name); diff --git a/include/sys/dnode.h b/include/sys/dnode.h index 2f70d5446..9c1df45e9 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -432,6 +432,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, void dnode_evict_dbufs(dnode_t *dn); void dnode_evict_bonus(dnode_t *dn); void dnode_free_interior_slots(dnode_t *dn); +boolean_t dnode_needs_remap(const dnode_t *dn); #define DNODE_IS_DIRTY(_dn) \ ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa)) diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index 0030fca24..abd178e30 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -109,6 +109,11 @@ struct dsl_crypto_params; #define DS_FIELD_RESUME_RAWOK "com.datto:resume_rawok" /* + * This field is set to the object number of the remap deadlist if one exists. + */ +#define DS_FIELD_REMAP_DEADLIST "com.delphix:remap_deadlist" + +/* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose * name lookups should be performed case-insensitively. */ @@ -169,6 +174,24 @@ typedef struct dsl_dataset { dsl_deadlist_t ds_deadlist; bplist_t ds_pending_deadlist; + /* + * The remap deadlist contains blocks (DVA's, really) that are + * referenced by the previous snapshot and point to indirect vdevs, + * but in this dataset they have been remapped to point to concrete + * (or at least, less-indirect) vdevs. In other words, the + * physical DVA is referenced by the previous snapshot but not by + * this dataset. 
Logically, the DVA continues to be referenced, + * but we are using a different (less indirect) physical DVA. + * This deadlist is used to determine when physical DVAs that + * point to indirect vdevs are no longer referenced anywhere, + * and thus should be marked obsolete. + * + * This is only used if SPA_FEATURE_OBSOLETE_COUNTS is enabled. + */ + dsl_deadlist_t ds_remap_deadlist; + /* protects creation of the ds_remap_deadlist */ + kmutex_t ds_remap_deadlist_lock; + /* protected by lock on pool's dp_dirty_datasets list */ txg_node_t ds_dirty_link; list_node_t ds_synced_link; @@ -328,6 +351,8 @@ void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx); int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async); +void dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, + uint64_t offset, uint64_t size, uint64_t birth, dmu_tx_t *tx); int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value); @@ -416,6 +441,11 @@ void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx); int dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner, nvlist_t *result); +uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds); +void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); +boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds); +void dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); + void dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx); void dsl_dataset_deactivate_feature(uint64_t dsobj, diff --git a/include/sys/dsl_deadlist.h b/include/sys/dsl_deadlist.h index d2c16d72c..08f38233d 100644 --- a/include/sys/dsl_deadlist.h +++ b/include/sys/dsl_deadlist.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. 
*/ #ifndef _SYS_DSL_DEADLIST_H @@ -79,6 +80,7 @@ void dsl_deadlist_space_range(dsl_deadlist_t *dl, void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx); void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dmu_tx_t *tx); +boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl); #ifdef __cplusplus } diff --git a/include/sys/dsl_deleg.h b/include/sys/dsl_deleg.h index eb95c68e8..bb28014ac 100644 --- a/include/sys/dsl_deleg.h +++ b/include/sys/dsl_deleg.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_DSL_DELEG_H @@ -61,6 +61,7 @@ extern "C" { #define ZFS_DELEG_PERM_RELEASE "release" #define ZFS_DELEG_PERM_DIFF "diff" #define ZFS_DELEG_PERM_BOOKMARK "bookmark" +#define ZFS_DELEG_PERM_REMAP "remap" #define ZFS_DELEG_PERM_LOAD_KEY "load-key" #define ZFS_DELEG_PERM_CHANGE_KEY "change-key" #define ZFS_DELEG_PERM_PROJECTUSED "projectused" diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index 8b1e343a0..58d243885 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -48,7 +48,9 @@ struct dsl_dataset; #define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count" #define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" +#define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg" #define DD_FIELD_CRYPTO_KEY_OBJ "com.datto:crypto_key_obj" +#define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg" typedef enum dd_used { DD_USED_HEAD, @@ -152,6 +154,7 @@ void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); uint64_t dsl_dir_space_available(dsl_dir_t *dd, dsl_dir_t *ancestor, int64_t delta, int ondiskonly); void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); +int dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count); void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, uint64_t 
asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx); @@ -169,6 +172,7 @@ int dsl_dir_activate_fs_ss_limit(const char *); int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *, cred_t *); void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *); +int dsl_dir_update_last_remap_txg(dsl_dir_t *, uint64_t); int dsl_dir_rename(const char *oldname, const char *newname); int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *); @@ -185,7 +189,6 @@ boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" #define ORIGIN_DIR_NAME "$ORIGIN" -#define XLATION_DIR_NAME "$XLATION" #define FREE_DIR_NAME "$FREE" #define LEAK_DIR_NAME "$LEAK" diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 9ceb59d9b..c60e4bf9d 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -102,6 +102,7 @@ typedef struct dsl_pool { bpobj_t dp_free_bpobj; uint64_t dp_bptree_obj; uint64_t dp_empty_bpobj; + bpobj_t dp_obsolete_bpobj; struct dsl_scan *dp_scan; @@ -151,7 +152,6 @@ void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg); int dsl_pool_sync_context(dsl_pool_t *dp); uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); -uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree); void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); @@ -180,6 +180,9 @@ int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp); void dsl_pool_rele(dsl_pool_t *dp, void *tag); +void dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx); + #ifdef 
__cplusplus } #endif diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 7a29d9788..345d2754f 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2017 Datto Inc. */ @@ -117,6 +117,7 @@ typedef struct dsl_scan { boolean_t scn_is_bptree; boolean_t scn_async_destroying; boolean_t scn_async_stalled; + uint64_t scn_async_block_min_time_ms; /* flags and stats for controlling scan state */ boolean_t scn_is_sorted; /* doing sequential scan */ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index e40c427f6..de3b729eb 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -180,6 +180,7 @@ typedef enum { ZFS_PROP_ENCRYPTION_ROOT, ZFS_PROP_KEY_GUID, ZFS_PROP_KEYSTATUS, + ZFS_PROP_REMAPTXG, /* not exposed to the user */ ZFS_NUM_PROPS } zfs_prop_t; @@ -587,7 +588,9 @@ typedef struct zpool_rewind_policy { /* * The following are configuration names used in the nvlist describing a pool's - * configuration. + * configuration. New on-disk names should be prefixed with "<reverse-DNS>:" + * (e.g. "org.open-zfs:") to avoid conflicting names being developed + * independently. 
*/ #define ZPOOL_CONFIG_VERSION "version" #define ZPOOL_CONFIG_POOL_NAME "name" @@ -601,6 +604,9 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_CHILDREN "children" #define ZPOOL_CONFIG_ID "id" #define ZPOOL_CONFIG_GUID "guid" +#define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object" +#define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births" +#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" @@ -609,7 +615,9 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_ASIZE "asize" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ /* container nvlist of extended stats */ #define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex" @@ -736,6 +744,13 @@ typedef struct zpool_rewind_policy { #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" +#define VDEV_TYPE_INDIRECT "indirect" + +/* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */ +#define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ + "com.delphix:indirect_obsolete_sm" +#define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \ + "com.delphix:obsolete_counts_are_precise" /* * This is needed in userland to report the minimum necessary device size. 
@@ -884,6 +899,20 @@ typedef struct pool_scan_stat { uint64_t pss_issued; /* total bytes checked by scanner */ } pool_scan_stat_t; +typedef struct pool_removal_stat { + uint64_t prs_state; /* dsl_scan_state_t */ + uint64_t prs_removing_vdev; + uint64_t prs_start_time; + uint64_t prs_end_time; + uint64_t prs_to_copy; /* bytes that need to be copied */ + uint64_t prs_copied; /* bytes copied so far */ + /* + * bytes of memory used for indirect mappings. + * This includes all removed vdevs. + */ + uint64_t prs_mapping_memory; +} pool_removal_stat_t; + typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, @@ -1112,6 +1141,7 @@ typedef enum zfs_ioc { ZFS_IOC_LOAD_KEY, ZFS_IOC_UNLOAD_KEY, ZFS_IOC_CHANGE_KEY, + ZFS_IOC_REMAP, /* * Linux - 3/64 numbers reserved. diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index be271c702..fdcf6c71b 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -67,8 +67,15 @@ uint64_t metaslab_block_maxsize(metaslab_t *); int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *); +int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, + dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); +void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, uint64_t); +void metaslab_free_dva(spa_t *, const dva_t *, uint64_t); +void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *); +void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); +int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); void metaslab_fastwrite_mark(spa_t *, const blkptr_t *); void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index f8a713a4f..76f670a4d 100644 --- 
a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -257,14 +257,13 @@ struct metaslab_group { * Each metaslab maintains a set of in-core trees to track metaslab * operations. The in-core free tree (ms_tree) contains the list of * free segments which are eligible for allocation. As blocks are - * allocated, the allocated segments are removed from the ms_tree and - * added to a per txg allocation tree (ms_alloctree). This allows us to - * process all allocations in syncing context where it is safe to update - * the on-disk space maps. Frees are also processed in syncing context. - * Most frees are generated from syncing context, and those that are not - * are held in the spa_free_bplist for processing in syncing context. - * An additional set of in-core trees is maintained to track deferred - * frees (ms_defertree). Once a block is freed it will move from the + * allocated, the allocated segments are removed from the ms_tree and + * added to a per txg allocation tree (ms_alloctree). As blocks are + * freed, they are added to the free tree (ms_freeingtree). These trees + * allow us to process all allocations and frees in syncing context + * where it is safe to update the on-disk space maps. An additional set + * of in-core trees is maintained to track deferred frees + * (ms_defertree). Once a block is freed it will move from the * ms_freedtree to the ms_defertree. A deferred free means that a block * has been freed but cannot be used by the pool until TXG_DEFER_SIZE * transactions groups later. For example, a block that is freed in txg @@ -310,6 +309,7 @@ struct metaslab_group { */ struct metaslab { kmutex_t ms_lock; + kmutex_t ms_sync_lock; kcondvar_t ms_load_cv; space_map_t *ms_sm; uint64_t ms_id; diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index 1d3bdf9e5..970505628 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. 
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_RANGE_TREE_H @@ -41,6 +41,10 @@ extern "C" { typedef struct range_tree_ops range_tree_ops_t; +/* + * Note: the range_tree may not be accessed concurrently; consumers + * must provide external locking if required. + */ typedef struct range_tree { avl_tree_t rt_root; /* offset-ordered segment AVL tree */ uint64_t rt_space; /* sum of all segments in the map */ @@ -58,7 +62,6 @@ typedef struct range_tree { * 2^i <= size of range in bytes < 2^(i+1) */ uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; - kmutex_t *rt_lock; /* pointer to lock that protects map */ } range_tree_t; typedef struct range_seg { @@ -82,9 +85,8 @@ typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); void range_tree_init(void); void range_tree_fini(void); range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg, - int (*avl_compare) (const void *, const void *), kmutex_t *lp, - uint64_t gap); -range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp); + int (*avl_compare) (const void *, const void *), uint64_t gap); +range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); @@ -94,7 +96,6 @@ uint64_t range_tree_space(range_tree_t *rt); void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); void range_tree_stat_verify(range_tree_t *rt); -void range_tree_set_lock(range_tree_t *rt, kmutex_t *lp); void range_tree_add(void *arg, uint64_t start, uint64_t size); void range_tree_remove(void *arg, uint64_t start, uint64_t size); diff --git a/include/sys/spa.h b/include/sys/spa.h index f93354c78..eb47c21f6 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -227,7 +227,10 @@ 
typedef struct zio_cksum_salt { * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type - * phys birth txg of block allocation; zero if same as logical birth txg + * phys birth txg when dva[0] was written; zero if same as logical birth txg + * note that typically all the dva's would be written in this + * txg, but they could be different if they were moved by + * device removal. * log. birth transaction group in which the block was logically born * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes @@ -817,7 +820,7 @@ extern kmutex_t spa_namespace_lock; #define SPA_CONFIG_UPDATE_POOL 0 #define SPA_CONFIG_UPDATE_VDEVS 1 -extern void spa_config_sync(spa_t *, boolean_t, boolean_t); +extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t); extern void spa_config_load(void); extern nvlist_t *spa_all_configs(uint64_t *); extern void spa_config_set(spa_t *spa, nvlist_t *config); @@ -932,7 +935,7 @@ typedef enum spa_log_state { extern spa_log_state_t spa_get_log_state(spa_t *spa); extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); -extern int spa_offline_log(spa_t *spa); +extern int spa_reset_logs(spa_t *spa); /* Log claim callback */ extern void spa_claim_notify(zio_t *zio); @@ -942,6 +945,7 @@ extern void spa_deadman(void *); extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); extern boolean_t spa_is_initializing(spa_t *spa); +extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); extern void spa_altroot(spa_t *, char *, size_t); @@ -1009,6 +1013,11 @@ extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern int spa_maxdnodesize(spa_t *spa); extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp); +typedef void (*spa_remap_cb_t)(uint64_t 
vdev, uint64_t offset, uint64_t size, + void *arg); +extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, + spa_remap_cb_t callback, void *arg); +extern uint64_t spa_get_last_removal_txg(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa); extern unsigned long spa_get_hostid(void); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 77625d4b0..1741eb9e5 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -33,6 +33,7 @@ #include <sys/spa.h> #include <sys/vdev.h> +#include <sys/vdev_removal.h> #include <sys/metaslab.h> #include <sys/dmu.h> #include <sys/dsl_pool.h> @@ -64,6 +65,62 @@ typedef struct spa_history_phys { uint64_t sh_records_lost; /* num of records overwritten */ } spa_history_phys_t; +/* + * All members must be uint64_t, for byteswap purposes. + */ +typedef struct spa_removing_phys { + uint64_t sr_state; /* dsl_scan_state_t */ + + /* + * The vdev ID that we most recently attempted to remove, + * or -1 if no removal has been attempted. + */ + uint64_t sr_removing_vdev; + + /* + * The vdev ID that we most recently successfully removed, + * or -1 if no devices have been removed. + */ + uint64_t sr_prev_indirect_vdev; + + uint64_t sr_start_time; + uint64_t sr_end_time; + + /* + * Note that we can not use the space map's or indirect mapping's + * accounting as a substitute for these values, because we need to + * count frees of not-yet-copied data as though it did the copy. + * Otherwise, we could get into a situation where copied > to_copy, + * or we complete before copied == to_copy. 
+ */ + uint64_t sr_to_copy; /* bytes that need to be copied */ + uint64_t sr_copied; /* bytes that have been copied or freed */ +} spa_removing_phys_t; + +/* + * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT + * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense + * of an indirect vdev's mapping object is in progress. + */ +typedef struct spa_condensing_indirect_phys { + /* + * The vdev ID of the indirect vdev whose indirect mapping is + * being condensed. + */ + uint64_t scip_vdev; + + /* + * The vdev's old obsolete spacemap. This spacemap's contents are + * being integrated into the new mapping. + */ + uint64_t scip_prev_obsolete_sm_object; + + /* + * The new mapping object that is being created. + */ + uint64_t scip_next_mapping_object; +} spa_condensing_indirect_phys_t; + struct spa_aux_vdev { uint64_t sav_object; /* MOS object for device list */ nvlist_t *sav_config; /* cached device config */ @@ -143,6 +200,7 @@ struct spa { int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ + boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */ uint64_t spa_import_flags; /* import specific flags */ spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; @@ -204,6 +262,14 @@ struct spa { int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ uint16_t spa_async_tasks; /* async task mask */ + + spa_removing_phys_t spa_removing_phys; + spa_vdev_removal_t *spa_vdev_removal; + + spa_condensing_indirect_phys_t spa_condensing_indirect_phys; + spa_condensing_indirect_t *spa_condensing_indirect; + kthread_t *spa_condense_thread; /* thread doing condense. 
*/ + char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ int spa_last_open_failed; /* error if last open failed */ @@ -234,6 +300,7 @@ struct spa { /* per-CPU array of root of async I/O: */ zio_t **spa_async_zio_root; zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ + zio_t *spa_txg_zio[TXG_SIZE]; /* spa_sync() waits for this */ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ kcondvar_t spa_suspend_cv; /* notification of resume */ zio_suspend_reason_t spa_suspended; /* pool is suspended */ @@ -302,6 +369,11 @@ extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags); +extern void spa_load_spares(spa_t *spa); +extern void spa_load_l2cache(spa_t *spa); +extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, + const char *name); +extern void spa_event_post(sysevent_t *ev); #ifdef __cplusplus diff --git a/include/sys/space_map.h b/include/sys/space_map.h index a59e6d37d..457300d05 100644 --- a/include/sys/space_map.h +++ b/include/sys/space_map.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_SPACE_MAP_H @@ -73,6 +73,9 @@ typedef struct space_map_phys { * The space map object defines a region of space, its size, how much is * allocated, and the on-disk object that stores this information. * Consumers of space maps may only access the members of this structure. + * + * Note: the space_map may not be accessed concurrently; consumers + * must provide external locking if required. 
*/ typedef struct space_map { uint64_t sm_start; /* start of map */ @@ -85,7 +88,6 @@ typedef struct space_map { uint32_t sm_blksz; /* block size for space map */ dmu_buf_t *sm_dbuf; /* space_map_phys_t dbuf */ space_map_phys_t *sm_phys; /* on-disk space map */ - kmutex_t *sm_lock; /* pointer to lock that protects map */ } space_map_t; /* @@ -133,7 +135,11 @@ typedef enum { SM_FREE } maptype_t; +typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size, + void *arg); + int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); +int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg); void space_map_histogram_clear(space_map_t *sm); void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, @@ -150,9 +156,10 @@ void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, void space_map_truncate(space_map_t *sm, dmu_tx_t *tx); uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx); void space_map_free(space_map_t *sm, dmu_tx_t *tx); +void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx); int space_map_open(space_map_t **smp, objset_t *os, uint64_t object, - uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp); + uint64_t start, uint64_t size, uint8_t shift); void space_map_close(space_map_t *sm); int64_t space_map_alloc_delta(space_map_t *sm); diff --git a/include/sys/trace_vdev.h b/include/sys/trace_vdev.h new file mode 100644 index 000000000..d7af44c25 --- /dev/null +++ b/include/sys/trace_vdev.h @@ -0,0 +1,119 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_vdev + +#if !defined(_TRACE_VDEV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VDEV_H + +#include <linux/tracepoint.h> +#include <sys/types.h> + +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * spa_t *, ..., + * uint64_t, ..., + * uint64_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_removing_class_3, + TP_PROTO(spa_t *spa, uint64_t offset, uint64_t size), + TP_ARGS(spa, offset, size), + TP_STRUCT__entry( + __field(spa_t *, vdev_spa) + __field(uint64_t, vdev_offset) + __field(uint64_t, vdev_size) + ), + TP_fast_assign( + __entry->vdev_spa = spa; + __entry->vdev_offset = offset; + __entry->vdev_size = size; + ), + TP_printk("spa %p offset %llu size %llu", + __entry->vdev_spa, __entry->vdev_offset, + __entry->vdev_size) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_REMOVE_FREE_EVENT(name) \ +DEFINE_EVENT(zfs_removing_class_3, name, \ + TP_PROTO(spa_t *spa, uint64_t offset, uint64_t size), \ + TP_ARGS(spa, offset, size)) +/* END CSTYLED */ +DEFINE_REMOVE_FREE_EVENT(zfs_remove__free__synced); +DEFINE_REMOVE_FREE_EVENT(zfs_remove__free__unvisited); + +/* + * Generic support for four argument tracepoints of the form: + * + * DTRACE_PROBE4(..., + * spa_t *, ..., + * uint64_t, ..., + * uint64_t, ..., + * uint64_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_removing_class_4, + TP_PROTO(spa_t *spa, uint64_t offset, uint64_t size, 
uint64_t txg), + TP_ARGS(spa, offset, size, txg), + TP_STRUCT__entry( + __field(spa_t *, vdev_spa) + __field(uint64_t, vdev_offset) + __field(uint64_t, vdev_size) + __field(uint64_t, vdev_txg) + ), + TP_fast_assign( + __entry->vdev_spa = spa; + __entry->vdev_offset = offset; + __entry->vdev_size = size; + __entry->vdev_txg = txg; + ), + TP_printk("spa %p offset %llu size %llu txg %llu", + __entry->vdev_spa, __entry->vdev_offset, + __entry->vdev_size, __entry->vdev_txg) +); + +/* BEGIN CSTYLED */ +#define DEFINE_REMOVE_FREE_EVENT_TXG(name) \ +DEFINE_EVENT(zfs_removing_class_4, name, \ + TP_PROTO(spa_t *spa, uint64_t offset, uint64_t size,uint64_t txg), \ + TP_ARGS(spa, offset, size, txg)) +/* END CSTYLED */ +DEFINE_REMOVE_FREE_EVENT_TXG(zfs_remove__free__inflight); + +#endif /* _TRACE_VDEV_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_vdev +#include <trace/define_trace.h> + +#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 022713096..511d4d0b6 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 
*/ #ifndef _SYS_VDEV_H @@ -55,7 +55,7 @@ extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); extern void vdev_reopen(vdev_t *); extern int vdev_validate_aux(vdev_t *vd); extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); - +extern boolean_t vdev_is_concrete(vdev_t *vd); extern boolean_t vdev_is_bootable(vdev_t *vd); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); @@ -75,6 +75,11 @@ extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx); extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx); extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx); +extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx); +extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, + uint64_t size, uint64_t txg); +extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev, + uint64_t offset, uint64_t size, dmu_tx_t *tx); extern void vdev_hold(vdev_t *); extern void vdev_rele(vdev_t *); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 2ad3510ea..b933f9ab8 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -27,6 +27,7 @@ #define _SYS_VDEV_IMPL_H #include <sys/avl.h> +#include <sys/bpobj.h> #include <sys/dmu.h> #include <sys/metaslab.h> #include <sys/nvpair.h> @@ -34,6 +35,9 @@ #include <sys/vdev.h> #include <sys/dkio.h> #include <sys/uberblock_impl.h> +#include <sys/vdev_indirect_mapping.h> +#include <sys/vdev_indirect_births.h> +#include <sys/vdev_removal.h> #include <sys/zfs_ratelimit.h> #ifdef __cplusplus @@ -72,6 +76,11 @@ typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); +typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, + uint64_t offset, uint64_t size, void *arg); +typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, + vdev_remap_cb_t callback, void 
*arg); + typedef const struct vdev_ops { vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; @@ -82,6 +91,7 @@ typedef const struct vdev_ops { vdev_need_resilver_func_t *vdev_op_need_resilver; vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; + vdev_remap_func_t *vdev_op_remap; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -130,6 +140,45 @@ struct vdev_queue { }; /* + * On-disk indirect vdev state. + * + * An indirect vdev is described exclusively in the MOS config of a pool. + * The config for an indirect vdev includes several fields, which are + * accessed in memory by a vdev_indirect_config_t. + */ +typedef struct vdev_indirect_config { + /* + * Object (in MOS) which contains the indirect mapping. This object + * contains an array of vdev_indirect_mapping_entry_phys_t ordered by + * vimep_src. The bonus buffer for this object is a + * vdev_indirect_mapping_phys_t. This object is allocated when a vdev + * removal is initiated. + * + * Note that this object can be empty if none of the data on the vdev + * has been copied yet. + */ + uint64_t vic_mapping_object; + + /* + * Object (in MOS) which contains the birth times for the mapping + * entries. This object contains an array of + * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus + * buffer for this object is a vdev_indirect_birth_phys_t. This object + * is allocated when a vdev removal is initiated. + * + * Note that this object can be empty if none of the vdev has yet been + * copied. + */ + uint64_t vic_births_object; + + /* + * This is the vdev ID which was removed previous to this vdev, or + * UINT64_MAX if there are no previously removed vdevs. + */ + uint64_t vic_prev_indirect_vdev; +} vdev_indirect_config_t; + +/* * Virtual device descriptor */ struct vdev { @@ -188,6 +237,40 @@ struct vdev { uint64_t vdev_top_zap; /* + * Values stored in the config for an indirect or removing vdev. 
+ */ + vdev_indirect_config_t vdev_indirect_config; + + /* + * The vdev_indirect_rwlock protects the vdev_indirect_mapping + * pointer from changing on indirect vdevs (when it is condensed). + * Note that removing (not yet indirect) vdevs have different + * access patterns (the mapping is not accessed from open context, + * e.g. from zio_read) and locking strategy (e.g. svr_lock). + */ + krwlock_t vdev_indirect_rwlock; + vdev_indirect_mapping_t *vdev_indirect_mapping; + vdev_indirect_births_t *vdev_indirect_births; + + /* + * In memory data structures used to manage the obsolete sm, for + * indirect or removing vdevs. + * + * The vdev_obsolete_segments is the in-core record of the segments + * that are no longer referenced anywhere in the pool (due to + * being freed or remapped and not referenced by any snapshots). + * During a sync, segments are added to vdev_obsolete_segments + * via vdev_indirect_mark_obsolete(); at the end of each sync + * pass, this is appended to vdev_obsolete_sm via + * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock + * protects against concurrent modifications of vdev_obsolete_segments + * from multiple zio threads. + */ + kmutex_t vdev_obsolete_lock; + range_tree_t *vdev_obsolete_segments; + space_map_t *vdev_obsolete_sm; + + /* * The queue depth parameters determine how many async writes are * still pending (i.e. 
allocated by net yet issued to disk) per * top-level (vdev_async_write_queue_depth) and the maximum allowed @@ -356,7 +439,7 @@ extern void vdev_remove_parent(vdev_t *cvd); */ extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd); extern boolean_t vdev_log_state_valid(vdev_t *vd); -extern void vdev_load(vdev_t *vd); +extern int vdev_load(vdev_t *vd); extern int vdev_dtl_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); @@ -375,6 +458,7 @@ extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; +extern vdev_ops_t vdev_indirect_ops; /* * Common size functions @@ -389,6 +473,15 @@ extern void vdev_set_min_asize(vdev_t *vd); /* zdb uses this tunable, so it must be declared here to make lint happy. */ extern int zfs_vdev_cache_size; +/* + * Functions from vdev_indirect.c + */ +extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx); +extern boolean_t vdev_indirect_should_condense(vdev_t *vd); +extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx); +extern int vdev_obsolete_sm_object(vdev_t *vd); +extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd); + #ifdef __cplusplus } #endif diff --git a/include/sys/vdev_indirect_births.h b/include/sys/vdev_indirect_births.h new file mode 100644 index 000000000..987b14485 --- /dev/null +++ b/include/sys/vdev_indirect_births.h @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Delphix. 
All rights reserved. + */ + +#ifndef _SYS_VDEV_INDIRECT_BIRTHS_H +#define _SYS_VDEV_INDIRECT_BIRTHS_H + +#include <sys/dmu.h> +#include <sys/spa.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_indirect_birth_entry_phys { + uint64_t vibe_offset; + uint64_t vibe_phys_birth_txg; +} vdev_indirect_birth_entry_phys_t; + +typedef struct vdev_indirect_birth_phys { + uint64_t vib_count; /* count of v_i_b_entry_phys_t's */ +} vdev_indirect_birth_phys_t; + +typedef struct vdev_indirect_births { + uint64_t vib_object; + + /* + * Each entry indicates that everything up to but not including + * vibe_offset was copied in vibe_phys_birth_txg. Entries are sorted + * by increasing phys_birth, and also by increasing offset. See + * vdev_indirect_births_physbirth for usage. + */ + vdev_indirect_birth_entry_phys_t *vib_entries; + + objset_t *vib_objset; + + dmu_buf_t *vib_dbuf; + vdev_indirect_birth_phys_t *vib_phys; +} vdev_indirect_births_t; + +extern vdev_indirect_births_t *vdev_indirect_births_open(objset_t *os, + uint64_t object); +extern void vdev_indirect_births_close(vdev_indirect_births_t *vib); +extern boolean_t vdev_indirect_births_is_open(vdev_indirect_births_t *vib); +extern uint64_t vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx); +extern void vdev_indirect_births_free(objset_t *os, uint64_t object, + dmu_tx_t *tx); + +extern uint64_t vdev_indirect_births_count(vdev_indirect_births_t *vib); +extern uint64_t vdev_indirect_births_object(vdev_indirect_births_t *vib); + +extern void vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, + uint64_t offset, uint64_t txg, dmu_tx_t *tx); + +extern uint64_t vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, + uint64_t offset, uint64_t asize); + +extern uint64_t vdev_indirect_births_last_entry_txg( + vdev_indirect_births_t *vib); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_INDIRECT_BIRTHS_H */ diff --git a/include/sys/vdev_indirect_mapping.h 
b/include/sys/vdev_indirect_mapping.h new file mode 100644 index 000000000..7e42c1019 --- /dev/null +++ b/include/sys/vdev_indirect_mapping.h @@ -0,0 +1,141 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_INDIRECT_MAPPING_H +#define _SYS_VDEV_INDIRECT_MAPPING_H + +#include <sys/dmu.h> +#include <sys/list.h> +#include <sys/spa.h> +#include <sys/space_map.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_indirect_mapping_entry_phys { + /* + * Decode with DVA_MAPPING_* macros. + * Contains: + * the source offset (low 63 bits) + * the one-bit "mark", used for garbage collection (by zdb) + */ + uint64_t vimep_src; + + /* + * Note: the DVA's asize is 24 bits, and can thus store ranges + * up to 8GB. + */ + dva_t vimep_dst; +} vdev_indirect_mapping_entry_phys_t; + +#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \ + BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0) +#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \ + BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x) + +typedef struct vdev_indirect_mapping_entry { + vdev_indirect_mapping_entry_phys_t vime_mapping; + uint32_t vime_obsolete_count; + list_node_t vime_node; +} vdev_indirect_mapping_entry_t; + +/* + * This is stored in the bonus buffer of the mapping object, see comment of + * vdev_indirect_config for more details. 
+ */ +typedef struct vdev_indirect_mapping_phys { + uint64_t vimp_max_offset; + uint64_t vimp_bytes_mapped; + uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */ + + /* + * For each entry in the mapping object, this object contains an + * entry representing the number of bytes of that mapping entry + * that were no longer in use by the pool at the time this indirect + * vdev was last condensed. + */ + uint64_t vimp_counts_object; +} vdev_indirect_mapping_phys_t; + +#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t)) + +typedef struct vdev_indirect_mapping { + uint64_t vim_object; + boolean_t vim_havecounts; + + /* + * An ordered array of all mapping entries, sorted by source offset. + * Note that vim_entries is needed during a removal (and contains + * mappings that have been synced to disk so far) to handle frees + * from the removing device. + */ + vdev_indirect_mapping_entry_phys_t *vim_entries; + + objset_t *vim_objset; + + dmu_buf_t *vim_dbuf; + vdev_indirect_mapping_phys_t *vim_phys; +} vdev_indirect_mapping_t; + +extern vdev_indirect_mapping_t *vdev_indirect_mapping_open(objset_t *os, + uint64_t object); +extern void vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx); +extern void vdev_indirect_mapping_free(objset_t *os, uint64_t obj, + dmu_tx_t *tx); + +extern uint64_t vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_bytes_mapped( + vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim); + +/* + * Writes the given list of vdev_indirect_mapping_entry_t to the mapping + * then updates internal state. 
+ */ +extern void vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, + list_t *vime_list, dmu_tx_t *tx); + +extern vdev_indirect_mapping_entry_phys_t * + vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, + uint64_t offset); + +extern vdev_indirect_mapping_entry_phys_t * + vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, + uint64_t offset); + +extern uint32_t *vdev_indirect_mapping_load_obsolete_counts( + vdev_indirect_mapping_t *vim); +extern void vdev_indirect_mapping_load_obsolete_spacemap( + vdev_indirect_mapping_t *vim, + uint32_t *counts, space_map_t *obsolete_space_sm); +extern void vdev_indirect_mapping_increment_obsolete_count( + vdev_indirect_mapping_t *vim, + uint64_t offset, uint64_t asize, uint32_t *counts); +extern void vdev_indirect_mapping_free_obsolete_counts( + vdev_indirect_mapping_t *vim, uint32_t *counts); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_INDIRECT_MAPPING_H */ diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h new file mode 100644 index 000000000..5b1e3056b --- /dev/null +++ b/include/sys/vdev_removal.h @@ -0,0 +1,93 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2014, 2015 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_VDEV_REMOVAL_H +#define _SYS_VDEV_REMOVAL_H + +#include <sys/spa.h> +#include <sys/bpobj.h> +#include <sys/vdev_indirect_mapping.h> +#include <sys/vdev_indirect_births.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct spa_vdev_removal { + vdev_t *svr_vdev; + uint64_t svr_max_offset_to_sync[TXG_SIZE]; + /* Thread performing a vdev removal. */ + kthread_t *svr_thread; + /* Segments left to copy from the current metaslab. */ + range_tree_t *svr_allocd_segs; + kmutex_t svr_lock; + kcondvar_t svr_cv; + boolean_t svr_thread_exit; + + /* + * New mappings to write out each txg. + */ + list_t svr_new_segments[TXG_SIZE]; + + /* + * Ranges that were freed while a mapping was in flight. This is + * a subset of the ranges covered by svr_new_segments. + */ + range_tree_t *svr_frees[TXG_SIZE]; + + /* + * Number of bytes which we have finished our work for + * in each txg. This could be data copied (which will be part of + * the mappings in svr_new_segments), or data freed before + * we got around to copying it. + */ + uint64_t svr_bytes_done[TXG_SIZE]; + + /* List of leaf zap objects to be unlinked */ + nvlist_t *svr_zaplist; +} spa_vdev_removal_t; + +typedef struct spa_condensing_indirect { + /* + * New mappings to write out each txg. 
+ */ + list_t sci_new_mapping_entries[TXG_SIZE]; + + vdev_indirect_mapping_t *sci_new_mapping; +} spa_condensing_indirect_t; + +extern int spa_remove_init(spa_t *); +extern void spa_restart_removal(spa_t *); +extern int spa_condense_init(spa_t *); +extern void spa_condense_fini(spa_t *); +extern void spa_condense_indirect_restart(spa_t *); +extern void spa_vdev_condense_suspend(spa_t *); +extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t); +extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t); +extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *); +extern void svr_sync(spa_t *spa, dmu_tx_t *tx); +extern void spa_vdev_remove_suspend(spa_t *); +extern int spa_vdev_remove_cancel(spa_t *); +extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_REMOVAL_H */ diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index 7e05c2a35..91b161ff8 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 
*/ #ifndef _SYS_ZFS_DEBUG_H @@ -53,6 +53,7 @@ extern int zfs_dbgmsg_enable; #define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7) #define ZFS_DEBUG_METASLAB_VERIFY (1 << 8) #define ZFS_DEBUG_SET_ERROR (1 << 9) +#define ZFS_DEBUG_INDIRECT_REMAP (1 << 10) extern void __dprintf(const char *file, const char *func, int line, const char *fmt, ...); diff --git a/include/sys/zil.h b/include/sys/zil.h index fc0cc5065..fb7b38a06 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -496,7 +496,7 @@ extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); extern void zil_commit(zilog_t *zilog, uint64_t oid); extern void zil_commit_impl(zilog_t *zilog, uint64_t oid); -extern int zil_vdev_offline(const char *osname, void *txarg); +extern int zil_reset(const char *osname, void *txarg); extern int zil_claim(struct dsl_pool *dp, struct dsl_dataset *ds, void *txarg); extern int zil_check_log_chain(struct dsl_pool *dp, diff --git a/include/sys/zio.h b/include/sys/zio.h index be8e18b4b..a275b16de 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -244,7 +244,7 @@ enum zio_flag { #define ZIO_VDEV_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ - ZIO_FLAG_CANFAIL) + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL) #define ZIO_CHILD_BIT(x) (1 << (x)) #define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x))) @@ -278,6 +278,9 @@ enum zio_wait_type { #define ECKSUM EBADE #define EFRAGS EBADR +/* Similar for ENOTACTIVE */ +#define ENOTACTIVE ENOANO + typedef void zio_done_func_t(zio_t *zio); extern int zio_dva_throttle_enabled; diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h index 3fc3589be..c2cc8b2d5 100644 --- a/include/sys/zio_priority.h +++ b/include/sys/zio_priority.h @@ -28,6 +28,7 @@ typedef enum zio_priority { ZIO_PRIORITY_ASYNC_READ, /* prefetch */ ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ + ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ ZIO_PRIORITY_NUM_QUEUEABLE, 
ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */ } zio_priority_t; |