summaryrefslogtreecommitdiffstats
path: root/include/sys
diff options
context:
space:
mode:
Diffstat (limited to 'include/sys')
-rw-r--r--include/sys/Makefile.am4
-rw-r--r--include/sys/bpobj.h4
-rw-r--r--include/sys/dbuf.h2
-rw-r--r--include/sys/dmu.h11
-rw-r--r--include/sys/dnode.h1
-rw-r--r--include/sys/dsl_dataset.h30
-rw-r--r--include/sys/dsl_deadlist.h2
-rw-r--r--include/sys/dsl_deleg.h3
-rw-r--r--include/sys/dsl_dir.h5
-rw-r--r--include/sys/dsl_pool.h5
-rw-r--r--include/sys/dsl_scan.h3
-rw-r--r--include/sys/fs/zfs.h32
-rw-r--r--include/sys/metaslab.h7
-rw-r--r--include/sys/metaslab_impl.h16
-rw-r--r--include/sys/range_tree.h13
-rw-r--r--include/sys/spa.h15
-rw-r--r--include/sys/spa_impl.h74
-rw-r--r--include/sys/space_map.h13
-rw-r--r--include/sys/trace_vdev.h119
-rw-r--r--include/sys/vdev.h9
-rw-r--r--include/sys/vdev_impl.h95
-rw-r--r--include/sys/vdev_indirect_births.h80
-rw-r--r--include/sys/vdev_indirect_mapping.h141
-rw-r--r--include/sys/vdev_removal.h93
-rw-r--r--include/sys/zfs_debug.h3
-rw-r--r--include/sys/zil.h2
-rw-r--r--include/sys/zio.h5
-rw-r--r--include/sys/zio_priority.h1
28 files changed, 753 insertions, 35 deletions
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index 8e18a8790..f30c9427e 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -71,6 +71,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/trace_dnode.h \
$(top_srcdir)/include/sys/trace_multilist.h \
$(top_srcdir)/include/sys/trace_txg.h \
+ $(top_srcdir)/include/sys/trace_vdev.h \
$(top_srcdir)/include/sys/trace_zil.h \
$(top_srcdir)/include/sys/trace_zio.h \
$(top_srcdir)/include/sys/trace_zrlock.h \
@@ -87,8 +88,11 @@ COMMON_H = \
$(top_srcdir)/include/sys/vdev_file.h \
$(top_srcdir)/include/sys/vdev.h \
$(top_srcdir)/include/sys/vdev_impl.h \
+ $(top_srcdir)/include/sys/vdev_indirect_births.h \
+ $(top_srcdir)/include/sys/vdev_indirect_mapping.h \
$(top_srcdir)/include/sys/vdev_raidz.h \
$(top_srcdir)/include/sys/vdev_raidz_impl.h \
+ $(top_srcdir)/include/sys/vdev_removal.h \
$(top_srcdir)/include/sys/xvattr.h \
$(top_srcdir)/include/sys/zap.h \
$(top_srcdir)/include/sys/zap_impl.h \
diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h
index 2a365199c..d425e239f 100644
--- a/include/sys/bpobj.h
+++ b/include/sys/bpobj.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_BPOBJ_H
@@ -74,6 +74,7 @@ void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx);
int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
void bpobj_close(bpobj_t *bpo);
+boolean_t bpobj_is_open(const bpobj_t *bpo);
int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
@@ -85,6 +86,7 @@ int bpobj_space(bpobj_t *bpo,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+boolean_t bpobj_is_empty(bpobj_t *bpo);
#ifdef __cplusplus
}
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 633dfd25a..32f036862 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -318,6 +318,8 @@ void dbuf_unoverride(dbuf_dirty_record_t *dr);
void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
void dbuf_release_bp(dmu_buf_impl_t *db);
+boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf);
+
void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
struct dmu_tx *);
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 027d3d9fc..ccc07006d 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -326,6 +326,7 @@ int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
void dmu_objset_byteswap(void *buf, size_t size);
int dsl_dataset_rename_snapshot(const char *fsname,
const char *oldsnapname, const char *newsnapname, boolean_t recursive);
+int dmu_objset_remap_indirects(const char *fsname);
typedef struct dmu_buf {
uint64_t db_object; /* object that this buffer is part of */
@@ -362,6 +363,9 @@ typedef struct dmu_buf {
#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"
#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt"
#define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map"
+#define DMU_POOL_REMOVING "com.delphix:removing"
+#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj"
+#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
/*
* Allocate an object from this objset. The range of object numbers
@@ -470,6 +474,8 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
int dmu_object_dirty_raw(objset_t *os, uint64_t object, dmu_tx_t *tx);
+int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg);
+
void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
int compressed_size, int byteorder, dmu_tx_t *tx);
@@ -488,8 +494,8 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
* The bonus data is accessed more or less like a regular buffer.
* You must dmu_bonus_hold() to get the buffer, which will give you a
* dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
- * data. As with any normal buffer, you must call dmu_buf_read() to
- * read db_data, dmu_buf_will_dirty() before modifying it, and the
+ * data. As with any normal buffer, you must call dmu_buf_will_dirty()
+ * before modifying it, and the
* object must be held in an assigned transaction before calling
* dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
* buffer as well. You must release what you hold with dmu_buf_rele().
@@ -740,6 +746,7 @@ void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
uint64_t len);
void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
uint64_t len);
+void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object);
void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add,
const char *name);
diff --git a/include/sys/dnode.h b/include/sys/dnode.h
index 2f70d5446..9c1df45e9 100644
--- a/include/sys/dnode.h
+++ b/include/sys/dnode.h
@@ -432,6 +432,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
void dnode_evict_dbufs(dnode_t *dn);
void dnode_evict_bonus(dnode_t *dn);
void dnode_free_interior_slots(dnode_t *dn);
+boolean_t dnode_needs_remap(const dnode_t *dn);
#define DNODE_IS_DIRTY(_dn) \
((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa))
diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h
index 0030fca24..abd178e30 100644
--- a/include/sys/dsl_dataset.h
+++ b/include/sys/dsl_dataset.h
@@ -109,6 +109,11 @@ struct dsl_crypto_params;
#define DS_FIELD_RESUME_RAWOK "com.datto:resume_rawok"
/*
+ * This field is set to the object number of the remap deadlist if one exists.
+ */
+#define DS_FIELD_REMAP_DEADLIST "com.delphix:remap_deadlist"
+
+/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
* name lookups should be performed case-insensitively.
*/
@@ -169,6 +174,24 @@ typedef struct dsl_dataset {
dsl_deadlist_t ds_deadlist;
bplist_t ds_pending_deadlist;
+ /*
+ * The remap deadlist contains blocks (DVA's, really) that are
+ * referenced by the previous snapshot and point to indirect vdevs,
+ * but in this dataset they have been remapped to point to concrete
+ * (or at least, less-indirect) vdevs. In other words, the
+ * physical DVA is referenced by the previous snapshot but not by
+ * this dataset. Logically, the DVA continues to be referenced,
+ * but we are using a different (less indirect) physical DVA.
+ * This deadlist is used to determine when physical DVAs that
+ * point to indirect vdevs are no longer referenced anywhere,
+ * and thus should be marked obsolete.
+ *
+ * This is only used if SPA_FEATURE_OBSOLETE_COUNTS is enabled.
+ */
+ dsl_deadlist_t ds_remap_deadlist;
+ /* protects creation of the ds_remap_deadlist */
+ kmutex_t ds_remap_deadlist_lock;
+
/* protected by lock on pool's dp_dirty_datasets list */
txg_node_t ds_dirty_link;
list_node_t ds_synced_link;
@@ -328,6 +351,8 @@ void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
dmu_tx_t *tx);
int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
dmu_tx_t *tx, boolean_t async);
+void dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev,
+ uint64_t offset, uint64_t size, uint64_t birth, dmu_tx_t *tx);
int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name,
uint64_t *value);
@@ -416,6 +441,11 @@ void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx);
int dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
nvlist_t *result);
+uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds);
+void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx);
+boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds);
+void dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx);
+
void dsl_dataset_activate_feature(uint64_t dsobj,
spa_feature_t f, dmu_tx_t *tx);
void dsl_dataset_deactivate_feature(uint64_t dsobj,
diff --git a/include/sys/dsl_deadlist.h b/include/sys/dsl_deadlist.h
index d2c16d72c..08f38233d 100644
--- a/include/sys/dsl_deadlist.h
+++ b/include/sys/dsl_deadlist.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_DSL_DEADLIST_H
@@ -79,6 +80,7 @@ void dsl_deadlist_space_range(dsl_deadlist_t *dl,
void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
dmu_tx_t *tx);
+boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl);
#ifdef __cplusplus
}
diff --git a/include/sys/dsl_deleg.h b/include/sys/dsl_deleg.h
index eb95c68e8..bb28014ac 100644
--- a/include/sys/dsl_deleg.h
+++ b/include/sys/dsl_deleg.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_DSL_DELEG_H
@@ -61,6 +61,7 @@ extern "C" {
#define ZFS_DELEG_PERM_RELEASE "release"
#define ZFS_DELEG_PERM_DIFF "diff"
#define ZFS_DELEG_PERM_BOOKMARK "bookmark"
+#define ZFS_DELEG_PERM_REMAP "remap"
#define ZFS_DELEG_PERM_LOAD_KEY "load-key"
#define ZFS_DELEG_PERM_CHANGE_KEY "change-key"
#define ZFS_DELEG_PERM_PROJECTUSED "projectused"
diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h
index 8b1e343a0..58d243885 100644
--- a/include/sys/dsl_dir.h
+++ b/include/sys/dsl_dir.h
@@ -48,7 +48,9 @@ struct dsl_dataset;
#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count"
#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count"
+#define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg"
#define DD_FIELD_CRYPTO_KEY_OBJ "com.datto:crypto_key_obj"
+#define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg"
typedef enum dd_used {
DD_USED_HEAD,
@@ -152,6 +154,7 @@ void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv);
uint64_t dsl_dir_space_available(dsl_dir_t *dd,
dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
+int dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count);
void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx);
@@ -169,6 +172,7 @@ int dsl_dir_activate_fs_ss_limit(const char *);
int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *,
cred_t *);
void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *);
+int dsl_dir_update_last_remap_txg(dsl_dir_t *, uint64_t);
int dsl_dir_rename(const char *oldname, const char *newname);
int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *);
@@ -185,7 +189,6 @@ boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
/* internal reserved dir name */
#define MOS_DIR_NAME "$MOS"
#define ORIGIN_DIR_NAME "$ORIGIN"
-#define XLATION_DIR_NAME "$XLATION"
#define FREE_DIR_NAME "$FREE"
#define LEAK_DIR_NAME "$LEAK"
diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h
index 9ceb59d9b..c60e4bf9d 100644
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -102,6 +102,7 @@ typedef struct dsl_pool {
bpobj_t dp_free_bpobj;
uint64_t dp_bptree_obj;
uint64_t dp_empty_bpobj;
+ bpobj_t dp_obsolete_bpobj;
struct dsl_scan *dp_scan;
@@ -151,7 +152,6 @@ void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
int dsl_pool_sync_context(dsl_pool_t *dp);
uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
-uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
@@ -180,6 +180,9 @@ int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp);
void dsl_pool_rele(dsl_pool_t *dp, void *tag);
+void dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx);
+void dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h
index 7a29d9788..345d2754f 100644
--- a/include/sys/dsl_scan.h
+++ b/include/sys/dsl_scan.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2017 Datto Inc.
*/
@@ -117,6 +117,7 @@ typedef struct dsl_scan {
boolean_t scn_is_bptree;
boolean_t scn_async_destroying;
boolean_t scn_async_stalled;
+ uint64_t scn_async_block_min_time_ms;
/* flags and stats for controlling scan state */
boolean_t scn_is_sorted; /* doing sequential scan */
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index e40c427f6..de3b729eb 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -180,6 +180,7 @@ typedef enum {
ZFS_PROP_ENCRYPTION_ROOT,
ZFS_PROP_KEY_GUID,
ZFS_PROP_KEYSTATUS,
+ ZFS_PROP_REMAPTXG, /* not exposed to the user */
ZFS_NUM_PROPS
} zfs_prop_t;
@@ -587,7 +588,9 @@ typedef struct zpool_rewind_policy {
/*
* The following are configuration names used in the nvlist describing a pool's
- * configuration.
+ * configuration. New on-disk names should be prefixed with "<reverse-DNS>:"
+ * (e.g. "org.open-zfs:") to avoid conflicting names being developed
+ * independently.
*/
#define ZPOOL_CONFIG_VERSION "version"
#define ZPOOL_CONFIG_POOL_NAME "name"
@@ -601,6 +604,9 @@ typedef struct zpool_rewind_policy {
#define ZPOOL_CONFIG_CHILDREN "children"
#define ZPOOL_CONFIG_ID "id"
#define ZPOOL_CONFIG_GUID "guid"
+#define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object"
+#define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births"
+#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev"
#define ZPOOL_CONFIG_PATH "path"
#define ZPOOL_CONFIG_DEVID "devid"
#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array"
@@ -609,7 +615,9 @@ typedef struct zpool_rewind_policy {
#define ZPOOL_CONFIG_ASIZE "asize"
#define ZPOOL_CONFIG_DTL "DTL"
#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
+#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */
#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
+#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */
/* container nvlist of extended stats */
#define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex"
@@ -736,6 +744,13 @@ typedef struct zpool_rewind_policy {
#define VDEV_TYPE_SPARE "spare"
#define VDEV_TYPE_LOG "log"
#define VDEV_TYPE_L2CACHE "l2cache"
+#define VDEV_TYPE_INDIRECT "indirect"
+
+/* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */
+#define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \
+ "com.delphix:indirect_obsolete_sm"
+#define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \
+ "com.delphix:obsolete_counts_are_precise"
/*
* This is needed in userland to report the minimum necessary device size.
@@ -884,6 +899,20 @@ typedef struct pool_scan_stat {
uint64_t pss_issued; /* total bytes checked by scanner */
} pool_scan_stat_t;
+typedef struct pool_removal_stat {
+ uint64_t prs_state; /* dsl_scan_state_t */
+ uint64_t prs_removing_vdev;
+ uint64_t prs_start_time;
+ uint64_t prs_end_time;
+ uint64_t prs_to_copy; /* bytes that need to be copied */
+ uint64_t prs_copied; /* bytes copied so far */
+ /*
+ * bytes of memory used for indirect mappings.
+ * This includes all removed vdevs.
+ */
+ uint64_t prs_mapping_memory;
+} pool_removal_stat_t;
+
typedef enum dsl_scan_state {
DSS_NONE,
DSS_SCANNING,
@@ -1112,6 +1141,7 @@ typedef enum zfs_ioc {
ZFS_IOC_LOAD_KEY,
ZFS_IOC_UNLOAD_KEY,
ZFS_IOC_CHANGE_KEY,
+ ZFS_IOC_REMAP,
/*
* Linux - 3/64 numbers reserved.
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index be271c702..fdcf6c71b 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -67,8 +67,15 @@ uint64_t metaslab_block_maxsize(metaslab_t *);
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *);
+int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
+ dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
+void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, uint64_t);
+void metaslab_free_dva(spa_t *, const dva_t *, uint64_t);
+void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *);
+void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
+int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);
void metaslab_fastwrite_mark(spa_t *, const blkptr_t *);
void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *);
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index f8a713a4f..76f670a4d 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -257,14 +257,13 @@ struct metaslab_group {
* Each metaslab maintains a set of in-core trees to track metaslab
* operations. The in-core free tree (ms_tree) contains the list of
* free segments which are eligible for allocation. As blocks are
- * allocated, the allocated segments are removed from the ms_tree and
- * added to a per txg allocation tree (ms_alloctree). This allows us to
- * process all allocations in syncing context where it is safe to update
- * the on-disk space maps. Frees are also processed in syncing context.
- * Most frees are generated from syncing context, and those that are not
- * are held in the spa_free_bplist for processing in syncing context.
- * An additional set of in-core trees is maintained to track deferred
- * frees (ms_defertree). Once a block is freed it will move from the
+ * allocated, the allocated segment are removed from the ms_tree and
+ * added to a per txg allocation tree (ms_alloctree). As blocks are
+ * freed, they are added to the free tree (ms_freeingtree). These trees
+ * allow us to process all allocations and frees in syncing context
+ * where it is safe to update the on-disk space maps. An additional set
+ * of in-core trees is maintained to track deferred frees
+ * (ms_defertree). Once a block is freed it will move from the
* ms_freedtree to the ms_defertree. A deferred free means that a block
* has been freed but cannot be used by the pool until TXG_DEFER_SIZE
* transactions groups later. For example, a block that is freed in txg
@@ -310,6 +309,7 @@ struct metaslab_group {
*/
struct metaslab {
kmutex_t ms_lock;
+ kmutex_t ms_sync_lock;
kcondvar_t ms_load_cv;
space_map_t *ms_sm;
uint64_t ms_id;
diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h
index 1d3bdf9e5..970505628 100644
--- a/include/sys/range_tree.h
+++ b/include/sys/range_tree.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_RANGE_TREE_H
@@ -41,6 +41,10 @@ extern "C" {
typedef struct range_tree_ops range_tree_ops_t;
+/*
+ * Note: the range_tree may not be accessed concurrently; consumers
+ * must provide external locking if required.
+ */
typedef struct range_tree {
avl_tree_t rt_root; /* offset-ordered segment AVL tree */
uint64_t rt_space; /* sum of all segments in the map */
@@ -58,7 +62,6 @@ typedef struct range_tree {
* 2^i <= size of range in bytes < 2^(i+1)
*/
uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE];
- kmutex_t *rt_lock; /* pointer to lock that protects map */
} range_tree_t;
typedef struct range_seg {
@@ -82,9 +85,8 @@ typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);
void range_tree_init(void);
void range_tree_fini(void);
range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
- int (*avl_compare) (const void *, const void *), kmutex_t *lp,
- uint64_t gap);
-range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp);
+ int (*avl_compare) (const void *, const void *), uint64_t gap);
+range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
@@ -94,7 +96,6 @@ uint64_t range_tree_space(range_tree_t *rt);
void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
void range_tree_stat_verify(range_tree_t *rt);
-void range_tree_set_lock(range_tree_t *rt, kmutex_t *lp);
void range_tree_add(void *arg, uint64_t start, uint64_t size);
void range_tree_remove(void *arg, uint64_t start, uint64_t size);
diff --git a/include/sys/spa.h b/include/sys/spa.h
index f93354c78..eb47c21f6 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -227,7 +227,10 @@ typedef struct zio_cksum_salt {
* E blkptr_t contains embedded data (see below)
* lvl level of indirection
* type DMU object type
- * phys birth txg of block allocation; zero if same as logical birth txg
+ * phys birth txg when dva[0] was written; zero if same as logical birth txg
+ * note that typically all the dva's would be written in this
+ * txg, but they could be different if they were moved by
+ * device removal.
* log. birth transaction group in which the block was logically born
* fill count number of non-zero blocks under this bp
* checksum[4] 256-bit checksum of the data this bp describes
@@ -817,7 +820,7 @@ extern kmutex_t spa_namespace_lock;
#define SPA_CONFIG_UPDATE_POOL 0
#define SPA_CONFIG_UPDATE_VDEVS 1
-extern void spa_config_sync(spa_t *, boolean_t, boolean_t);
+extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t);
extern void spa_config_load(void);
extern nvlist_t *spa_all_configs(uint64_t *);
extern void spa_config_set(spa_t *spa, nvlist_t *config);
@@ -932,7 +935,7 @@ typedef enum spa_log_state {
extern spa_log_state_t spa_get_log_state(spa_t *spa);
extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
-extern int spa_offline_log(spa_t *spa);
+extern int spa_reset_logs(spa_t *spa);
/* Log claim callback */
extern void spa_claim_notify(zio_t *zio);
@@ -942,6 +945,7 @@ extern void spa_deadman(void *);
extern boolean_t spa_shutting_down(spa_t *spa);
extern struct dsl_pool *spa_get_dsl(spa_t *spa);
extern boolean_t spa_is_initializing(spa_t *spa);
+extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa);
extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
extern void spa_altroot(spa_t *, char *, size_t);
@@ -1009,6 +1013,11 @@ extern boolean_t spa_has_pending_synctask(spa_t *spa);
extern int spa_maxblocksize(spa_t *spa);
extern int spa_maxdnodesize(spa_t *spa);
extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
+typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size,
+ void *arg);
+extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
+ spa_remap_cb_t callback, void *arg);
+extern uint64_t spa_get_last_removal_txg(spa_t *spa);
extern boolean_t spa_multihost(spa_t *spa);
extern unsigned long spa_get_hostid(void);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 77625d4b0..1741eb9e5 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
@@ -33,6 +33,7 @@
#include <sys/spa.h>
#include <sys/vdev.h>
+#include <sys/vdev_removal.h>
#include <sys/metaslab.h>
#include <sys/dmu.h>
#include <sys/dsl_pool.h>
@@ -64,6 +65,62 @@ typedef struct spa_history_phys {
uint64_t sh_records_lost; /* num of records overwritten */
} spa_history_phys_t;
+/*
+ * All members must be uint64_t, for byteswap purposes.
+ */
+typedef struct spa_removing_phys {
+ uint64_t sr_state; /* dsl_scan_state_t */
+
+ /*
+ * The vdev ID that we most recently attempted to remove,
+ * or -1 if no removal has been attempted.
+ */
+ uint64_t sr_removing_vdev;
+
+ /*
+ * The vdev ID that we most recently successfully removed,
+ * or -1 if no devices have been removed.
+ */
+ uint64_t sr_prev_indirect_vdev;
+
+ uint64_t sr_start_time;
+ uint64_t sr_end_time;
+
+ /*
+ * Note that we can not use the space map's or indirect mapping's
+ * accounting as a substitute for these values, because we need to
+ * count frees of not-yet-copied data as though it did the copy.
+ * Otherwise, we could get into a situation where copied > to_copy,
+ * or we complete before copied == to_copy.
+ */
+ uint64_t sr_to_copy; /* bytes that need to be copied */
+ uint64_t sr_copied; /* bytes that have been copied or freed */
+} spa_removing_phys_t;
+
+/*
+ * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT
+ * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense
+ * of an indirect vdev's mapping object is in progress.
+ */
+typedef struct spa_condensing_indirect_phys {
+ /*
+ * The vdev ID of the indirect vdev whose indirect mapping is
+ * being condensed.
+ */
+ uint64_t scip_vdev;
+
+ /*
+ * The vdev's old obsolete spacemap. This spacemap's contents are
+ * being integrated into the new mapping.
+ */
+ uint64_t scip_prev_obsolete_sm_object;
+
+ /*
+ * The new mapping object that is being created.
+ */
+ uint64_t scip_next_mapping_object;
+} spa_condensing_indirect_phys_t;
+
struct spa_aux_vdev {
uint64_t sav_object; /* MOS object for device list */
nvlist_t *sav_config; /* cached device config */
@@ -143,6 +200,7 @@ struct spa {
int spa_inject_ref; /* injection references */
uint8_t spa_sync_on; /* sync threads are running */
spa_load_state_t spa_load_state; /* current load operation */
+ boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */
uint64_t spa_import_flags; /* import specific flags */
spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
dsl_pool_t *spa_dsl_pool;
@@ -204,6 +262,14 @@ struct spa {
int spa_async_suspended; /* async tasks suspended */
kcondvar_t spa_async_cv; /* wait for thread_exit() */
uint16_t spa_async_tasks; /* async task mask */
+
+ spa_removing_phys_t spa_removing_phys;
+ spa_vdev_removal_t *spa_vdev_removal;
+
+ spa_condensing_indirect_phys_t spa_condensing_indirect_phys;
+ spa_condensing_indirect_t *spa_condensing_indirect;
+ kthread_t *spa_condense_thread; /* thread doing condense. */
+
char *spa_root; /* alternate root directory */
uint64_t spa_ena; /* spa-wide ereport ENA */
int spa_last_open_failed; /* error if last open failed */
@@ -234,6 +300,7 @@ struct spa {
/* per-CPU array of root of async I/O: */
zio_t **spa_async_zio_root;
zio_t *spa_suspend_zio_root; /* root of all suspended I/O */
+ zio_t *spa_txg_zio[TXG_SIZE]; /* spa_sync() waits for this */
kmutex_t spa_suspend_lock; /* protects suspend_zio_root */
kcondvar_t spa_suspend_cv; /* notification of resume */
zio_suspend_reason_t spa_suspended; /* pool is suspended */
@@ -302,6 +369,11 @@ extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, void *arg, uint_t flags);
+extern void spa_load_spares(spa_t *spa);
+extern void spa_load_l2cache(spa_t *spa);
+extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl,
+ const char *name);
+extern void spa_event_post(sysevent_t *ev);
#ifdef __cplusplus
diff --git a/include/sys/space_map.h b/include/sys/space_map.h
index a59e6d37d..457300d05 100644
--- a/include/sys/space_map.h
+++ b/include/sys/space_map.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPACE_MAP_H
@@ -73,6 +73,9 @@ typedef struct space_map_phys {
* The space map object defines a region of space, its size, how much is
* allocated, and the on-disk object that stores this information.
* Consumers of space maps may only access the members of this structure.
+ *
+ * Note: the space_map may not be accessed concurrently; consumers
+ * must provide external locking if required.
*/
typedef struct space_map {
uint64_t sm_start; /* start of map */
@@ -85,7 +88,6 @@ typedef struct space_map {
uint32_t sm_blksz; /* block size for space map */
dmu_buf_t *sm_dbuf; /* space_map_phys_t dbuf */
space_map_phys_t *sm_phys; /* on-disk space map */
- kmutex_t *sm_lock; /* pointer to lock that protects map */
} space_map_t;
/*
@@ -133,7 +135,11 @@ typedef enum {
SM_FREE
} maptype_t;
+typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size,
+ void *arg);
+
int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
+int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg);
void space_map_histogram_clear(space_map_t *sm);
void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
@@ -150,9 +156,10 @@ void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
void space_map_truncate(space_map_t *sm, dmu_tx_t *tx);
uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx);
void space_map_free(space_map_t *sm, dmu_tx_t *tx);
+void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx);
int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
- uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp);
+ uint64_t start, uint64_t size, uint8_t shift);
void space_map_close(space_map_t *sm);
int64_t space_map_alloc_delta(space_map_t *sm);
diff --git a/include/sys/trace_vdev.h b/include/sys/trace_vdev.h
new file mode 100644
index 000000000..d7af44c25
--- /dev/null
+++ b/include/sys/trace_vdev.h
@@ -0,0 +1,119 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS)
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM zfs
+
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR zfs_vdev
+
+#if !defined(_TRACE_VDEV_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VDEV_H
+
+#include <linux/tracepoint.h>
+#include <sys/types.h>
+
+/*
+ * Generic support for three argument tracepoints of the form:
+ *
+ * DTRACE_PROBE3(...,
+ * spa_t *, ...,
+ * uint64_t, ...,
+ * uint64_t, ...);
+ */
+/* BEGIN CSTYLED */
+DECLARE_EVENT_CLASS(zfs_removing_class_3,
+ TP_PROTO(spa_t *spa, uint64_t offset, uint64_t size),
+ TP_ARGS(spa, offset, size),
+ TP_STRUCT__entry(
+ __field(spa_t *, vdev_spa)
+ __field(uint64_t, vdev_offset)
+ __field(uint64_t, vdev_size)
+ ),
+ TP_fast_assign(
+ __entry->vdev_spa = spa;
+ __entry->vdev_offset = offset;
+ __entry->vdev_size = size;
+ ),
+ TP_printk("spa %p offset %llu size %llu",
+ __entry->vdev_spa, __entry->vdev_offset,
+ __entry->vdev_size)
+);
+/* END CSTYLED */
+
+/* BEGIN CSTYLED */
+#define DEFINE_REMOVE_FREE_EVENT(name) \
+DEFINE_EVENT(zfs_removing_class_3, name, \
+ TP_PROTO(spa_t *spa, uint64_t offset, uint64_t size), \
+ TP_ARGS(spa, offset, size))
+/* END CSTYLED */
+DEFINE_REMOVE_FREE_EVENT(zfs_remove__free__synced);
+DEFINE_REMOVE_FREE_EVENT(zfs_remove__free__unvisited);
+
+/*
+ * Generic support for four argument tracepoints of the form:
+ *
+ * DTRACE_PROBE4(...,
+ * spa_t *, ...,
+ * uint64_t, ...,
+ * uint64_t, ...,
+ * uint64_t, ...);
+ */
+/* BEGIN CSTYLED */
+DECLARE_EVENT_CLASS(zfs_removing_class_4,
+ TP_PROTO(spa_t *spa, uint64_t offset, uint64_t size, uint64_t txg),
+ TP_ARGS(spa, offset, size, txg),
+ TP_STRUCT__entry(
+ __field(spa_t *, vdev_spa)
+ __field(uint64_t, vdev_offset)
+ __field(uint64_t, vdev_size)
+ __field(uint64_t, vdev_txg)
+ ),
+ TP_fast_assign(
+ __entry->vdev_spa = spa;
+ __entry->vdev_offset = offset;
+ __entry->vdev_size = size;
+ __entry->vdev_txg = txg;
+ ),
+ TP_printk("spa %p offset %llu size %llu txg %llu",
+ __entry->vdev_spa, __entry->vdev_offset,
+ __entry->vdev_size, __entry->vdev_txg)
+);
+
+/* BEGIN CSTYLED */
+#define DEFINE_REMOVE_FREE_EVENT_TXG(name) \
+DEFINE_EVENT(zfs_removing_class_4, name, \
+ TP_PROTO(spa_t *spa, uint64_t offset, uint64_t size,uint64_t txg), \
+ TP_ARGS(spa, offset, size, txg))
+/* END CSTYLED */
+DEFINE_REMOVE_FREE_EVENT_TXG(zfs_remove__free__inflight);
+
+#endif /* _TRACE_VDEV_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH sys
+#define TRACE_INCLUDE_FILE trace_vdev
+#include <trace/define_trace.h>
+
+#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 022713096..511d4d0b6 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_H
@@ -55,7 +55,7 @@ extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
extern void vdev_reopen(vdev_t *);
extern int vdev_validate_aux(vdev_t *vd);
extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
-
+extern boolean_t vdev_is_concrete(vdev_t *vd);
extern boolean_t vdev_is_bootable(vdev_t *vd);
extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
@@ -75,6 +75,11 @@ extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj,
dmu_tx_t *tx);
extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
+extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx);
+extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset,
+ uint64_t size, uint64_t txg);
+extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev,
+ uint64_t offset, uint64_t size, dmu_tx_t *tx);
extern void vdev_hold(vdev_t *);
extern void vdev_rele(vdev_t *);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 2ad3510ea..b933f9ab8 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -27,6 +27,7 @@
#define _SYS_VDEV_IMPL_H
#include <sys/avl.h>
+#include <sys/bpobj.h>
#include <sys/dmu.h>
#include <sys/metaslab.h>
#include <sys/nvpair.h>
@@ -34,6 +35,9 @@
#include <sys/vdev.h>
#include <sys/dkio.h>
#include <sys/uberblock_impl.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_removal.h>
#include <sys/zfs_ratelimit.h>
#ifdef __cplusplus
@@ -72,6 +76,11 @@ typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t);
typedef void vdev_hold_func_t(vdev_t *vd);
typedef void vdev_rele_func_t(vdev_t *vd);
+typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd,
+ uint64_t offset, uint64_t size, void *arg);
+typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
+ vdev_remap_cb_t callback, void *arg);
+
typedef const struct vdev_ops {
vdev_open_func_t *vdev_op_open;
vdev_close_func_t *vdev_op_close;
@@ -82,6 +91,7 @@ typedef const struct vdev_ops {
vdev_need_resilver_func_t *vdev_op_need_resilver;
vdev_hold_func_t *vdev_op_hold;
vdev_rele_func_t *vdev_op_rele;
+ vdev_remap_func_t *vdev_op_remap;
char vdev_op_type[16];
boolean_t vdev_op_leaf;
} vdev_ops_t;
@@ -130,6 +140,45 @@ struct vdev_queue {
};
/*
+ * On-disk indirect vdev state.
+ *
+ * An indirect vdev is described exclusively in the MOS config of a pool.
+ * The config for an indirect vdev includes several fields, which are
+ * accessed in memory by a vdev_indirect_config_t.
+ */
+typedef struct vdev_indirect_config {
+ /*
+ * Object (in MOS) which contains the indirect mapping. This object
+ * contains an array of vdev_indirect_mapping_entry_phys_t ordered by
+ * vimep_src. The bonus buffer for this object is a
+ * vdev_indirect_mapping_phys_t. This object is allocated when a vdev
+ * removal is initiated.
+ *
+ * Note that this object can be empty if none of the data on the vdev
+ * has been copied yet.
+ */
+ uint64_t vic_mapping_object;
+
+ /*
+ * Object (in MOS) which contains the birth times for the mapping
+ * entries. This object contains an array of
+ * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus
+ * buffer for this object is a vdev_indirect_birth_phys_t. This object
+ * is allocated when a vdev removal is initiated.
+ *
+ * Note that this object can be empty if none of the vdev has yet been
+ * copied.
+ */
+ uint64_t vic_births_object;
+
+ /*
+ * This is the vdev ID which was removed previous to this vdev, or
+ * UINT64_MAX if there are no previously removed vdevs.
+ */
+ uint64_t vic_prev_indirect_vdev;
+} vdev_indirect_config_t;
+
+/*
* Virtual device descriptor
*/
struct vdev {
@@ -188,6 +237,40 @@ struct vdev {
uint64_t vdev_top_zap;
/*
+ * Values stored in the config for an indirect or removing vdev.
+ */
+ vdev_indirect_config_t vdev_indirect_config;
+
+ /*
+ * The vdev_indirect_rwlock protects the vdev_indirect_mapping
+ * pointer from changing on indirect vdevs (when it is condensed).
+ * Note that removing (not yet indirect) vdevs have different
+ * access patterns (the mapping is not accessed from open context,
+ * e.g. from zio_read) and locking strategy (e.g. svr_lock).
+ */
+ krwlock_t vdev_indirect_rwlock;
+ vdev_indirect_mapping_t *vdev_indirect_mapping;
+ vdev_indirect_births_t *vdev_indirect_births;
+
+ /*
+ * In memory data structures used to manage the obsolete sm, for
+ * indirect or removing vdevs.
+ *
+ * The vdev_obsolete_segments is the in-core record of the segments
+ * that are no longer referenced anywhere in the pool (due to
+ * being freed or remapped and not referenced by any snapshots).
+ * During a sync, segments are added to vdev_obsolete_segments
+ * via vdev_indirect_mark_obsolete(); at the end of each sync
+ * pass, this is appended to vdev_obsolete_sm via
+ * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock
+ * protects against concurrent modifications of vdev_obsolete_segments
+ * from multiple zio threads.
+ */
+ kmutex_t vdev_obsolete_lock;
+ range_tree_t *vdev_obsolete_segments;
+ space_map_t *vdev_obsolete_sm;
+
+ /*
* The queue depth parameters determine how many async writes are
* still pending (i.e. allocated by net yet issued to disk) per
* top-level (vdev_async_write_queue_depth) and the maximum allowed
@@ -356,7 +439,7 @@ extern void vdev_remove_parent(vdev_t *cvd);
*/
extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
extern boolean_t vdev_log_state_valid(vdev_t *vd);
-extern void vdev_load(vdev_t *vd);
+extern int vdev_load(vdev_t *vd);
extern int vdev_dtl_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
@@ -375,6 +458,7 @@ extern vdev_ops_t vdev_file_ops;
extern vdev_ops_t vdev_missing_ops;
extern vdev_ops_t vdev_hole_ops;
extern vdev_ops_t vdev_spare_ops;
+extern vdev_ops_t vdev_indirect_ops;
/*
* Common size functions
@@ -389,6 +473,15 @@ extern void vdev_set_min_asize(vdev_t *vd);
/* zdb uses this tunable, so it must be declared here to make lint happy. */
extern int zfs_vdev_cache_size;
+/*
+ * Functions from vdev_indirect.c
+ */
+extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx);
+extern boolean_t vdev_indirect_should_condense(vdev_t *vd);
+extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx);
+extern int vdev_obsolete_sm_object(vdev_t *vd);
+extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/vdev_indirect_births.h b/include/sys/vdev_indirect_births.h
new file mode 100644
index 000000000..987b14485
--- /dev/null
+++ b/include/sys/vdev_indirect_births.h
@@ -0,0 +1,80 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_INDIRECT_BIRTHS_H
+#define _SYS_VDEV_INDIRECT_BIRTHS_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_indirect_birth_entry_phys {
+ uint64_t vibe_offset;
+ uint64_t vibe_phys_birth_txg;
+} vdev_indirect_birth_entry_phys_t;
+
+typedef struct vdev_indirect_birth_phys {
+ uint64_t vib_count; /* count of v_i_b_entry_phys_t's */
+} vdev_indirect_birth_phys_t;
+
+typedef struct vdev_indirect_births {
+ uint64_t vib_object;
+
+ /*
+ * Each entry indicates that everything up to but not including
+ * vibe_offset was copied in vibe_phys_birth_txg. Entries are sorted
+ * by increasing phys_birth, and also by increasing offset. See
+ * vdev_indirect_births_physbirth for usage.
+ */
+ vdev_indirect_birth_entry_phys_t *vib_entries;
+
+ objset_t *vib_objset;
+
+ dmu_buf_t *vib_dbuf;
+ vdev_indirect_birth_phys_t *vib_phys;
+} vdev_indirect_births_t;
+
+extern vdev_indirect_births_t *vdev_indirect_births_open(objset_t *os,
+ uint64_t object);
+extern void vdev_indirect_births_close(vdev_indirect_births_t *vib);
+extern boolean_t vdev_indirect_births_is_open(vdev_indirect_births_t *vib);
+extern uint64_t vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx);
+extern void vdev_indirect_births_free(objset_t *os, uint64_t object,
+ dmu_tx_t *tx);
+
+extern uint64_t vdev_indirect_births_count(vdev_indirect_births_t *vib);
+extern uint64_t vdev_indirect_births_object(vdev_indirect_births_t *vib);
+
+extern void vdev_indirect_births_add_entry(vdev_indirect_births_t *vib,
+ uint64_t offset, uint64_t txg, dmu_tx_t *tx);
+
+extern uint64_t vdev_indirect_births_physbirth(vdev_indirect_births_t *vib,
+ uint64_t offset, uint64_t asize);
+
+extern uint64_t vdev_indirect_births_last_entry_txg(
+ vdev_indirect_births_t *vib);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_INDIRECT_BIRTHS_H */
diff --git a/include/sys/vdev_indirect_mapping.h b/include/sys/vdev_indirect_mapping.h
new file mode 100644
index 000000000..7e42c1019
--- /dev/null
+++ b/include/sys/vdev_indirect_mapping.h
@@ -0,0 +1,141 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_INDIRECT_MAPPING_H
+#define _SYS_VDEV_INDIRECT_MAPPING_H
+
+#include <sys/dmu.h>
+#include <sys/list.h>
+#include <sys/spa.h>
+#include <sys/space_map.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_indirect_mapping_entry_phys {
+ /*
+ * Decode with DVA_MAPPING_* macros.
+ * Contains:
+ * the source offset (low 63 bits)
+ * the one-bit "mark", used for garbage collection (by zdb)
+ */
+ uint64_t vimep_src;
+
+ /*
+ * Note: the DVA's asize is 24 bits, and can thus store ranges
+ * up to 8GB.
+ */
+ dva_t vimep_dst;
+} vdev_indirect_mapping_entry_phys_t;
+
+#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \
+ BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \
+ BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+typedef struct vdev_indirect_mapping_entry {
+ vdev_indirect_mapping_entry_phys_t vime_mapping;
+ uint32_t vime_obsolete_count;
+ list_node_t vime_node;
+} vdev_indirect_mapping_entry_t;
+
+/*
+ * This is stored in the bonus buffer of the mapping object, see comment of
+ * vdev_indirect_config for more details.
+ */
+typedef struct vdev_indirect_mapping_phys {
+ uint64_t vimp_max_offset;
+ uint64_t vimp_bytes_mapped;
+ uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */
+
+ /*
+ * For each entry in the mapping object, this object contains an
+ * entry representing the number of bytes of that mapping entry
+ * that were no longer in use by the pool at the time this indirect
+ * vdev was last condensed.
+ */
+ uint64_t vimp_counts_object;
+} vdev_indirect_mapping_phys_t;
+
+#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t))
+
+typedef struct vdev_indirect_mapping {
+ uint64_t vim_object;
+ boolean_t vim_havecounts;
+
+ /*
+ * An ordered array of all mapping entries, sorted by source offset.
+ * Note that vim_entries is needed during a removal (and contains
+ * mappings that have been synced to disk so far) to handle frees
+ * from the removing device.
+ */
+ vdev_indirect_mapping_entry_phys_t *vim_entries;
+
+ objset_t *vim_objset;
+
+ dmu_buf_t *vim_dbuf;
+ vdev_indirect_mapping_phys_t *vim_phys;
+} vdev_indirect_mapping_t;
+
+extern vdev_indirect_mapping_t *vdev_indirect_mapping_open(objset_t *os,
+ uint64_t object);
+extern void vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx);
+extern void vdev_indirect_mapping_free(objset_t *os, uint64_t obj,
+ dmu_tx_t *tx);
+
+extern uint64_t vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_bytes_mapped(
+ vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim);
+
+/*
+ * Writes the given list of vdev_indirect_mapping_entry_t to the mapping
+ * then updates internal state.
+ */
+extern void vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
+ list_t *vime_list, dmu_tx_t *tx);
+
+extern vdev_indirect_mapping_entry_phys_t *
+ vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
+ uint64_t offset);
+
+extern vdev_indirect_mapping_entry_phys_t *
+ vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
+ uint64_t offset);
+
+extern uint32_t *vdev_indirect_mapping_load_obsolete_counts(
+ vdev_indirect_mapping_t *vim);
+extern void vdev_indirect_mapping_load_obsolete_spacemap(
+ vdev_indirect_mapping_t *vim,
+ uint32_t *counts, space_map_t *obsolete_space_sm);
+extern void vdev_indirect_mapping_increment_obsolete_count(
+ vdev_indirect_mapping_t *vim,
+ uint64_t offset, uint64_t asize, uint32_t *counts);
+extern void vdev_indirect_mapping_free_obsolete_counts(
+ vdev_indirect_mapping_t *vim, uint32_t *counts);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_INDIRECT_MAPPING_H */
diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h
new file mode 100644
index 000000000..5b1e3056b
--- /dev/null
+++ b/include/sys/vdev_removal.h
@@ -0,0 +1,93 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_REMOVAL_H
+#define _SYS_VDEV_REMOVAL_H
+
+#include <sys/spa.h>
+#include <sys/bpobj.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct spa_vdev_removal {
+ vdev_t *svr_vdev;
+ uint64_t svr_max_offset_to_sync[TXG_SIZE];
+ /* Thread performing a vdev removal. */
+ kthread_t *svr_thread;
+ /* Segments left to copy from the current metaslab. */
+ range_tree_t *svr_allocd_segs;
+ kmutex_t svr_lock;
+ kcondvar_t svr_cv;
+ boolean_t svr_thread_exit;
+
+ /*
+ * New mappings to write out each txg.
+ */
+ list_t svr_new_segments[TXG_SIZE];
+
+ /*
+ * Ranges that were freed while a mapping was in flight. This is
+ * a subset of the ranges covered by vdev_im_new_segments.
+ */
+ range_tree_t *svr_frees[TXG_SIZE];
+
+ /*
+ * Number of bytes which we have finished our work for
+ * in each txg. This could be data copied (which will be part of
+ * the mappings in vdev_im_new_segments), or data freed before
+ * we got around to copying it.
+ */
+ uint64_t svr_bytes_done[TXG_SIZE];
+
+ /* List of leaf zap objects to be unlinked */
+ nvlist_t *svr_zaplist;
+} spa_vdev_removal_t;
+
+typedef struct spa_condensing_indirect {
+ /*
+ * New mappings to write out each txg.
+ */
+ list_t sci_new_mapping_entries[TXG_SIZE];
+
+ vdev_indirect_mapping_t *sci_new_mapping;
+} spa_condensing_indirect_t;
+
+extern int spa_remove_init(spa_t *);
+extern void spa_restart_removal(spa_t *);
+extern int spa_condense_init(spa_t *);
+extern void spa_condense_fini(spa_t *);
+extern void spa_condense_indirect_restart(spa_t *);
+extern void spa_vdev_condense_suspend(spa_t *);
+extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
+extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t);
+extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
+extern void svr_sync(spa_t *spa, dmu_tx_t *tx);
+extern void spa_vdev_remove_suspend(spa_t *);
+extern int spa_vdev_remove_cancel(spa_t *);
+extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_REMOVAL_H */
diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h
index 7e05c2a35..91b161ff8 100644
--- a/include/sys/zfs_debug.h
+++ b/include/sys/zfs_debug.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFS_DEBUG_H
@@ -53,6 +53,7 @@ extern int zfs_dbgmsg_enable;
#define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7)
#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8)
#define ZFS_DEBUG_SET_ERROR (1 << 9)
+#define ZFS_DEBUG_INDIRECT_REMAP (1 << 10)
extern void __dprintf(const char *file, const char *func,
int line, const char *fmt, ...);
diff --git a/include/sys/zil.h b/include/sys/zil.h
index fc0cc5065..fb7b38a06 100644
--- a/include/sys/zil.h
+++ b/include/sys/zil.h
@@ -496,7 +496,7 @@ extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
extern void zil_commit(zilog_t *zilog, uint64_t oid);
extern void zil_commit_impl(zilog_t *zilog, uint64_t oid);
-extern int zil_vdev_offline(const char *osname, void *txarg);
+extern int zil_reset(const char *osname, void *txarg);
extern int zil_claim(struct dsl_pool *dp,
struct dsl_dataset *ds, void *txarg);
extern int zil_check_log_chain(struct dsl_pool *dp,
diff --git a/include/sys/zio.h b/include/sys/zio.h
index be8e18b4b..a275b16de 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -244,7 +244,7 @@ enum zio_flag {
#define ZIO_VDEV_CHILD_FLAGS(zio) \
(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
- ZIO_FLAG_CANFAIL)
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL)
#define ZIO_CHILD_BIT(x) (1 << (x))
#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x)))
@@ -278,6 +278,9 @@ enum zio_wait_type {
#define ECKSUM EBADE
#define EFRAGS EBADR
+/* Similar for ENOACTIVE */
+#define ENOTACTIVE ENOANO
+
typedef void zio_done_func_t(zio_t *zio);
extern int zio_dva_throttle_enabled;
diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h
index 3fc3589be..c2cc8b2d5 100644
--- a/include/sys/zio_priority.h
+++ b/include/sys/zio_priority.h
@@ -28,6 +28,7 @@ typedef enum zio_priority {
ZIO_PRIORITY_ASYNC_READ, /* prefetch */
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
+ ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */
} zio_priority_t;