17 files changed, 1254 insertions, 26 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index a243f51d8..193bdc510 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -89,6 +89,7 @@ $(MODULE)-objs += vdev_file.o
 $(MODULE)-objs += vdev_indirect.o
 $(MODULE)-objs += vdev_indirect_births.o
 $(MODULE)-objs += vdev_indirect_mapping.o
+$(MODULE)-objs += vdev_initialize.o
 $(MODULE)-objs += vdev_label.o
 $(MODULE)-objs += vdev_mirror.o
 $(MODULE)-objs += vdev_missing.o
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 4b5baf6a6..71688b420 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -635,6 +635,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
 
 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
 	mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
 	    KM_SLEEP);
 	mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
@@ -681,6 +683,8 @@ metaslab_group_destroy(metaslab_group_t *mg)
 	kmem_free(mg->mg_secondaries, mg->mg_allocators *
 	    sizeof (metaslab_t *));
 	mutex_destroy(&mg->mg_lock);
+	mutex_destroy(&mg->mg_ms_initialize_lock);
+	cv_destroy(&mg->mg_ms_initialize_cv);
 
 	for (int i = 0; i < mg->mg_allocators; i++) {
 		zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
@@ -1502,6 +1506,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
 	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
+
 	ms->ms_id = id;
 	ms->ms_start = id << vd->vdev_ms_shift;
 	ms->ms_size = 1ULL << vd->vdev_ms_shift;
@@ -2686,6 +2691,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	 * from it in 'metaslab_unload_delay' txgs, then unload it.
 	 */
 	if (msp->ms_loaded &&
+	    msp->ms_initializing == 0 &&
 	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
 
 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
@@ -2967,6 +2973,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
 	metaslab_class_t *mc = msp->ms_group->mg_class;
 
 	VERIFY(!msp->ms_condensing);
+	VERIFY0(msp->ms_initializing);
 
 	start = mc->mc_ops->msop_alloc(msp, size);
 	if (start != -1ULL) {
@@ -3027,9 +3034,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
 		}
 
 		/*
-		 * If the selected metaslab is condensing, skip it.
+		 * If the selected metaslab is condensing or being
+		 * initialized, skip it.
 		 */
-		if (msp->ms_condensing)
+		if (msp->ms_condensing || msp->ms_initializing > 0)
 			continue;
 
 		*was_active = msp->ms_allocator != -1;
@@ -3190,7 +3198,9 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
 		/*
 		 * If this metaslab is currently condensing then pick again as
 		 * we can't manipulate this metaslab until it's committed
-		 * to disk.
+		 * to disk. If this metaslab is being initialized, we shouldn't
+		 * allocate from it since the allocated region might be
+		 * overwritten after allocation.
 		 */
 		if (msp->ms_condensing) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
@@ -3199,6 +3209,13 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
 			    ~METASLAB_ACTIVE_MASK);
 			mutex_exit(&msp->ms_lock);
 			continue;
+		} else if (msp->ms_initializing > 0) {
+			metaslab_trace_add(zal, mg, msp, asize, d,
+			    TRACE_INITIALIZING, allocator);
+			metaslab_passivate(msp, msp->ms_weight &
+			    ~METASLAB_ACTIVE_MASK);
+			mutex_exit(&msp->ms_lock);
+			continue;
 		}
 
 		offset = metaslab_block_alloc(msp, asize, txg);
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index f0683b0b8..622be75f9 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -56,6 +56,7 @@
 #include <sys/vdev_removal.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/vdev_indirect_births.h>
+#include <sys/vdev_initialize.h>
 #include <sys/vdev_disk.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
@@ -434,8 +435,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
 
 				dp = spa_get_dsl(spa);
 				dsl_pool_config_enter(dp, FTAG);
-				if ((err = dsl_dataset_hold_obj(dp,
-				    za.za_first_integer, FTAG, &ds))) {
+				err = dsl_dataset_hold_obj(dp,
+				    za.za_first_integer, FTAG, &ds);
+				if (err != 0) {
 					dsl_pool_config_exit(dp, FTAG);
 					break;
 				}
@@ -601,7 +603,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
 				}
 
 				error = dmu_objset_hold(strval, FTAG, &os);
-				if (error)
+				if (error != 0)
 					break;
 
 				/*
@@ -1218,8 +1220,10 @@ spa_activate(spa_t *spa, int mode)
 		spa_create_zio_taskqs(spa);
 	}
 
-	for (size_t i = 0; i < TXG_SIZE; i++)
-		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0);
+	for (size_t i = 0; i < TXG_SIZE; i++) {
+		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL);
+	}
 
 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_config_dirty_node));
@@ -1437,6 +1441,11 @@ spa_unload(spa_t *spa)
 	 */
 	spa_async_suspend(spa);
 
+	if (spa->spa_root_vdev) {
+		vdev_initialize_stop_all(spa->spa_root_vdev,
+		    VDEV_INITIALIZE_ACTIVE);
+	}
+
 	/*
 	 * Stop syncing.
 	 */
@@ -1452,10 +1461,10 @@ spa_unload(spa_t *spa)
 	 * calling taskq_wait(mg_taskq).
 	 */
 	if (spa->spa_root_vdev != NULL) {
-		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
 			vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
-		spa_config_exit(spa, SCL_ALL, FTAG);
+		spa_config_exit(spa, SCL_ALL, spa);
 	}
 
 	if (spa->spa_mmp.mmp_thread)
@@ -1492,7 +1501,7 @@ spa_unload(spa_t *spa)
 
 	bpobj_close(&spa->spa_deferred_bpobj);
 
-	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 
 	/*
 	 * Close all vdevs.
@@ -1554,7 +1563,7 @@ spa_unload(spa_t *spa)
 		spa->spa_comment = NULL;
 	}
 
-	spa_config_exit(spa, SCL_ALL, FTAG);
+	spa_config_exit(spa, SCL_ALL, spa);
 }
 
 /*
@@ -4246,6 +4255,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
 		 */
 		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 
+		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+		vdev_initialize_restart(spa->spa_root_vdev);
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 
 	spa_load_note(spa, "LOADED");
@@ -5654,6 +5666,18 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
 		}
 
 		/*
+		 * We're about to export or destroy this pool. Make sure
+		 * we stop all initializtion activity here before we
+		 * set the spa_final_txg. This will ensure that all
+		 * dirty data resulting from the initialization is
+		 * committed to disk before we unload the pool.
+		 */
+		if (spa->spa_root_vdev != NULL) {
+			vdev_initialize_stop_all(spa->spa_root_vdev,
+			    VDEV_INITIALIZE_ACTIVE);
+		}
+
+		/*
 		 * We want this to be reflected on every label,
 		 * so mark them all dirty.  spa_unload() will do the
 		 * final sync that pushes these changes out.
@@ -6357,6 +6381,86 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 	return (error);
 }
 
+int
+spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
+{
+	/*
+	 * We hold the namespace lock through the whole function
+	 * to prevent any changes to the pool while we're starting or
+	 * stopping initialization. The config and state locks are held so that
+	 * we can properly assess the vdev state before we commit to
+	 * the initializing operation.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+	/* Look up vdev and ensure it's a leaf. */
+	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+	if (vd == NULL || vd->vdev_detached) {
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		mutex_exit(&spa_namespace_lock);
+		return (SET_ERROR(ENODEV));
+	} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		mutex_exit(&spa_namespace_lock);
+		return (SET_ERROR(EINVAL));
+	} else if (!vdev_writeable(vd)) {
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		mutex_exit(&spa_namespace_lock);
+		return (SET_ERROR(EROFS));
+	}
+	mutex_enter(&vd->vdev_initialize_lock);
+	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+	/*
+	 * When we activate an initialize action we check to see
+	 * if the vdev_initialize_thread is NULL. We do this instead
+	 * of using the vdev_initialize_state since there might be
+	 * a previous initialization process which has completed but
+	 * the thread is not exited.
+	 */
+	if (cmd_type == POOL_INITIALIZE_DO &&
+	    (vd->vdev_initialize_thread != NULL ||
+	    vd->vdev_top->vdev_removing)) {
+		mutex_exit(&vd->vdev_initialize_lock);
+		mutex_exit(&spa_namespace_lock);
+		return (SET_ERROR(EBUSY));
+	} else if (cmd_type == POOL_INITIALIZE_CANCEL &&
+	    (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
+	    vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
+		mutex_exit(&vd->vdev_initialize_lock);
+		mutex_exit(&spa_namespace_lock);
+		return (SET_ERROR(ESRCH));
+	} else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
+	    vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
+		mutex_exit(&vd->vdev_initialize_lock);
+		mutex_exit(&spa_namespace_lock);
+		return (SET_ERROR(ESRCH));
+	}
+
+	switch (cmd_type) {
+	case POOL_INITIALIZE_DO:
+		vdev_initialize(vd);
+		break;
+	case POOL_INITIALIZE_CANCEL:
+		vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+		break;
+	case POOL_INITIALIZE_SUSPEND:
+		vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED);
+		break;
+	default:
+		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+	}
+	mutex_exit(&vd->vdev_initialize_lock);
+
+	/* Sync out the initializing state */
+	txg_wait_synced(spa->spa_dsl_pool, 0);
+	mutex_exit(&spa_namespace_lock);
+
+	return (0);
+}
+
+
 /*
  * Split a set of devices from their mirrors, and create a new pool from them.
  */
@@ -6565,6 +6669,19 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
 	spa_activate(newspa, spa_mode_global);
 	spa_async_suspend(newspa);
 
+	for (c = 0; c < children; c++) {
+		if (vml[c] != NULL) {
+			/*
+			 * Temporarily stop the initializing activity. We set
+			 * the state to ACTIVE so that we know to resume
+			 * the initializing once the split has completed.
+			 */
+			mutex_enter(&vml[c]->vdev_initialize_lock);
+			vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE);
+			mutex_exit(&vml[c]->vdev_initialize_lock);
+		}
+	}
+
 	newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
 
 	/* create the new pool from the disks of the original pool */
@@ -6652,6 +6769,10 @@ out:
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_FALSE;
 	}
+
+	/* restart initializing disks as necessary */
+	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+
 	vdev_reopen(spa->spa_root_vdev);
 
 	nvlist_free(spa->spa_config_splitting);
@@ -7025,6 +7146,14 @@ spa_async_thread(void *arg)
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
 		dsl_resilver_restart(dp, 0);
 
+	if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
+		mutex_enter(&spa_namespace_lock);
+		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+		vdev_initialize_restart(spa->spa_root_vdev);
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
+		mutex_exit(&spa_namespace_lock);
+	}
+
 	/*
 	 * Let the world know that we're done.
 	 */
@@ -7677,8 +7806,9 @@ spa_sync(spa_t *spa, uint64_t txg)
 	 * Wait for i/os issued in open context that need to complete
 	 * before this txg syncs.
 	 */
-	VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK]));
-	spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0);
+	(void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
+	spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CANFAIL);
 
 	/*
 	 * Lock out configuration changes.
@@ -7983,7 +8113,8 @@ spa_sync(spa_t *spa, uint64_t txg)
 	/*
 	 * Update usable space statistics.
 	 */
-	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))))
+	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+	    != NULL)
 		vdev_sync_done(vd, txg);
 
 	spa_update_dspace(spa);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index a3ac70f07..dfac92d45 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -38,6 +38,7 @@
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_raidz.h>
 #include <sys/metaslab.h>
@@ -1194,6 +1195,12 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
 
 	if (vd != NULL) {
 		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
+		if (vd->vdev_ops->vdev_op_leaf) {
+			mutex_enter(&vd->vdev_initialize_lock);
+			vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+			mutex_exit(&vd->vdev_initialize_lock);
+		}
+
 		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 		vdev_free(vd);
 		spa_config_exit(spa, SCL_ALL, spa);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 8273e7907..f808f8ee7 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -50,6 +50,7 @@
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
 #include <sys/abd.h>
+#include <sys/vdev_initialize.h>
 #include <sys/zvol.h>
 #include <sys/zfs_ratelimit.h>
 
@@ -212,6 +213,14 @@ vdev_getops(const char *type)
 	return (ops);
 }
 
+/* ARGSUSED */
+void
+vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
+{
+	res->rs_start = in->rs_start;
+	res->rs_end = in->rs_end;
+}
+
 /*
  * Derive the enumerated alloction bias from string input.
  * String origin is either the per-vdev zap or zpool(1M).
@@ -526,6 +535,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
 
 	for (int t = 0; t < DTL_TYPES; t++) {
 		vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
@@ -850,6 +863,7 @@ void
 vdev_free(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
+	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 
 	/*
 	 * Scan queues are normally destroyed at the end of a scan. If the
@@ -880,6 +894,7 @@ vdev_free(vdev_t *vd)
 
 	ASSERT(vd->vdev_child == NULL);
 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+	ASSERT(vd->vdev_initialize_thread == NULL);
 
 	/*
 	 * Discard allocation state.
@@ -957,6 +972,10 @@ vdev_free(vdev_t *vd)
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
 	mutex_destroy(&vd->vdev_scan_io_queue_lock);
+	mutex_destroy(&vd->vdev_initialize_lock);
+	mutex_destroy(&vd->vdev_initialize_io_lock);
+	cv_destroy(&vd->vdev_initialize_io_cv);
+	cv_destroy(&vd->vdev_initialize_cv);
 
 	zfs_ratelimit_fini(&vd->vdev_delay_rl);
 	zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -3207,7 +3226,8 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
 
 	ASSERT(vdev_is_concrete(vd));
 
-	while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))))
+	while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+	    != NULL)
 		metaslab_sync_done(msp, txg);
 
 	if (reassess)
@@ -3458,6 +3478,15 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
 
+	/* Restart initializing if necessary */
+	mutex_enter(&vd->vdev_initialize_lock);
+	if (vdev_writeable(vd) &&
+	    vd->vdev_initialize_thread == NULL &&
+	    vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
+		(void) vdev_initialize(vd);
+	}
+	mutex_exit(&vd->vdev_initialize_lock);
+
 	if (wasoffline ||
 	    (oldstate < VDEV_STATE_DEGRADED &&
 	    vd->vdev_state >= VDEV_STATE_DEGRADED))
@@ -3848,9 +3877,22 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 		vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
 		vs->vs_state = vd->vdev_state;
 		vs->vs_rsize = vdev_get_min_asize(vd);
-		if (vd->vdev_ops->vdev_op_leaf)
+		if (vd->vdev_ops->vdev_op_leaf) {
 			vs->vs_rsize += VDEV_LABEL_START_SIZE +
 			    VDEV_LABEL_END_SIZE;
+			/*
+			 * Report intializing progress. Since we don't
+			 * have the initializing locks held, this is only
+			 * an estimate (although a fairly accurate one).
+			 */
+			vs->vs_initialize_bytes_done =
+			    vd->vdev_initialize_bytes_done;
+			vs->vs_initialize_bytes_est =
+			    vd->vdev_initialize_bytes_est;
+			vs->vs_initialize_state = vd->vdev_initialize_state;
+			vs->vs_initialize_action_time =
+			    vd->vdev_initialize_action_time;
+		}
 		/*
 		 * Report expandable space on top-level, non-auxillary devices
 		 * only. The expandable space is reported in terms of metaslab
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 9c44ba12a..d13f365dd 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -890,6 +890,7 @@ vdev_ops_t vdev_disk_ops = {
 	vdev_disk_hold,
 	vdev_disk_rele,
 	NULL,
+	vdev_default_xlate,
 	VDEV_TYPE_DISK,		/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c
index bd7e0bc2e..3551898e0 100644
--- a/module/zfs/vdev_file.c
+++ b/module/zfs/vdev_file.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -254,6 +254,7 @@ vdev_ops_t vdev_file_ops = {
 	vdev_file_hold,
 	vdev_file_rele,
 	NULL,
+	vdev_default_xlate,
 	VDEV_TYPE_FILE,		/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
@@ -289,6 +290,7 @@ vdev_ops_t vdev_disk_ops = {
 	vdev_file_hold,
 	vdev_file_rele,
 	NULL,
+	vdev_default_xlate,
 	VDEV_TYPE_DISK,		/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index 724457df4..070d1b8d9 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -1857,6 +1857,7 @@ vdev_ops_t vdev_indirect_ops = {
 	NULL,
 	NULL,
 	vdev_indirect_remap,
+	NULL,
 	VDEV_TYPE_INDIRECT,	/* name of this vdev type */
 	B_FALSE			/* leaf vdev */
 };
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
new file mode 100644
index 000000000..fcd2c76f9
--- /dev/null
+++ b/module/zfs/vdev_initialize.c
@@ -0,0 +1,819 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/txg.h>
+#include <sys/vdev_impl.h>
+#include <sys/refcount.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+
+/*
+ * Maximum number of metaslabs per group that can be initialized
+ * simultaneously.
+ */
+int max_initialize_ms = 3;
+
+/*
+ * Value that is written to disk during initialization.
+ */
+#ifdef _ILP32
+unsigned long zfs_initialize_value = 0xdeadbeefUL;
+#else
+unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL;
+#endif
+
+/* maximum number of I/Os outstanding per leaf vdev */
+int zfs_initialize_limit = 1;
+
+/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
+uint64_t zfs_initialize_chunk_size = 1024 * 1024;
+
+static boolean_t
+vdev_initialize_should_stop(vdev_t *vd)
+{
+	return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
+	    vd->vdev_detached || vd->vdev_top->vdev_removing);
+}
+
+static void
+vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
+{
+	/*
+	 * We pass in the guid instead of the vdev_t since the vdev may
+	 * have been freed prior to the sync task being processed. This
+	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
+	 * stop the intializing thread, schedule the sync task, and free
+	 * the vdev. Later when the scheduled sync task is invoked, it would
+	 * find that the vdev has been freed.
+	 */
+	uint64_t guid = *(uint64_t *)arg;
+	uint64_t txg = dmu_tx_get_txg(tx);
+	kmem_free(arg, sizeof (uint64_t));
+
+	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
+	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+		return;
+
+	uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
+	vd->vdev_initialize_offset[txg & TXG_MASK] = 0;
+
+	VERIFY(vd->vdev_leaf_zap != 0);
+
+	objset_t *mos = vd->vdev_spa->spa_meta_objset;
+
+	if (last_offset > 0) {
+		vd->vdev_initialize_last_offset = last_offset;
+		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+		    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+		    sizeof (last_offset), 1, &last_offset, tx));
+	}
+	if (vd->vdev_initialize_action_time > 0) {
+		uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
+		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+		    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
+		    1, &val, tx));
+	}
+
+	uint64_t initialize_state = vd->vdev_initialize_state;
+	VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+	    VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
+	    &initialize_state, tx));
+}
+
+static void
+vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
+{
+	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+	spa_t *spa = vd->vdev_spa;
+
+	if (new_state == vd->vdev_initialize_state)
+		return;
+
+	/*
+	 * Copy the vd's guid, this will be freed by the sync task.
+	 */
+	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+	*guid = vd->vdev_guid;
+
+	/*
+	 * If we're suspending, then preserving the original start time.
+	 */
+	if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
+		vd->vdev_initialize_action_time = gethrestime_sec();
+	}
+	vd->vdev_initialize_state = new_state;
+
+	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
+	    guid, 2, ZFS_SPACE_CHECK_RESERVED, tx);
+
+	switch (new_state) {
+	case VDEV_INITIALIZE_ACTIVE:
+		spa_history_log_internal(spa, "initialize", tx,
+		    "vdev=%s activated", vd->vdev_path);
+		break;
+	case VDEV_INITIALIZE_SUSPENDED:
+		spa_history_log_internal(spa, "initialize", tx,
+		    "vdev=%s suspended", vd->vdev_path);
+		break;
+	case VDEV_INITIALIZE_CANCELED:
+		spa_history_log_internal(spa, "initialize", tx,
+		    "vdev=%s canceled", vd->vdev_path);
+		break;
+	case VDEV_INITIALIZE_COMPLETE:
+		spa_history_log_internal(spa, "initialize", tx,
+		    "vdev=%s complete", vd->vdev_path);
+		break;
+	default:
+		panic("invalid state %llu", (unsigned long long)new_state);
+	}
+
+	dmu_tx_commit(tx);
+}
+
+static void
+vdev_initialize_cb(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	mutex_enter(&vd->vdev_initialize_io_lock);
+	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
+		/*
+		 * The I/O failed because the vdev was unavailable; roll the
+		 * last offset back. (This works because spa_sync waits on
+		 * spa_txg_zio before it runs sync tasks.)
+		 */
+		uint64_t *off =
+		    &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
+		*off = MIN(*off, zio->io_offset);
+	} else {
+		/*
+		 * Since initializing is best-effort, we ignore I/O errors and
+		 * rely on vdev_probe to determine if the errors are more
+		 * critical.
+		 */
+		if (zio->io_error != 0)
+			vd->vdev_stat.vs_initialize_errors++;
+
+		vd->vdev_initialize_bytes_done += zio->io_orig_size;
+	}
+	ASSERT3U(vd->vdev_initialize_inflight, >, 0);
+	vd->vdev_initialize_inflight--;
+	cv_broadcast(&vd->vdev_initialize_io_cv);
+	mutex_exit(&vd->vdev_initialize_io_lock);
+
+	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/* Takes care of physical writing and limiting # of concurrent ZIOs. */
+static int
+vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
+{
+	spa_t *spa = vd->vdev_spa;
+
+	/* Limit inflight initializing I/Os */
+	mutex_enter(&vd->vdev_initialize_io_lock);
+	while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
+		cv_wait(&vd->vdev_initialize_io_cv,
+		    &vd->vdev_initialize_io_lock);
+	}
+	vd->vdev_initialize_inflight++;
+	mutex_exit(&vd->vdev_initialize_io_lock);
+
+	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+	uint64_t txg = dmu_tx_get_txg(tx);
+
+	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
+	mutex_enter(&vd->vdev_initialize_lock);
+
+	if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
+		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+		*guid = vd->vdev_guid;
+
+		/* This is the first write of this txg. */
+		dsl_sync_task_nowait(spa_get_dsl(spa),
+		    vdev_initialize_zap_update_sync, guid, 2,
+		    ZFS_SPACE_CHECK_RESERVED, tx);
+	}
+
+	/*
+	 * We know the vdev struct will still be around since all
+	 * consumers of vdev_free must stop the initialization first.
+	 */
+	if (vdev_initialize_should_stop(vd)) {
+		mutex_enter(&vd->vdev_initialize_io_lock);
+		ASSERT3U(vd->vdev_initialize_inflight, >, 0);
+		vd->vdev_initialize_inflight--;
+		mutex_exit(&vd->vdev_initialize_io_lock);
+		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+		mutex_exit(&vd->vdev_initialize_lock);
+		dmu_tx_commit(tx);
+		return (SET_ERROR(EINTR));
+	}
+	mutex_exit(&vd->vdev_initialize_lock);
+
+	vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
+	zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
+	    size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
+	    ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
+	/* vdev_initialize_cb releases SCL_STATE_ALL */
+
+	dmu_tx_commit(tx);
+
+	return (0);
+}
+
+/*
+ * Translate a logical range to the physical range for the specified vdev_t.
+ * This function is initially called with a leaf vdev and will walk each
+ * parent vdev until it reaches a top-level vdev. Once the top-level is
+ * reached the physical range is initialized and the recursive function
+ * begins to unwind. As it unwinds it calls the parent's vdev specific
+ * translation function to do the real conversion.
+ */
+void
+vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
+{
+	/*
+	 * Walk up the vdev tree
+	 */
+	if (vd != vd->vdev_top) {
+		vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+	} else {
+		/*
+		 * We've reached the top-level vdev, initialize the
+		 * physical range to the logical range and start to
+		 * unwind.
+		 */
+		physical_rs->rs_start = logical_rs->rs_start;
+		physical_rs->rs_end = logical_rs->rs_end;
+		return;
+	}
+
+	vdev_t *pvd = vd->vdev_parent;
+	ASSERT3P(pvd, !=, NULL);
+	ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
+
+	/*
+	 * As this recursive function unwinds, translate the logical
+	 * range into its physical components by calling the
+	 * vdev specific translate function.
+	 */
+	range_seg_t intermediate = { { { 0, 0 } } };
+	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+
+	physical_rs->rs_start = intermediate.rs_start;
+	physical_rs->rs_end = intermediate.rs_end;
+}
+
+/*
+ * Callback to fill each ABD chunk with zfs_initialize_value. len must be
+ * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
+ * allocation will guarantee these for us.
+ */
+/* ARGSUSED */
+static int
+vdev_initialize_block_fill(void *buf, size_t len, void *unused)
+{
+	ASSERT0(len % sizeof (uint64_t));
+#ifdef _ILP32
+	for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) {
+		*(uint32_t *)((char *)(buf) + i) = zfs_initialize_value;
+	}
+#else
+	for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
+		*(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
+	}
+#endif
+	return (0);
+}
+
+static abd_t *
+vdev_initialize_block_alloc(void)
+{
+	/* Allocate ABD for filler data */
+	abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
+
+	ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
+	(void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
+	    vdev_initialize_block_fill, NULL);
+
+	return (data);
+}
+
+static void
+vdev_initialize_block_free(abd_t *data)
+{
+	abd_free(data);
+}
+
+static int
+vdev_initialize_ranges(vdev_t *vd, abd_t *data)
+{
+	avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root;
+
+	for (range_seg_t *rs = avl_first(rt); rs != NULL;
+	    rs = AVL_NEXT(rt, rs)) {
+		uint64_t size = rs->rs_end - rs->rs_start;
+
+		/* Split range into legally-sized physical chunks */
+		uint64_t writes_required =
+		    ((size - 1) / zfs_initialize_chunk_size) + 1;
+
+		for (uint64_t w = 0; w < writes_required; w++) {
+			int error;
+
+			error = vdev_initialize_write(vd,
+			    VDEV_LABEL_START_SIZE + rs->rs_start +
+			    (w * zfs_initialize_chunk_size),
+			    MIN(size - (w * zfs_initialize_chunk_size),
+			    zfs_initialize_chunk_size), data);
+			if (error != 0)
+				return (error);
+		}
+	}
+	return (0);
+}
+
+static void
+vdev_initialize_ms_load(metaslab_t *msp)
+{
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	metaslab_load_wait(msp);
+	if (!msp->ms_loaded)
+		VERIFY0(metaslab_load(msp));
+}
+
+static void
+vdev_initialize_mg_wait(metaslab_group_t *mg)
+{
+	ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
+	while (mg->mg_initialize_updating) {
+		cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
+	}
+}
+
+static void
+vdev_initialize_mg_mark(metaslab_group_t *mg)
+{
+	ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
+	ASSERT(mg->mg_initialize_updating);
+
+	while (mg->mg_ms_initializing >= max_initialize_ms) {
+		cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
+	}
+	mg->mg_ms_initializing++;
+	ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms);
+}
+
+/*
+ * Mark the metaslab as being initialized to prevent any allocations
+ * on this metaslab. We must also track how many metaslabs are currently
+ * being initialized within a metaslab group and limit them to prevent
+ * allocation failures from occurring because all metaslabs are being
+ * initialized.
+ */
+static void
+vdev_initialize_ms_mark(metaslab_t *msp)
+{
+	ASSERT(!MUTEX_HELD(&msp->ms_lock));
+	metaslab_group_t *mg = msp->ms_group;
+
+	mutex_enter(&mg->mg_ms_initialize_lock);
+
+	/*
+	 * To keep an accurate count of how many threads are initializing
+	 * a specific metaslab group, we only allow one thread to mark
+	 * the metaslab group at a time. This ensures that the value of
+	 * ms_initializing will be accurate when we decide to mark a metaslab
+	 * group as being initialized. To do this we force all other threads
+	 * to wait till the metaslab's mg_initialize_updating flag is no
+	 * longer set.
+	 */
+	vdev_initialize_mg_wait(mg);
+	mg->mg_initialize_updating = B_TRUE;
+	if (msp->ms_initializing == 0) {
+		vdev_initialize_mg_mark(mg);
+	}
+	mutex_enter(&msp->ms_lock);
+	msp->ms_initializing++;
+	mutex_exit(&msp->ms_lock);
+
+	mg->mg_initialize_updating = B_FALSE;
+	cv_broadcast(&mg->mg_ms_initialize_cv);
+	mutex_exit(&mg->mg_ms_initialize_lock);
+}
+
+static void
+vdev_initialize_ms_unmark(metaslab_t *msp)
+{
+	ASSERT(!MUTEX_HELD(&msp->ms_lock));
+	metaslab_group_t *mg = msp->ms_group;
+	mutex_enter(&mg->mg_ms_initialize_lock);
+	mutex_enter(&msp->ms_lock);
+	if (--msp->ms_initializing == 0) {
+		mg->mg_ms_initializing--;
+		cv_broadcast(&mg->mg_ms_initialize_cv);
+	}
+	mutex_exit(&msp->ms_lock);
+	mutex_exit(&mg->mg_ms_initialize_lock);
+}
+
+static void
+vdev_initialize_calculate_progress(vdev_t *vd)
+{
+	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+	ASSERT(vd->vdev_leaf_zap != 0);
+
+	vd->vdev_initialize_bytes_est = 0;
+	vd->vdev_initialize_bytes_done = 0;
+
+	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
+		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+		mutex_enter(&msp->ms_lock);
+
+		uint64_t ms_free = msp->ms_size -
+		    space_map_allocated(msp->ms_sm);
+
+		if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
+			ms_free /= vd->vdev_top->vdev_children;
+
+		/*
+		 * Convert the metaslab range to a physical range
+		 * on our vdev. We use this to determine if we are
+		 * in the middle of this metaslab range.
+		 */
+		range_seg_t logical_rs, physical_rs;
+		logical_rs.rs_start = msp->ms_start;
+		logical_rs.rs_end = msp->ms_start + msp->ms_size;
+		vdev_xlate(vd, &logical_rs, &physical_rs);
+
+		if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
+			vd->vdev_initialize_bytes_est += ms_free;
+			mutex_exit(&msp->ms_lock);
+			continue;
+		} else if (vd->vdev_initialize_last_offset >
+		    physical_rs.rs_end) {
+			vd->vdev_initialize_bytes_done += ms_free;
+			vd->vdev_initialize_bytes_est += ms_free;
+			mutex_exit(&msp->ms_lock);
+			continue;
+		}
+
+		/*
+		 * If we get here, we're in the middle of initializing this
+		 * metaslab. Load it and walk the free tree for more accurate
+		 * progress estimation.
+		 */
+		vdev_initialize_ms_load(msp);
+
+		for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
+		    rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
+			logical_rs.rs_start = rs->rs_start;
+			logical_rs.rs_end = rs->rs_end;
+			vdev_xlate(vd, &logical_rs, &physical_rs);
+
+			uint64_t size = physical_rs.rs_end -
+			    physical_rs.rs_start;
+			vd->vdev_initialize_bytes_est += size;
+			if (vd->vdev_initialize_last_offset >
+			    physical_rs.rs_end) {
+				vd->vdev_initialize_bytes_done += size;
+			} else if (vd->vdev_initialize_last_offset >
+			    physical_rs.rs_start &&
+			    vd->vdev_initialize_last_offset <
+			    physical_rs.rs_end) {
+				vd->vdev_initialize_bytes_done +=
+				    vd->vdev_initialize_last_offset -
+				    physical_rs.rs_start;
+			}
+		}
+		mutex_exit(&msp->ms_lock);
+	}
+}
+
+static int
+vdev_initialize_load(vdev_t *vd)
+{
+	int err = 0;
+	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+	ASSERT(vd->vdev_leaf_zap != 0);
+
+	if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
+	    vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
+		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+		    sizeof (vd->vdev_initialize_last_offset), 1,
+		    &vd->vdev_initialize_last_offset);
+		if (err == ENOENT) {
+			vd->vdev_initialize_last_offset = 0;
+			err = 0;
+		}
+	}
+
+	vdev_initialize_calculate_progress(vd);
+	return (err);
+}
+
+
+/*
+ * Convert the logical range into a physcial range and add it to our
+ * avl tree.
+ */
+void
+vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
+{
+	vdev_t *vd = arg;
+	range_seg_t logical_rs, physical_rs;
+	logical_rs.rs_start = start;
+	logical_rs.rs_end = start + size;
+
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+	vdev_xlate(vd, &logical_rs, &physical_rs);
+
+	IMPLY(vd->vdev_top == vd,
+	    logical_rs.rs_start == physical_rs.rs_start);
+	IMPLY(vd->vdev_top == vd,
+	    logical_rs.rs_end == physical_rs.rs_end);
+
+	/* Only add segments that we have not visited yet */
+	if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
+		return;
+
+	/* Pick up where we left off mid-range. */
+	if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
+		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
+		    "(%llu, %llu)", vd->vdev_path,
+		    (u_longlong_t)physical_rs.rs_start,
+		    (u_longlong_t)physical_rs.rs_end,
+		    (u_longlong_t)vd->vdev_initialize_last_offset,
+		    (u_longlong_t)physical_rs.rs_end);
+		ASSERT3U(physical_rs.rs_end, >,
+		    vd->vdev_initialize_last_offset);
+		physical_rs.rs_start = vd->vdev_initialize_last_offset;
+	}
+	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
+
+	/*
+	 * With raidz, it's possible that the logical range does not live on
+	 * this leaf vdev. We only add the physical range to this vdev's if it
+	 * has a length greater than 0.
+	 */
+	if (physical_rs.rs_end > physical_rs.rs_start) {
+		range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
+		    physical_rs.rs_end - physical_rs.rs_start);
+	} else {
+		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
+	}
+}
+
+static void
+vdev_initialize_thread(void *arg)
+{
+	vdev_t *vd = arg;
+	spa_t *spa = vd->vdev_spa;
+	int error = 0;
+	uint64_t ms_count = 0;
+
+	ASSERT(vdev_is_concrete(vd));
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+	vd->vdev_initialize_last_offset = 0;
+	VERIFY0(vdev_initialize_load(vd));
+
+	abd_t *deadbeef = vdev_initialize_block_alloc();
+
+	vd->vdev_initialize_tree = range_tree_create(NULL, NULL);
+
+	for (uint64_t i = 0; !vd->vdev_detached &&
+	    i < vd->vdev_top->vdev_ms_count; i++) {
+		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+
+		/*
+		 * If we've expanded the top-level vdev or it's our
+		 * first pass, calculate our progress.
+		 */
+		if (vd->vdev_top->vdev_ms_count != ms_count) {
+			vdev_initialize_calculate_progress(vd);
+			ms_count = vd->vdev_top->vdev_ms_count;
+		}
+
+		vdev_initialize_ms_mark(msp);
+		mutex_enter(&msp->ms_lock);
+		vdev_initialize_ms_load(msp);
+
+		range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
+		    vd);
+		mutex_exit(&msp->ms_lock);
+
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
+		error = vdev_initialize_ranges(vd, deadbeef);
+		vdev_initialize_ms_unmark(msp);
+		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+		range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
+		if (error != 0)
+			break;
+	}
+
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+	mutex_enter(&vd->vdev_initialize_io_lock);
+	while (vd->vdev_initialize_inflight > 0) {
+		cv_wait(&vd->vdev_initialize_io_cv,
+		    &vd->vdev_initialize_io_lock);
+	}
+	mutex_exit(&vd->vdev_initialize_io_lock);
+
+	range_tree_destroy(vd->vdev_initialize_tree);
+	vdev_initialize_block_free(deadbeef);
+	vd->vdev_initialize_tree = NULL;
+
+	mutex_enter(&vd->vdev_initialize_lock);
+	if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
+		vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
+	}
+	ASSERT(vd->vdev_initialize_thread != NULL ||
+	    vd->vdev_initialize_inflight == 0);
+
+	/*
+	 * Drop the vdev_initialize_lock while we sync out the
+	 * txg since it's possible that a device might be trying to
+	 * come online and must check to see if it needs to restart an
+	 * initialization. That thread will be holding the spa_config_lock
+	 * which would prevent the txg_wait_synced from completing.
+	 */
+	mutex_exit(&vd->vdev_initialize_lock);
+	txg_wait_synced(spa_get_dsl(spa), 0);
+	mutex_enter(&vd->vdev_initialize_lock);
+
+	vd->vdev_initialize_thread = NULL;
+	cv_broadcast(&vd->vdev_initialize_cv);
+	mutex_exit(&vd->vdev_initialize_lock);
+}
+
+/*
+ * Initiates a device. Caller must hold vdev_initialize_lock.
+ * Device must be a leaf and not already be initializing.
+ */
+void
+vdev_initialize(vdev_t *vd)
+{
+	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+	ASSERT(vdev_is_concrete(vd));
+	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+	ASSERT(!vd->vdev_detached);
+	ASSERT(!vd->vdev_initialize_exit_wanted);
+	ASSERT(!vd->vdev_top->vdev_removing);
+
+	vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
+	vd->vdev_initialize_thread = thread_create(NULL, 0,
+	    vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+}
+
+/*
+ * Stop initializng a device, with the resultant initialing state being
+ * tgt_state. Blocks until the initializing thread has exited.
+ * Caller must hold vdev_initialize_lock and must not be writing to the spa
+ * config, as the initializing thread may try to enter the config as a reader
+ * before exiting.
+ */
+void
+vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+	ASSERTV(spa_t *spa = vd->vdev_spa);
+	ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER));
+
+	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+	ASSERT(vdev_is_concrete(vd));
+
+	/*
+	 * Allow cancel requests to proceed even if the initialize thread
+	 * has stopped.
+	 */
+	if (vd->vdev_initialize_thread == NULL &&
+	    tgt_state != VDEV_INITIALIZE_CANCELED) {
+		return;
+	}
+
+	vdev_initialize_change_state(vd, tgt_state);
+	vd->vdev_initialize_exit_wanted = B_TRUE;
+	while (vd->vdev_initialize_thread != NULL)
+		cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
+
+	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+	vd->vdev_initialize_exit_wanted = B_FALSE;
+}
+
+static void
+vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+	if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
+		mutex_enter(&vd->vdev_initialize_lock);
+		vdev_initialize_stop(vd, tgt_state);
+		mutex_exit(&vd->vdev_initialize_lock);
+		return;
+	}
+
+	for (uint64_t i = 0; i < vd->vdev_children; i++) {
+		vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state);
+	}
+}
+
+/*
+ * Convenience function to stop initializing of a vdev tree and set all
+ * initialize thread pointers to NULL.
+ */
+void
+vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+	vdev_initialize_stop_all_impl(vd, tgt_state);
+
+	if (vd->vdev_spa->spa_sync_on) {
+		/* Make sure that our state has been synced to disk */
+		txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+	}
+}
+
+void
+vdev_initialize_restart(vdev_t *vd)
+{
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+	if (vd->vdev_leaf_zap != 0) {
+		mutex_enter(&vd->vdev_initialize_lock);
+		uint64_t initialize_state = VDEV_INITIALIZE_NONE;
+		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
+		    sizeof (initialize_state), 1, &initialize_state);
+		ASSERT(err == 0 || err == ENOENT);
+		vd->vdev_initialize_state = initialize_state;
+
+		uint64_t timestamp = 0;
+		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
+		    sizeof (timestamp), 1, &timestamp);
+		ASSERT(err == 0 || err == ENOENT);
+		vd->vdev_initialize_action_time = (time_t)timestamp;
+
+		if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
+		    vd->vdev_offline) {
+			/* load progress for reporting, but don't resume */
+			VERIFY0(vdev_initialize_load(vd));
+		} else if (vd->vdev_initialize_state ==
+		    VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) {
+			vdev_initialize(vd);
+		}
+
+		mutex_exit(&vd->vdev_initialize_lock);
+	}
+
+	for (uint64_t i = 0; i < vd->vdev_children; i++) {
+		vdev_initialize_restart(vd->vdev_child[i]);
+	}
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(vdev_initialize_restart);
+EXPORT_SYMBOL(vdev_xlate);
+EXPORT_SYMBOL(vdev_initialize_stop_all);
+EXPORT_SYMBOL(vdev_initialize);
+EXPORT_SYMBOL(vdev_initialize_stop);
+
+/* CSTYLED */
+module_param(zfs_initialize_value, ulong, 0644);
+MODULE_PARM_DESC(zfs_initialize_value,
+	"Value written during zpool initialize");
+#endif
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 65357d841..b45c05db2 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -740,6 +740,7 @@ vdev_ops_t vdev_mirror_ops = {
 	NULL,
 	NULL,
 	NULL,
+	vdev_default_xlate,
 	VDEV_TYPE_MIRROR,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
@@ -755,6 +756,7 @@ vdev_ops_t vdev_replacing_ops = {
 	NULL,
 	NULL,
 	NULL,
+	vdev_default_xlate,
 	VDEV_TYPE_REPLACING,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
@@ -770,6 +772,7 @@ vdev_ops_t vdev_spare_ops = {
 	NULL,
 	NULL,
 	NULL,
+	vdev_default_xlate,
 	VDEV_TYPE_SPARE,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c
index b1c039f16..d85993bff 100644
--- a/module/zfs/vdev_missing.c
+++ b/module/zfs/vdev_missing.c
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  */
 
 /*
@@ -90,6 +90,7 @@ vdev_ops_t vdev_missing_ops = {
 	NULL,
 	NULL,
 	NULL,
+	NULL,
 	VDEV_TYPE_MISSING,	/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
@@ -105,6 +106,7 @@ vdev_ops_t vdev_hole_ops = {
 	NULL,
 	NULL,
 	NULL,
+	NULL,
 	VDEV_TYPE_HOLE,		/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 89cdf7d81..939699cb8 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -154,6 +154,8 @@ uint32_t zfs_vdev_scrub_min_active = 1;
 uint32_t zfs_vdev_scrub_max_active = 2;
 uint32_t zfs_vdev_removal_min_active = 1;
 uint32_t zfs_vdev_removal_max_active = 2;
+uint32_t zfs_vdev_initializing_min_active = 1;
+uint32_t zfs_vdev_initializing_max_active = 1;
 
 /*
  * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -261,6 +263,8 @@ vdev_queue_class_min_active(zio_priority_t p)
 		return (zfs_vdev_scrub_min_active);
 	case ZIO_PRIORITY_REMOVAL:
 		return (zfs_vdev_removal_min_active);
+	case ZIO_PRIORITY_INITIALIZING:
+		return (zfs_vdev_initializing_min_active);
 	default:
 		panic("invalid priority %u", p);
 		return (0);
@@ -331,6 +335,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
 		return (zfs_vdev_scrub_max_active);
 	case ZIO_PRIORITY_REMOVAL:
 		return (zfs_vdev_removal_max_active);
+	case ZIO_PRIORITY_INITIALIZING:
+		return (zfs_vdev_initializing_max_active);
 	default:
 		panic("invalid priority %u", p);
 		return (0);
@@ -718,8 +724,8 @@ again:
 	}
 
 	/*
-	 * For LBA-ordered queues (async / scrub), issue the i/o which follows
-	 * the most recently issued i/o in LBA (offset) order.
+	 * For LBA-ordered queues (async / scrub / initializing), issue the
+	 * i/o which follows the most recently issued i/o in LBA (offset) order.
 	 *
 	 * For FIFO queues (sync), issue the i/o with the lowest timestamp.
 	 */
@@ -775,13 +781,15 @@ vdev_queue_io(zio_t *zio)
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
 		    zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
 		    zio->io_priority != ZIO_PRIORITY_SCRUB &&
-		    zio->io_priority != ZIO_PRIORITY_REMOVAL)
+		    zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+		    zio->io_priority != ZIO_PRIORITY_INITIALIZING)
 			zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
 	} else {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
 		    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
-		    zio->io_priority != ZIO_PRIORITY_REMOVAL)
+		    zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+		    zio->io_priority != ZIO_PRIORITY_INITIALIZING)
 			zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
 	}
 
@@ -938,11 +946,29 @@ module_param(zfs_vdev_async_write_min_active, int, 0644);
 MODULE_PARM_DESC(zfs_vdev_async_write_min_active,
 	"Min active async write I/Os per vdev");
 
+module_param(zfs_vdev_initializing_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_initializing_max_active,
+	"Max active initializing I/Os per vdev");
+
+module_param(zfs_vdev_initializing_min_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_initializing_min_active,
+	"Min active initializing I/Os per vdev");
+
+module_param(zfs_vdev_removal_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_removal_max_active,
+	"Max active removal I/Os per vdev");
+
+module_param(zfs_vdev_removal_min_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_removal_min_active,
+	"Min active removal I/Os per vdev");
+
 module_param(zfs_vdev_scrub_max_active, int, 0644);
-MODULE_PARM_DESC(zfs_vdev_scrub_max_active, "Max active scrub I/Os per vdev");
+MODULE_PARM_DESC(zfs_vdev_scrub_max_active,
+	"Max active scrub I/Os per vdev");
 
 module_param(zfs_vdev_scrub_min_active, int, 0644);
-MODULE_PARM_DESC(zfs_vdev_scrub_min_active, "Min active scrub I/Os per vdev");
+MODULE_PARM_DESC(zfs_vdev_scrub_min_active,
+	"Min active scrub I/Os per vdev");
 
 module_param(zfs_vdev_sync_read_max_active, int, 0644);
 MODULE_PARM_DESC(zfs_vdev_sync_read_max_active,
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index a21baf9c2..d10d89f3e 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -36,6 +36,10 @@
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_raidz_impl.h>
 
+#ifdef ZFS_DEBUG
+#include <sys/vdev_initialize.h>	/* vdev_xlate testing */
+#endif
+
 /*
  * Virtual device vector for RAID-Z.
  *
@@ -1627,6 +1631,39 @@ vdev_raidz_child_done(zio_t *zio)
 	rc->rc_skipped = 0;
 }
 
+static void
+vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
+{
+#ifdef ZFS_DEBUG
+	vdev_t *vd = zio->io_vd;
+	vdev_t *tvd = vd->vdev_top;
+
+	range_seg_t logical_rs, physical_rs;
+	logical_rs.rs_start = zio->io_offset;
+	logical_rs.rs_end = logical_rs.rs_start +
+	    vdev_raidz_asize(zio->io_vd, zio->io_size);
+
+	raidz_col_t *rc = &rm->rm_col[col];
+	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+	vdev_xlate(cvd, &logical_rs, &physical_rs);
+	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
+	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
+	/*
+	 * It would be nice to assert that rs_end is equal
+	 * to rc_offset + rc_size but there might be an
+	 * optional I/O at the end that is not accounted in
+	 * rc_size.
+	 */
+	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
+		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
+		    rc->rc_size + (1 << tvd->vdev_ashift));
+	} else {
+		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
+	}
+#endif
+}
+
 /*
  * Start an IO operation on a RAIDZ VDev
  *
@@ -1665,6 +1702,12 @@ vdev_raidz_io_start(zio_t *zio)
 		for (c = 0; c < rm->rm_cols; c++) {
 			rc = &rm->rm_col[c];
 			cvd = vd->vdev_child[rc->rc_devidx];
+
+			/*
+			 * Verify physical to logical translation.
+			 */
+			vdev_raidz_io_verify(zio, rm, c);
+
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
@@ -2323,6 +2366,37 @@ vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
 	return (B_FALSE);
 }
 
+static void
+vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
+{
+	vdev_t *raidvd = cvd->vdev_parent;
+	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
+
+	uint64_t width = raidvd->vdev_children;
+	uint64_t tgt_col = cvd->vdev_id;
+	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
+
+	/* make sure the offsets are block-aligned */
+	ASSERT0(in->rs_start % (1 << ashift));
+	ASSERT0(in->rs_end % (1 << ashift));
+	uint64_t b_start = in->rs_start >> ashift;
+	uint64_t b_end = in->rs_end >> ashift;
+
+	uint64_t start_row = 0;
+	if (b_start > tgt_col) /* avoid underflow */
+		start_row = ((b_start - tgt_col - 1) / width) + 1;
+
+	uint64_t end_row = 0;
+	if (b_end > tgt_col)
+		end_row = ((b_end - tgt_col - 1) / width) + 1;
+
+	res->rs_start = start_row << ashift;
+	res->rs_end = end_row << ashift;
+
+	ASSERT3U(res->rs_start, <=, in->rs_start);
+	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
+}
+
 vdev_ops_t vdev_raidz_ops = {
 	vdev_raidz_open,
 	vdev_raidz_close,
@@ -2334,6 +2408,7 @@ vdev_ops_t vdev_raidz_ops = {
 	NULL,
 	NULL,
 	NULL,
+	vdev_raidz_xlate,
 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index a706bc2a4..d0824aa84 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -44,6 +44,7 @@
 #include <sys/vdev_indirect_births.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/abd.h>
+#include <sys/vdev_initialize.h>
 #include <sys/trace_vdev.h>
 
 /*
@@ -1186,6 +1187,7 @@ vdev_remove_complete(spa_t *spa)
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 	txg = spa_vdev_enter(spa);
 	vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 
 	sysevent_t *ev = spa_event_create(spa, vd, NULL,
 	    ESC_ZFS_VDEV_REMOVE_DEV);
@@ -1896,6 +1898,9 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
 
 	spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
 
+	/* Stop initializing */
+	(void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
+
 	*txg = spa_vdev_config_enter(spa);
 
 	sysevent_t *ev = spa_event_create(spa, vd, NULL,
@@ -2072,6 +2077,13 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
 	 */
 	error = spa_reset_logs(spa);
 
+	/*
+	 * We stop any initializing that is currently in progress but leave
+	 * the state as "active". This will allow the initializing to resume
+	 * if the removal is canceled sometime later.
+	 */
+	vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
+
 	*txg = spa_vdev_config_enter(spa);
 
 	/*
@@ -2083,6 +2095,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
 
 	if (error != 0) {
 		metaslab_group_activate(mg);
+		spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
 		return (error);
 	}
 
diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c
index 9f86cbfa4..e40b7ce8e 100644
--- a/module/zfs/vdev_root.c
+++ b/module/zfs/vdev_root.c
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -150,6 +150,7 @@ vdev_ops_t vdev_root_ops = {
 	NULL,
 	NULL,
 	NULL,
+	NULL,
 	VDEV_TYPE_ROOT,		/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index a71da2837..3c36502d8 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -202,6 +202,8 @@
 #include <sys/zio_checksum.h>
 #include <sys/vdev_removal.h>
 #include <sys/zfs_sysfs.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
 
 #include <linux/miscdevice.h>
 #include <linux/slab.h>
@@ -3843,6 +3845,85 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
 }
 
 /*
+ * innvl: {
+ *     vdevs: {
+ *         guid 1, guid 2, ...
+ *     },
+ *     func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND}
+ * }
+ *
+ * outnvl: {
+ *     [func: EINVAL (if provided command type didn't make sense)],
+ *     [vdevs: {
+ *         guid1: errno, (see function body for possible errnos)
+ *         ...
+ *     }]
+ * }
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_pool_initialize[] = {
+	{ZPOOL_INITIALIZE_COMMAND,	DATA_TYPE_UINT64, 	0},
+	{ZPOOL_INITIALIZE_VDEVS,	DATA_TYPE_NVLIST,	0}
+};
+
+static int
+zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	spa_t *spa;
+	int error;
+
+	error = spa_open(poolname, &spa, FTAG);
+	if (error != 0)
+		return (error);
+
+	uint64_t cmd_type;
+	if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
+	    &cmd_type) != 0) {
+		spa_close(spa, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+	if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
+	    cmd_type == POOL_INITIALIZE_DO ||
+	    cmd_type == POOL_INITIALIZE_SUSPEND)) {
+		spa_close(spa, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	nvlist_t *vdev_guids;
+	if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS,
+	    &vdev_guids) != 0) {
+		spa_close(spa, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	nvlist_t *vdev_errlist = fnvlist_alloc();
+	int total_errors = 0;
+
+	for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
+		uint64_t vdev_guid = fnvpair_value_uint64(pair);
+
+		error = spa_vdev_initialize(spa, vdev_guid, cmd_type);
+		if (error != 0) {
+			char guid_as_str[MAXNAMELEN];
+
+			(void) snprintf(guid_as_str, sizeof (guid_as_str),
+			    "%llu", (unsigned long long)vdev_guid);
+			fnvlist_add_int64(vdev_errlist, guid_as_str, error);
+			total_errors++;
+		}
+	}
+	if (fnvlist_size(vdev_errlist) > 0) {
+		fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS,
+		    vdev_errlist);
+	}
+	fnvlist_free(vdev_errlist);
+
+	spa_close(spa, FTAG);
+	return (total_errors > 0 ? EINVAL : 0);
+}
+
+/*
  * fsname is name of dataset to rollback (to most recent snapshot)
  *
  * innvl may contain name of expected target snapshot
@@ -6453,6 +6534,11 @@ zfs_ioctl_init(void)
 	    zfs_keys_pool_discard_checkpoint,
 	    ARRAY_SIZE(zfs_keys_pool_discard_checkpoint));
 
+	zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE,
+	    zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+	    zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize));
+
 	/* IOCTLS that use the legacy function signature */
 
 	zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
diff --git a/module/zfs/zfs_sysfs.c b/module/zfs/zfs_sysfs.c
index b17c91f65..87c4ac117 100644
--- a/module/zfs/zfs_sysfs.c
+++ b/module/zfs/zfs_sysfs.c
@@ -358,6 +358,7 @@ pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
  */
 static const char *zfs_features[]  = {
 	/* --> Add new kernel features here (post ZoL 0.8.0) */
+	"vdev_initialize"
 };
 
 #define	ZFS_FEATURE_COUNT	ARRAY_SIZE(zfs_features)