aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--cmd/zpool/zpool_main.c154
-rw-r--r--cmd/ztest/ztest.c94
-rw-r--r--configure.ac1
-rw-r--r--include/libzfs.h5
-rw-r--r--include/libzfs_core.h2
-rw-r--r--include/sys/Makefile.am1
-rw-r--r--include/sys/fs/zfs.h38
-rw-r--r--include/sys/metaslab_impl.h10
-rw-r--r--include/sys/spa.h2
-rw-r--r--include/sys/vdev_impl.h31
-rw-r--r--include/sys/vdev_initialize.h46
-rw-r--r--include/sys/zio_priority.h3
-rw-r--r--lib/libzfs/libzfs_pool.c94
-rw-r--r--lib/libzfs/libzfs_util.c7
-rw-r--r--lib/libzfs_core/libzfs_core.c37
-rw-r--r--lib/libzpool/Makefile.am1
-rw-r--r--man/man5/zfs-module-parameters.559
-rw-r--r--man/man8/zpool.831
-rw-r--r--module/zfs/Makefile.in1
-rw-r--r--module/zfs/metaslab.c23
-rw-r--r--module/zfs/spa.c155
-rw-r--r--module/zfs/spa_misc.c7
-rw-r--r--module/zfs/vdev.c46
-rw-r--r--module/zfs/vdev_disk.c1
-rw-r--r--module/zfs/vdev_file.c4
-rw-r--r--module/zfs/vdev_indirect.c1
-rw-r--r--module/zfs/vdev_initialize.c819
-rw-r--r--module/zfs/vdev_mirror.c3
-rw-r--r--module/zfs/vdev_missing.c4
-rw-r--r--module/zfs/vdev_queue.c38
-rw-r--r--module/zfs/vdev_raidz.c75
-rw-r--r--module/zfs/vdev_removal.c13
-rw-r--r--module/zfs/vdev_root.c3
-rw-r--r--module/zfs/zfs_ioctl.c86
-rw-r--r--module/zfs/zfs_sysfs.c1
-rw-r--r--tests/runfiles/linux.run15
-rw-r--r--tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c19
-rw-r--r--tests/zfs-tests/include/commands.cfg1
-rw-r--r--tests/zfs-tests/tests/functional/cli_root/Makefile.am1
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh2
-rw-r--r--tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile.am18
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh31
-rw-r--r--tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib43
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh68
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh78
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh66
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh74
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh64
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh60
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh52
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh63
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh74
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh59
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh89
54 files changed, 2743 insertions, 30 deletions
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 8a4ac35d5..4e6a699c4 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -97,6 +97,7 @@ static int zpool_do_detach(int, char **);
static int zpool_do_replace(int, char **);
static int zpool_do_split(int, char **);
+static int zpool_do_initialize(int, char **);
static int zpool_do_scrub(int, char **);
static int zpool_do_resilver(int, char **);
@@ -150,6 +151,7 @@ typedef enum {
HELP_ONLINE,
HELP_REPLACE,
HELP_REMOVE,
+ HELP_INITIALIZE,
HELP_SCRUB,
HELP_RESILVER,
HELP_STATUS,
@@ -278,6 +280,7 @@ static zpool_command_t command_table[] = {
{ "replace", zpool_do_replace, HELP_REPLACE },
{ "split", zpool_do_split, HELP_SPLIT },
{ NULL },
+ { "initialize", zpool_do_initialize, HELP_INITIALIZE },
{ "scrub", zpool_do_scrub, HELP_SCRUB },
{ "resilver", zpool_do_resilver, HELP_RESILVER },
{ NULL },
@@ -360,6 +363,8 @@ get_usage(zpool_help_t idx)
return (gettext("\tremove [-nps] <pool> <device> ...\n"));
case HELP_REOPEN:
return (gettext("\treopen [-n] <pool>\n"));
+ case HELP_INITIALIZE:
+ return (gettext("\tinitialize [-cs] <pool> [<device> ...]\n"));
case HELP_SCRUB:
return (gettext("\tscrub [-s | -p] <pool> ...\n"));
case HELP_RESILVER:
@@ -393,6 +398,27 @@ get_usage(zpool_help_t idx)
/* NOTREACHED */
}
+static void
+zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res)
+{
+ uint_t children = 0;
+ nvlist_t **child;
+ uint_t i;
+
+ (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &child, &children);
+
+ if (children == 0) {
+ char *path = zpool_vdev_name(g_zfs, zhp, nvroot, B_FALSE);
+ fnvlist_add_boolean(res, path);
+ free(path);
+ return;
+ }
+
+ for (i = 0; i < children; i++) {
+ zpool_collect_leaves(zhp, child[i], res);
+ }
+}
/*
* Callback routine that will print out a pool property value.
@@ -480,6 +506,97 @@ usage(boolean_t requested)
}
/*
+ * zpool initialize [-cs] <pool> [<vdev> ...]
+ * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool
+ * if none specified.
+ *
+ * -c Cancel. Ends active initializing.
+ * -s Suspend. Initializing can then be restarted with no flags.
+ */
+int
+zpool_do_initialize(int argc, char **argv)
+{
+ int c;
+ char *poolname;
+ zpool_handle_t *zhp;
+ nvlist_t *vdevs;
+ int err = 0;
+
+ struct option long_options[] = {
+ {"cancel", no_argument, NULL, 'c'},
+ {"suspend", no_argument, NULL, 's'},
+ {0, 0, 0, 0}
+ };
+
+ pool_initialize_func_t cmd_type = POOL_INITIALIZE_DO;
+ while ((c = getopt_long(argc, argv, "cs", long_options, NULL)) != -1) {
+ switch (c) {
+ case 'c':
+ if (cmd_type != POOL_INITIALIZE_DO) {
+ (void) fprintf(stderr, gettext("-c cannot be "
+ "combined with other options\n"));
+ usage(B_FALSE);
+ }
+ cmd_type = POOL_INITIALIZE_CANCEL;
+ break;
+ case 's':
+ if (cmd_type != POOL_INITIALIZE_DO) {
+ (void) fprintf(stderr, gettext("-s cannot be "
+ "combined with other options\n"));
+ usage(B_FALSE);
+ }
+ cmd_type = POOL_INITIALIZE_SUSPEND;
+ break;
+ case '?':
+ if (optopt != 0) {
+ (void) fprintf(stderr,
+ gettext("invalid option '%c'\n"), optopt);
+ } else {
+ (void) fprintf(stderr,
+ gettext("invalid option '%s'\n"),
+ argv[optind - 1]);
+ }
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name argument\n"));
+ usage(B_FALSE);
+ return (-1);
+ }
+
+ poolname = argv[0];
+ zhp = zpool_open(g_zfs, poolname);
+ if (zhp == NULL)
+ return (-1);
+
+ vdevs = fnvlist_alloc();
+ if (argc == 1) {
+ /* no individual leaf vdevs specified, so add them all */
+ nvlist_t *config = zpool_get_config(zhp, NULL);
+ nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE);
+ zpool_collect_leaves(zhp, nvroot, vdevs);
+ } else {
+ int i;
+ for (i = 1; i < argc; i++) {
+ fnvlist_add_boolean(vdevs, argv[i]);
+ }
+ }
+
+ err = zpool_initialize(zhp, cmd_type, vdevs);
+
+ fnvlist_free(vdevs);
+ zpool_close(zhp);
+
+ return (err);
+}
+
+/*
* print a pool vdev config for dry runs
*/
static void
@@ -1923,6 +2040,43 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
}
}
+ if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE ||
+ vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
+ vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) &&
+ !vs->vs_scan_removing) {
+ char zbuf[1024];
+ char tbuf[256];
+ struct tm zaction_ts;
+
+ time_t t = vs->vs_initialize_action_time;
+ int initialize_pct = 100;
+ if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) {
+ initialize_pct = (vs->vs_initialize_bytes_done * 100 /
+ (vs->vs_initialize_bytes_est + 1));
+ }
+
+ (void) localtime_r(&t, &zaction_ts);
+ (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
+
+ switch (vs->vs_initialize_state) {
+ case VDEV_INITIALIZE_SUSPENDED:
+ (void) snprintf(zbuf, sizeof (zbuf),
+ ", suspended, started at %s", tbuf);
+ break;
+ case VDEV_INITIALIZE_ACTIVE:
+ (void) snprintf(zbuf, sizeof (zbuf),
+ ", started at %s", tbuf);
+ break;
+ case VDEV_INITIALIZE_COMPLETE:
+ (void) snprintf(zbuf, sizeof (zbuf),
+ ", completed at %s", tbuf);
+ break;
+ }
+
+ (void) printf(gettext(" (%d%% initialized%s)"),
+ initialize_pct, zbuf);
+ }
+
(void) printf("\n");
for (c = 0; c < children; c++) {
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index f9ba9b6d0..385984f84 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -106,6 +106,7 @@
#include <sys/zil_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
+#include <sys/vdev_initialize.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
@@ -374,6 +375,7 @@ ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_remap_blocks;
ztest_func_t ztest_spa_checkpoint_create_discard;
+ztest_func_t ztest_initialize;
ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;
@@ -427,6 +429,7 @@ ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
ZTI_INIT(ztest_remap_blocks, 1, &zopt_sometimes),
ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
+ ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
@@ -6343,6 +6346,97 @@ ztest_get_zdb_bin(char *bin, int len)
strcpy(bin, "zdb");
}
+static vdev_t *
+ztest_random_concrete_vdev_leaf(vdev_t *vd)
+{
+ if (vd == NULL)
+ return (NULL);
+
+ if (vd->vdev_children == 0)
+ return (vd);
+
+ vdev_t *eligible[vd->vdev_children];
+ int eligible_idx = 0, i;
+ for (i = 0; i < vd->vdev_children; i++) {
+ vdev_t *cvd = vd->vdev_child[i];
+ if (cvd->vdev_top->vdev_removing)
+ continue;
+ if (cvd->vdev_children > 0 ||
+ (vdev_is_concrete(cvd) && !cvd->vdev_detached)) {
+ eligible[eligible_idx++] = cvd;
+ }
+ }
+ VERIFY(eligible_idx > 0);
+
+ uint64_t child_no = ztest_random(eligible_idx);
+ return (ztest_random_concrete_vdev_leaf(eligible[child_no]));
+}
+
+/* ARGSUSED */
+void
+ztest_initialize(ztest_ds_t *zd, uint64_t id)
+{
+ spa_t *spa = ztest_spa;
+ int error = 0;
+
+ mutex_enter(&ztest_vdev_lock);
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ /* Random leaf vdev */
+ vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev);
+ if (rand_vd == NULL) {
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /*
+ * The random vdev we've selected may change as soon as we
+ * drop the spa_config_lock. We create local copies of things
+ * we're interested in.
+ */
+ uint64_t guid = rand_vd->vdev_guid;
+ char *path = strdup(rand_vd->vdev_path);
+ boolean_t active = rand_vd->vdev_initialize_thread != NULL;
+
+ zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS);
+ error = spa_vdev_initialize(spa, guid, cmd);
+ switch (cmd) {
+ case POOL_INITIALIZE_CANCEL:
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("Cancel initialize %s", path);
+ if (!active)
+ (void) printf(" failed (no initialize active)");
+ (void) printf("\n");
+ }
+ break;
+ case POOL_INITIALIZE_DO:
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("Start initialize %s", path);
+ if (active && error == 0)
+ (void) printf(" failed (already active)");
+ else if (error != 0)
+ (void) printf(" failed (error %d)", error);
+ (void) printf("\n");
+ }
+ break;
+ case POOL_INITIALIZE_SUSPEND:
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("Suspend initialize %s", path);
+ if (!active)
+ (void) printf(" failed (no initialize active)");
+ (void) printf("\n");
+ }
+ break;
+ }
+ free(path);
+ mutex_exit(&ztest_vdev_lock);
+}
+
/*
* Verify pool integrity by running zdb.
*/
diff --git a/configure.ac b/configure.ac
index 028cae338..be3e556c1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -247,6 +247,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/cli_root/zpool_history/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_import/blockfiles/Makefile
+ tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_offline/Makefile
diff --git a/include/libzfs.h b/include/libzfs.h
index c764e92dd..85b0bc0dd 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -139,6 +139,9 @@ typedef enum zfs_error {
EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */
EZFS_VDEV_TOO_BIG, /* a device is too big to be used */
EZFS_IOC_NOTSUPPORTED, /* operation not supported by zfs module */
+ EZFS_TOOMANY, /* argument list too long */
+ EZFS_INITIALIZING, /* currently initializing */
+ EZFS_NO_INITIALIZE, /* no active initialize */
EZFS_UNKNOWN
} zfs_error_t;
@@ -253,6 +256,8 @@ typedef struct splitflags {
* Functions to manipulate pool and vdev state
*/
extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t);
+extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t,
+ nvlist_t *);
extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
extern int zpool_reguid(zpool_handle_t *);
extern int zpool_reopen_one(zpool_handle_t *, void *);
diff --git a/include/libzfs_core.h b/include/libzfs_core.h
index f84270d7e..264ce3fa0 100644
--- a/include/libzfs_core.h
+++ b/include/libzfs_core.h
@@ -62,6 +62,8 @@ int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **);
int lzc_load_key(const char *, boolean_t, uint8_t *, uint_t);
int lzc_unload_key(const char *);
int lzc_change_key(const char *, uint64_t, nvlist_t *, uint8_t *, uint_t);
+int lzc_initialize(const char *, pool_initialize_func_t, nvlist_t *,
+ nvlist_t **);
int lzc_snaprange_space(const char *, const char *, uint64_t *);
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index 8bf376998..e6c82d113 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -96,6 +96,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/vdev_impl.h \
$(top_srcdir)/include/sys/vdev_indirect_births.h \
$(top_srcdir)/include/sys/vdev_indirect_mapping.h \
+ $(top_srcdir)/include/sys/vdev_initialize.h \
$(top_srcdir)/include/sys/vdev_raidz.h \
$(top_srcdir)/include/sys/vdev_raidz_impl.h \
$(top_srcdir)/include/sys/vdev_removal.h \
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 85512618c..945853739 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -775,6 +775,13 @@ typedef struct zpool_load_policy {
#define VDEV_ALLOC_BIAS_SPECIAL "special"
#define VDEV_ALLOC_BIAS_DEDUP "dedup"
+#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \
+ "com.delphix:next_offset_to_initialize"
+#define VDEV_LEAF_ZAP_INITIALIZE_STATE \
+ "com.delphix:vdev_initialize_state"
+#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \
+ "com.delphix:vdev_initialize_action_time"
+
/*
* This is needed in userland to report the minimum necessary device size.
*/
@@ -988,10 +995,15 @@ typedef struct vdev_stat {
uint64_t vs_read_errors; /* read errors */
uint64_t vs_write_errors; /* write errors */
uint64_t vs_checksum_errors; /* checksum errors */
+ uint64_t vs_initialize_errors; /* initializing errors */
uint64_t vs_self_healed; /* self-healed bytes */
uint64_t vs_scan_removing; /* removing? */
uint64_t vs_scan_processed; /* scan processed bytes */
uint64_t vs_fragmentation; /* device fragmentation */
+ uint64_t vs_initialize_bytes_done; /* bytes initialized */
+ uint64_t vs_initialize_bytes_est; /* total bytes to initialize */
+ uint64_t vs_initialize_state; /* vdev_initialzing_state_t */
+ uint64_t vs_initialize_action_time; /* time_t */
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
uint64_t vs_resilver_deferred; /* resilver deferred */
uint64_t vs_slow_ios; /* slow IOs */
@@ -1023,7 +1035,6 @@ typedef struct vdev_stat_ex {
#define VDEV_L_HISTO_BUCKETS 37 /* Latency histo buckets */
#define VDEV_RQ_HISTO_BUCKETS 25 /* Request size histo buckets */
-
/* Amount of time in ZIO queue (ns) */
uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE]
[VDEV_L_HISTO_BUCKETS];
@@ -1051,6 +1062,16 @@ typedef struct vdev_stat_ex {
} vdev_stat_ex_t;
/*
+ * Initialize functions.
+ */
+typedef enum pool_initialize_func {
+ POOL_INITIALIZE_DO,
+ POOL_INITIALIZE_CANCEL,
+ POOL_INITIALIZE_SUSPEND,
+ POOL_INITIALIZE_FUNCS
+} pool_initialize_func_t;
+
+/*
* DDT statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
*/
@@ -1094,6 +1115,14 @@ typedef struct ddt_histogram {
#define ZVOL_PROP_NAME "name"
#define ZVOL_DEFAULT_BLOCKSIZE 8192
+typedef enum {
+ VDEV_INITIALIZE_NONE,
+ VDEV_INITIALIZE_ACTIVE,
+ VDEV_INITIALIZE_CANCELED,
+ VDEV_INITIALIZE_SUSPENDED,
+ VDEV_INITIALIZE_COMPLETE
+} vdev_initializing_state_t;
+
/*
* /dev/zfs ioctl numbers.
*
@@ -1184,6 +1213,7 @@ typedef enum zfs_ioc {
ZFS_IOC_REMAP, /* 0x5a4c */
ZFS_IOC_POOL_CHECKPOINT, /* 0x5a4d */
ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */
+ ZFS_IOC_POOL_INITIALIZE, /* 0x5a4f */
/*
* Linux - 3/64 numbers reserved.
@@ -1278,6 +1308,12 @@ typedef enum {
#define ZPOOL_HIDDEN_ARGS "hidden_args"
/*
+ * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE.
+ */
+#define ZPOOL_INITIALIZE_COMMAND "initialize_command"
+#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs"
+
+/*
* Flags for ZFS_IOC_VDEV_SET_STATE
*/
#define ZFS_ONLINE_CHECKREMOVE 0x1
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index aa1c82a02..3e32eace6 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -68,7 +68,8 @@ typedef enum trace_alloc_type {
TRACE_GROUP_FAILURE = -5ULL,
TRACE_ENOSPC = -6ULL,
TRACE_CONDENSING = -7ULL,
- TRACE_VDEV_ERROR = -8ULL
+ TRACE_VDEV_ERROR = -8ULL,
+ TRACE_INITIALIZING = -9ULL
} trace_alloc_type_t;
#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
@@ -270,6 +271,11 @@ struct metaslab_group {
uint64_t mg_failed_allocations;
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+ int mg_ms_initializing;
+ boolean_t mg_initialize_updating;
+ kmutex_t mg_ms_initialize_lock;
+ kcondvar_t mg_ms_initialize_cv;
};
/*
@@ -360,6 +366,8 @@ struct metaslab {
boolean_t ms_condense_wanted;
uint64_t ms_condense_checked_txg;
+ uint64_t ms_initializing; /* leaves initializing this ms */
+
/*
* We must hold both ms_lock and ms_group->mg_lock in order to
* modify ms_loaded.
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 5dc27e334..4a66260ef 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -772,6 +772,7 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
#define SPA_ASYNC_AUTOEXPAND 0x20
#define SPA_ASYNC_REMOVE_DONE 0x40
#define SPA_ASYNC_REMOVE_STOP 0x80
+#define SPA_ASYNC_INITIALIZE_RESTART 0x100
/*
* Controls the behavior of spa_vdev_remove().
@@ -787,6 +788,7 @@ extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
int replace_done);
extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
extern boolean_t spa_vdev_remove_active(spa_t *spa);
+extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 6c13a548f..ae21e037e 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -82,6 +82,12 @@ typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd,
uint64_t offset, uint64_t size, void *arg);
typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
vdev_remap_cb_t callback, void *arg);
+/*
+ * Given a target vdev, translates the logical range "in" to the physical
+ * range "res"
+ */
+typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in,
+ range_seg_t *res);
typedef const struct vdev_ops {
vdev_open_func_t *vdev_op_open;
@@ -94,6 +100,11 @@ typedef const struct vdev_ops {
vdev_hold_func_t *vdev_op_hold;
vdev_rele_func_t *vdev_op_rele;
vdev_remap_func_t *vdev_op_remap;
+ /*
+ * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves.
+ * Used when initializing vdevs. Isn't used by leaf ops.
+ */
+ vdev_xlation_func_t *vdev_op_xlate;
char vdev_op_type[16];
boolean_t vdev_op_leaf;
} vdev_ops_t;
@@ -250,6 +261,24 @@ struct vdev {
/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
+ boolean_t vdev_initialize_exit_wanted;
+ vdev_initializing_state_t vdev_initialize_state;
+ kthread_t *vdev_initialize_thread;
+ /* Protects vdev_initialize_thread and vdev_initialize_state. */
+ kmutex_t vdev_initialize_lock;
+ kcondvar_t vdev_initialize_cv;
+ uint64_t vdev_initialize_offset[TXG_SIZE];
+ uint64_t vdev_initialize_last_offset;
+ range_tree_t *vdev_initialize_tree; /* valid while initializing */
+ uint64_t vdev_initialize_bytes_est;
+ uint64_t vdev_initialize_bytes_done;
+ time_t vdev_initialize_action_time; /* start and end time */
+
+ /* for limiting outstanding I/Os */
+ kmutex_t vdev_initialize_io_lock;
+ kcondvar_t vdev_initialize_io_cv;
+ uint64_t vdev_initialize_inflight;
+
/*
* Values stored in the config for an indirect or removing vdev.
*/
@@ -478,6 +507,8 @@ extern vdev_ops_t vdev_indirect_ops;
/*
* Common size functions
*/
+extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in,
+ range_seg_t *out);
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
extern uint64_t vdev_get_min_asize(vdev_t *vd);
extern void vdev_set_min_asize(vdev_t *vd);
diff --git a/include/sys/vdev_initialize.h b/include/sys/vdev_initialize.h
new file mode 100644
index 000000000..db4b0572c
--- /dev/null
+++ b/include/sys/vdev_initialize.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_INITIALIZE_H
+#define _SYS_VDEV_INITIALIZE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void vdev_initialize(vdev_t *vd);
+extern void vdev_initialize_stop(vdev_t *vd,
+ vdev_initializing_state_t tgt_state);
+extern void vdev_initialize_stop_all(vdev_t *vd,
+ vdev_initializing_state_t tgt_state);
+extern void vdev_initialize_restart(vdev_t *vd);
+extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
+ range_seg_t *physical_rs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_INITIALIZE_H */
diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h
index c2cc8b2d5..d8e6a1745 100644
--- a/include/sys/zio_priority.h
+++ b/include/sys/zio_priority.h
@@ -13,7 +13,7 @@
* CDDL HEADER END
*/
/*
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
*/
#ifndef _ZIO_PRIORITY_H
#define _ZIO_PRIORITY_H
@@ -29,6 +29,7 @@ typedef enum zio_priority {
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */
+ ZIO_PRIORITY_INITIALIZING, /* initializing I/O */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */
} zio_priority_t;
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index bc320e516..f799471e4 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -2092,6 +2092,100 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
return (ret);
}
+static int
+xlate_init_err(int err)
+{
+ switch (err) {
+ case ENODEV:
+ return (EZFS_NODEVICE);
+ case EINVAL:
+ case EROFS:
+ return (EZFS_BADDEV);
+ case EBUSY:
+ return (EZFS_INITIALIZING);
+ case ESRCH:
+ return (EZFS_NO_INITIALIZE);
+ }
+ return (err);
+}
+
+/*
+ * Begin, suspend, or cancel the initialization (initializing of all free
+ * blocks) for the given vdevs in the given pool.
+ */
+int
+zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
+ nvlist_t *vds)
+{
+ char msg[1024];
+ libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+ nvlist_t *errlist;
+
+ /* translate vdev names to guids */
+ nvlist_t *vdev_guids = fnvlist_alloc();
+ nvlist_t *guids_to_paths = fnvlist_alloc();
+ boolean_t spare, cache;
+ nvlist_t *tgt;
+ nvpair_t *elem;
+
+ for (elem = nvlist_next_nvpair(vds, NULL); elem != NULL;
+ elem = nvlist_next_nvpair(vds, elem)) {
+ char *vd_path = nvpair_name(elem);
+ tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache, NULL);
+
+ if ((tgt == NULL) || cache || spare) {
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "cannot initialize '%s'"),
+ vd_path);
+ int err = (tgt == NULL) ? EZFS_NODEVICE :
+ (spare ? EZFS_ISSPARE : EZFS_ISL2CACHE);
+ fnvlist_free(vdev_guids);
+ fnvlist_free(guids_to_paths);
+ return (zfs_error(hdl, err, msg));
+ }
+
+ uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);
+ fnvlist_add_uint64(vdev_guids, vd_path, guid);
+
+ (void) snprintf(msg, sizeof (msg), "%llu", (u_longlong_t)guid);
+ fnvlist_add_string(guids_to_paths, msg, vd_path);
+ }
+
+ int err = lzc_initialize(zhp->zpool_name, cmd_type, vdev_guids,
+ &errlist);
+ fnvlist_free(vdev_guids);
+
+ if (err == 0) {
+ fnvlist_free(guids_to_paths);
+ return (0);
+ }
+
+ nvlist_t *vd_errlist = NULL;
+ if (errlist != NULL) {
+ vd_errlist = fnvlist_lookup_nvlist(errlist,
+ ZPOOL_INITIALIZE_VDEVS);
+ }
+
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "operation failed"));
+
+ for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL;
+ elem = nvlist_next_nvpair(vd_errlist, elem)) {
+ int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem));
+ char *path = fnvlist_lookup_string(guids_to_paths,
+ nvpair_name(elem));
+ (void) zfs_error_fmt(hdl, vd_error, "cannot initialize '%s'",
+ path);
+ }
+
+ fnvlist_free(guids_to_paths);
+ if (vd_errlist != NULL)
+ return (-1);
+
+ return (zpool_standard_error(hdl, err, msg));
+}
+
/*
* Scan the pool.
*/
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index 776d887a2..d7401cdf4 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -283,6 +283,13 @@ libzfs_error_description(libzfs_handle_t *hdl)
"different host"));
case EZFS_CRYPTOFAILED:
return (dgettext(TEXT_DOMAIN, "encryption failure"));
+ case EZFS_TOOMANY:
+ return (dgettext(TEXT_DOMAIN, "argument list too long"));
+ case EZFS_INITIALIZING:
+ return (dgettext(TEXT_DOMAIN, "currently initializing"));
+ case EZFS_NO_INITIALIZE:
+ return (dgettext(TEXT_DOMAIN, "there is no active "
+ "initialization"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c
index 1baec04a4..524a637e4 100644
--- a/lib/libzfs_core/libzfs_core.c
+++ b/lib/libzfs_core/libzfs_core.c
@@ -1397,3 +1397,40 @@ lzc_reopen(const char *pool_name, boolean_t scrub_restart)
nvlist_free(args);
return (error);
}
+
+/*
+ * Changes initializing state.
+ *
+ * vdevs should be a list of (<key>, guid) where guid is a uint64 vdev GUID.
+ * The key is ignored.
+ *
+ * If there are errors related to vdev arguments, per-vdev errors are returned
+ * in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where
+ * guid is stringified with PRIu64, and errno is one of the following as
+ * an int64_t:
+ * - ENODEV if the device was not found
+ * - EINVAL if the devices is not a leaf or is not concrete (e.g. missing)
+ * - EROFS if the device is not writeable
+ * - EBUSY start requested but the device is already being initialized
+ * - ESRCH cancel/suspend requested but device is not being initialized
+ *
+ * If the errlist is empty, then return value will be:
+ * - EINVAL if one or more arguments was invalid
+ * - Other spa_open failures
+ * - 0 if the operation succeeded
+ */
+int
+lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type,
+ nvlist_t *vdevs, nvlist_t **errlist)
+{
+ int error;
+ nvlist_t *args = fnvlist_alloc();
+ fnvlist_add_uint64(args, ZPOOL_INITIALIZE_COMMAND, (uint64_t)cmd_type);
+ fnvlist_add_nvlist(args, ZPOOL_INITIALIZE_VDEVS, vdevs);
+
+ error = lzc_ioctl(ZFS_IOC_POOL_INITIALIZE, poolname, args, errlist);
+
+ fnvlist_free(args);
+
+ return (error);
+}
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index efc44b27e..e13bb0f58 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -113,6 +113,7 @@ KERNEL_C = \
vdev_indirect_births.c \
vdev_indirect.c \
vdev_indirect_mapping.c \
+ vdev_initialize.c \
vdev_label.c \
vdev_mirror.c \
vdev_missing.c \
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 55115c266..1dbf865f4 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1375,6 +1375,30 @@ Default value: \fB2\fR.
.sp
.ne 2
.na
+\fBzfs_vdev_initializing_max_active\fR (int)
+.ad
+.RS 12n
+Maximum initializing I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_initializing_min_active\fR (int)
+.ad
+.RS 12n
+Minimum initializing I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
\fBzfs_vdev_max_active\fR (int)
.ad
.RS 12n
@@ -1388,6 +1412,30 @@ Default value: \fB1,000\fR.
.sp
.ne 2
.na
+\fBzfs_vdev_removal_max_active\fR (int)
+.ad
+.RS 12n
+Maximum removal I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB2\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_removal_min_active\fR (int)
+.ad
+.RS 12n
+Minimum removal I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
\fBzfs_vdev_scrub_max_active\fR (int)
.ad
.RS 12n
@@ -1615,6 +1663,17 @@ Default value: \fB32,768\fR.
.sp
.ne 2
.na
+\fBzfs_initialize_value\fR (ulong)
+.ad
+.RS 12n
+Pattern written to vdev free space by \fBzpool initialize\fR.
+.sp
+Default value: \fB16,045,690,984,833,335,022\fR (0xdeadbeefdeadbeee).
+.RE
+
+.sp
+.ne 2
+.na
\fBzfs_lua_max_instrlimit\fR (ulong)
.ad
.RS 12n
diff --git a/man/man8/zpool.8 b/man/man8/zpool.8
index 83e50bd01..a05d4d1f3 100644
--- a/man/man8/zpool.8
+++ b/man/man8/zpool.8
@@ -115,6 +115,11 @@
.Ar pool Ns | Ns Ar id
.Op Ar newpool Oo Fl t Oc
.Nm
+.Cm initialize
+.Op Fl cs
+.Ar pool
+.Op Ar device Ns ...
+.Nm
.Cm iostat
.Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw
.Op Fl T Sy u Ns | Ns Sy d
@@ -1597,6 +1602,32 @@ Will also set -o cachefile=none when not explicitly specified.
.El
.It Xo
.Nm
+.Cm initialize
+.Op Fl cs
+.Ar pool
+.Op Ar device Ns ...
+.Xc
+Begins initializing by writing to all unallocated regions on the specified
+devices, or all eligible devices in the pool if no individual devices are
+specified.
+Only leaf data or log devices may be initialized.
+.Bl -tag -width Ds
+.It Fl c, -cancel
+Cancel initializing on the specified devices, or all eligible devices if none
+are specified.
+If one or more target devices are invalid or are not currently being
+initialized, the command will fail and no cancellation will occur on any device.
+.It Fl s -suspend
+Suspend initializing on the specified devices, or all eligible devices if none
+are specified.
+If one or more target devices are invalid or are not currently being
+initialized, the command will fail and no suspension will occur on any device.
+Initializing can then be resumed by running
+.Nm zpool Cm initialize
+with no flags on the relevant target devices.
+.El
+.It Xo
+.Nm
.Cm iostat
.Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw
.Op Fl T Sy u Ns | Ns Sy d
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index a243f51d8..193bdc510 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -89,6 +89,7 @@ $(MODULE)-objs += vdev_file.o
$(MODULE)-objs += vdev_indirect.o
$(MODULE)-objs += vdev_indirect_births.o
$(MODULE)-objs += vdev_indirect_mapping.o
+$(MODULE)-objs += vdev_initialize.o
$(MODULE)-objs += vdev_label.o
$(MODULE)-objs += vdev_mirror.o
$(MODULE)-objs += vdev_missing.o
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 4b5baf6a6..71688b420 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -635,6 +635,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
KM_SLEEP);
mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
@@ -681,6 +683,8 @@ metaslab_group_destroy(metaslab_group_t *mg)
kmem_free(mg->mg_secondaries, mg->mg_allocators *
sizeof (metaslab_t *));
mutex_destroy(&mg->mg_lock);
+ mutex_destroy(&mg->mg_ms_initialize_lock);
+ cv_destroy(&mg->mg_ms_initialize_cv);
for (int i = 0; i < mg->mg_allocators; i++) {
zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
@@ -1502,6 +1506,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
+
ms->ms_id = id;
ms->ms_start = id << vd->vdev_ms_shift;
ms->ms_size = 1ULL << vd->vdev_ms_shift;
@@ -2686,6 +2691,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
* from it in 'metaslab_unload_delay' txgs, then unload it.
*/
if (msp->ms_loaded &&
+ msp->ms_initializing == 0 &&
msp->ms_selected_txg + metaslab_unload_delay < txg) {
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
@@ -2967,6 +2973,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
metaslab_class_t *mc = msp->ms_group->mg_class;
VERIFY(!msp->ms_condensing);
+ VERIFY0(msp->ms_initializing);
start = mc->mc_ops->msop_alloc(msp, size);
if (start != -1ULL) {
@@ -3027,9 +3034,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
}
/*
- * If the selected metaslab is condensing, skip it.
+ * If the selected metaslab is condensing or being
+ * initialized, skip it.
*/
- if (msp->ms_condensing)
+ if (msp->ms_condensing || msp->ms_initializing > 0)
continue;
*was_active = msp->ms_allocator != -1;
@@ -3190,7 +3198,9 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
/*
* If this metaslab is currently condensing then pick again as
* we can't manipulate this metaslab until it's committed
- * to disk.
+ * to disk. If this metaslab is being initialized, we shouldn't
+ * allocate from it since the allocated region might be
+ * overwritten after allocation.
*/
if (msp->ms_condensing) {
metaslab_trace_add(zal, mg, msp, asize, d,
@@ -3199,6 +3209,13 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
~METASLAB_ACTIVE_MASK);
mutex_exit(&msp->ms_lock);
continue;
+ } else if (msp->ms_initializing > 0) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_INITIALIZING, allocator);
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ mutex_exit(&msp->ms_lock);
+ continue;
}
offset = metaslab_block_alloc(msp, asize, txg);
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index f0683b0b8..622be75f9 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -56,6 +56,7 @@
#include <sys/vdev_removal.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_initialize.h>
#include <sys/vdev_disk.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@@ -434,8 +435,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
dp = spa_get_dsl(spa);
dsl_pool_config_enter(dp, FTAG);
- if ((err = dsl_dataset_hold_obj(dp,
- za.za_first_integer, FTAG, &ds))) {
+ err = dsl_dataset_hold_obj(dp,
+ za.za_first_integer, FTAG, &ds);
+ if (err != 0) {
dsl_pool_config_exit(dp, FTAG);
break;
}
@@ -601,7 +603,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
}
error = dmu_objset_hold(strval, FTAG, &os);
- if (error)
+ if (error != 0)
break;
/*
@@ -1218,8 +1220,10 @@ spa_activate(spa_t *spa, int mode)
spa_create_zio_taskqs(spa);
}
- for (size_t i = 0; i < TXG_SIZE; i++)
- spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0);
+ for (size_t i = 0; i < TXG_SIZE; i++) {
+ spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ }
list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_config_dirty_node));
@@ -1437,6 +1441,11 @@ spa_unload(spa_t *spa)
*/
spa_async_suspend(spa);
+ if (spa->spa_root_vdev) {
+ vdev_initialize_stop_all(spa->spa_root_vdev,
+ VDEV_INITIALIZE_ACTIVE);
+ }
+
/*
* Stop syncing.
*/
@@ -1452,10 +1461,10 @@ spa_unload(spa_t *spa)
* calling taskq_wait(mg_taskq).
*/
if (spa->spa_root_vdev != NULL) {
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
- spa_config_exit(spa, SCL_ALL, FTAG);
+ spa_config_exit(spa, SCL_ALL, spa);
}
if (spa->spa_mmp.mmp_thread)
@@ -1492,7 +1501,7 @@ spa_unload(spa_t *spa)
bpobj_close(&spa->spa_deferred_bpobj);
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
/*
* Close all vdevs.
@@ -1554,7 +1563,7 @@ spa_unload(spa_t *spa)
spa->spa_comment = NULL;
}
- spa_config_exit(spa, SCL_ALL, FTAG);
+ spa_config_exit(spa, SCL_ALL, spa);
}
/*
@@ -4246,6 +4255,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
*/
dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
}
spa_load_note(spa, "LOADED");
@@ -5654,6 +5666,18 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
}
/*
+ * We're about to export or destroy this pool. Make sure
+ * we stop all initializtion activity here before we
+ * set the spa_final_txg. This will ensure that all
+ * dirty data resulting from the initialization is
+ * committed to disk before we unload the pool.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ vdev_initialize_stop_all(spa->spa_root_vdev,
+ VDEV_INITIALIZE_ACTIVE);
+ }
+
+ /*
* We want this to be reflected on every label,
* so mark them all dirty. spa_unload() will do the
* final sync that pushes these changes out.
@@ -6357,6 +6381,86 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
return (error);
}
+int
+spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
+{
+ /*
+ * We hold the namespace lock through the whole function
+ * to prevent any changes to the pool while we're starting or
+ * stopping initialization. The config and state locks are held so that
+ * we can properly assess the vdev state before we commit to
+ * the initializing operation.
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+ /* Look up vdev and ensure it's a leaf. */
+ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_detached) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENODEV));
+ } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EINVAL));
+ } else if (!vdev_writeable(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EROFS));
+ }
+ mutex_enter(&vd->vdev_initialize_lock);
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+ /*
+ * When we activate an initialize action we check to see
+ * if the vdev_initialize_thread is NULL. We do this instead
+ * of using the vdev_initialize_state since there might be
+ * a previous initialization process which has completed but
+ * the thread is not exited.
+ */
+ if (cmd_type == POOL_INITIALIZE_DO &&
+ (vd->vdev_initialize_thread != NULL ||
+ vd->vdev_top->vdev_removing)) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EBUSY));
+ } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
+ (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
+ vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ESRCH));
+ } else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
+ vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ESRCH));
+ }
+
+ switch (cmd_type) {
+ case POOL_INITIALIZE_DO:
+ vdev_initialize(vd);
+ break;
+ case POOL_INITIALIZE_CANCEL:
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+ break;
+ case POOL_INITIALIZE_SUSPEND:
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED);
+ break;
+ default:
+ panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ /* Sync out the initializing state */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+
/*
* Split a set of devices from their mirrors, and create a new pool from them.
*/
@@ -6565,6 +6669,19 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
spa_activate(newspa, spa_mode_global);
spa_async_suspend(newspa);
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL) {
+ /*
+ * Temporarily stop the initializing activity. We set
+ * the state to ACTIVE so that we know to resume
+ * the initializing once the split has completed.
+ */
+ mutex_enter(&vml[c]->vdev_initialize_lock);
+ vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE);
+ mutex_exit(&vml[c]->vdev_initialize_lock);
+ }
+ }
+
newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
/* create the new pool from the disks of the original pool */
@@ -6652,6 +6769,10 @@ out:
if (vml[c] != NULL)
vml[c]->vdev_offline = B_FALSE;
}
+
+ /* restart initializing disks as necessary */
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+
vdev_reopen(spa->spa_root_vdev);
nvlist_free(spa->spa_config_splitting);
@@ -7025,6 +7146,14 @@ spa_async_thread(void *arg)
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
dsl_resilver_restart(dp, 0);
+ if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
/*
* Let the world know that we're done.
*/
@@ -7677,8 +7806,9 @@ spa_sync(spa_t *spa, uint64_t txg)
* Wait for i/os issued in open context that need to complete
* before this txg syncs.
*/
- VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK]));
- spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0);
+ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
+ spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
/*
* Lock out configuration changes.
@@ -7983,7 +8113,8 @@ spa_sync(spa_t *spa, uint64_t txg)
/*
* Update usable space statistics.
*/
- while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))))
+ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+ != NULL)
vdev_sync_done(vd, txg);
spa_update_dspace(spa);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index a3ac70f07..dfac92d45 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -38,6 +38,7 @@
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/metaslab.h>
@@ -1194,6 +1195,12 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
if (vd != NULL) {
ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
+ if (vd->vdev_ops->vdev_op_leaf) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+ mutex_exit(&vd->vdev_initialize_lock);
+ }
+
spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
vdev_free(vd);
spa_config_exit(spa, SCL_ALL, spa);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 8273e7907..f808f8ee7 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -50,6 +50,7 @@
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>
@@ -212,6 +213,14 @@ vdev_getops(const char *type)
return (ops);
}
+/* ARGSUSED */
+void
+vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
+{
+ res->rs_start = in->rs_start;
+ res->rs_end = in->rs_end;
+}
+
/*
* Derive the enumerated alloction bias from string input.
* String origin is either the per-vdev zap or zpool(1M).
@@ -526,6 +535,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
@@ -850,6 +863,7 @@ void
vdev_free(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
/*
* Scan queues are normally destroyed at the end of a scan. If the
@@ -880,6 +894,7 @@ vdev_free(vdev_t *vd)
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+ ASSERT(vd->vdev_initialize_thread == NULL);
/*
* Discard allocation state.
@@ -957,6 +972,10 @@ vdev_free(vdev_t *vd)
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
mutex_destroy(&vd->vdev_scan_io_queue_lock);
+ mutex_destroy(&vd->vdev_initialize_lock);
+ mutex_destroy(&vd->vdev_initialize_io_lock);
+ cv_destroy(&vd->vdev_initialize_io_cv);
+ cv_destroy(&vd->vdev_initialize_cv);
zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -3207,7 +3226,8 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
ASSERT(vdev_is_concrete(vd));
- while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))))
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+ != NULL)
metaslab_sync_done(msp, txg);
if (reassess)
@@ -3458,6 +3478,15 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
+ /* Restart initializing if necessary */
+ mutex_enter(&vd->vdev_initialize_lock);
+ if (vdev_writeable(vd) &&
+ vd->vdev_initialize_thread == NULL &&
+ vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
+ (void) vdev_initialize(vd);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
if (wasoffline ||
(oldstate < VDEV_STATE_DEGRADED &&
vd->vdev_state >= VDEV_STATE_DEGRADED))
@@ -3848,9 +3877,22 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
vs->vs_rsize = vdev_get_min_asize(vd);
- if (vd->vdev_ops->vdev_op_leaf)
+ if (vd->vdev_ops->vdev_op_leaf) {
vs->vs_rsize += VDEV_LABEL_START_SIZE +
VDEV_LABEL_END_SIZE;
+ /*
+ * Report intializing progress. Since we don't
+ * have the initializing locks held, this is only
+ * an estimate (although a fairly accurate one).
+ */
+ vs->vs_initialize_bytes_done =
+ vd->vdev_initialize_bytes_done;
+ vs->vs_initialize_bytes_est =
+ vd->vdev_initialize_bytes_est;
+ vs->vs_initialize_state = vd->vdev_initialize_state;
+ vs->vs_initialize_action_time =
+ vd->vdev_initialize_action_time;
+ }
/*
* Report expandable space on top-level, non-auxillary devices
* only. The expandable space is reported in terms of metaslab
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 9c44ba12a..d13f365dd 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -890,6 +890,7 @@ vdev_ops_t vdev_disk_ops = {
vdev_disk_hold,
vdev_disk_rele,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c
index bd7e0bc2e..3551898e0 100644
--- a/module/zfs/vdev_file.c
+++ b/module/zfs/vdev_file.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -254,6 +254,7 @@ vdev_ops_t vdev_file_ops = {
vdev_file_hold,
vdev_file_rele,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_FILE, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
@@ -289,6 +290,7 @@ vdev_ops_t vdev_disk_ops = {
vdev_file_hold,
vdev_file_rele,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index 724457df4..070d1b8d9 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -1857,6 +1857,7 @@ vdev_ops_t vdev_indirect_ops = {
NULL,
NULL,
vdev_indirect_remap,
+ NULL,
VDEV_TYPE_INDIRECT, /* name of this vdev type */
B_FALSE /* leaf vdev */
};
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
new file mode 100644
index 000000000..fcd2c76f9
--- /dev/null
+++ b/module/zfs/vdev_initialize.c
@@ -0,0 +1,819 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/txg.h>
+#include <sys/vdev_impl.h>
+#include <sys/refcount.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+
+/*
+ * Maximum number of metaslabs per group that can be initialized
+ * simultaneously.
+ */
+int max_initialize_ms = 3;
+
+/*
+ * Value that is written to disk during initialization.
+ */
+#ifdef _ILP32
+unsigned long zfs_initialize_value = 0xdeadbeefUL;
+#else
+unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL;
+#endif
+
+/* maximum number of I/Os outstanding per leaf vdev */
+int zfs_initialize_limit = 1;
+
+/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
+uint64_t zfs_initialize_chunk_size = 1024 * 1024;
+
+static boolean_t
+vdev_initialize_should_stop(vdev_t *vd)
+{
+ return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
+ vd->vdev_detached || vd->vdev_top->vdev_removing);
+}
+
+static void
+vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
+{
+ /*
+ * We pass in the guid instead of the vdev_t since the vdev may
+ * have been freed prior to the sync task being processed. This
+ * happens when a vdev is detached as we call spa_config_vdev_exit(),
+ * stop the intializing thread, schedule the sync task, and free
+ * the vdev. Later when the scheduled sync task is invoked, it would
+ * find that the vdev has been freed.
+ */
+ uint64_t guid = *(uint64_t *)arg;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ kmem_free(arg, sizeof (uint64_t));
+
+ vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+ return;
+
+ uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
+ vd->vdev_initialize_offset[txg & TXG_MASK] = 0;
+
+ VERIFY(vd->vdev_leaf_zap != 0);
+
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+
+ if (last_offset > 0) {
+ vd->vdev_initialize_last_offset = last_offset;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+ sizeof (last_offset), 1, &last_offset, tx));
+ }
+ if (vd->vdev_initialize_action_time > 0) {
+ uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
+ 1, &val, tx));
+ }
+
+ uint64_t initialize_state = vd->vdev_initialize_state;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
+ &initialize_state, tx));
+}
+
+static void
+vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ spa_t *spa = vd->vdev_spa;
+
+ if (new_state == vd->vdev_initialize_state)
+ return;
+
+ /*
+ * Copy the vd's guid, this will be freed by the sync task.
+ */
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /*
+ * If we're suspending, then preserving the original start time.
+ */
+ if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
+ vd->vdev_initialize_action_time = gethrestime_sec();
+ }
+ vd->vdev_initialize_state = new_state;
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
+ guid, 2, ZFS_SPACE_CHECK_RESERVED, tx);
+
+ switch (new_state) {
+ case VDEV_INITIALIZE_ACTIVE:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s activated", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_SUSPENDED:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s suspended", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_CANCELED:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s canceled", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_COMPLETE:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s complete", vd->vdev_path);
+ break;
+ default:
+ panic("invalid state %llu", (unsigned long long)new_state);
+ }
+
+ dmu_tx_commit(tx);
+}
+
+static void
+vdev_initialize_cb(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
+ /*
+ * The I/O failed because the vdev was unavailable; roll the
+ * last offset back. (This works because spa_sync waits on
+ * spa_txg_zio before it runs sync tasks.)
+ */
+ uint64_t *off =
+ &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
+ *off = MIN(*off, zio->io_offset);
+ } else {
+ /*
+ * Since initializing is best-effort, we ignore I/O errors and
+ * rely on vdev_probe to determine if the errors are more
+ * critical.
+ */
+ if (zio->io_error != 0)
+ vd->vdev_stat.vs_initialize_errors++;
+
+ vd->vdev_initialize_bytes_done += zio->io_orig_size;
+ }
+ ASSERT3U(vd->vdev_initialize_inflight, >, 0);
+ vd->vdev_initialize_inflight--;
+ cv_broadcast(&vd->vdev_initialize_io_cv);
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/* Takes care of physical writing and limiting # of concurrent ZIOs. */
+static int
+vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ /* Limit inflight initializing I/Os */
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
+ cv_wait(&vd->vdev_initialize_io_cv,
+ &vd->vdev_initialize_io_lock);
+ }
+ vd->vdev_initialize_inflight++;
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
+ mutex_enter(&vd->vdev_initialize_lock);
+
+ if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /* This is the first write of this txg. */
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_initialize_zap_update_sync, guid, 2,
+ ZFS_SPACE_CHECK_RESERVED, tx);
+ }
+
+ /*
+ * We know the vdev struct will still be around since all
+ * consumers of vdev_free must stop the initialization first.
+ */
+ if (vdev_initialize_should_stop(vd)) {
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ ASSERT3U(vd->vdev_initialize_inflight, >, 0);
+ vd->vdev_initialize_inflight--;
+ mutex_exit(&vd->vdev_initialize_io_lock);
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+ mutex_exit(&vd->vdev_initialize_lock);
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINTR));
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
+ zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
+ size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
+ ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
+ /* vdev_initialize_cb releases SCL_STATE_ALL */
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Translate a logical range to the physical range for the specified vdev_t.
+ * This function is initially called with a leaf vdev and will walk each
+ * parent vdev until it reaches a top-level vdev. Once the top-level is
+ * reached the physical range is initialized and the recursive function
+ * begins to unwind. As it unwinds it calls the parent's vdev specific
+ * translation function to do the real conversion.
+ */
+void
+vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
+{
+ /*
+ * Walk up the vdev tree
+ */
+ if (vd != vd->vdev_top) {
+ vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+ } else {
+ /*
+ * We've reached the top-level vdev, initialize the
+ * physical range to the logical range and start to
+ * unwind.
+ */
+ physical_rs->rs_start = logical_rs->rs_start;
+ physical_rs->rs_end = logical_rs->rs_end;
+ return;
+ }
+
+ vdev_t *pvd = vd->vdev_parent;
+ ASSERT3P(pvd, !=, NULL);
+ ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
+
+ /*
+ * As this recursive function unwinds, translate the logical
+ * range into its physical components by calling the
+ * vdev specific translate function.
+ */
+ range_seg_t intermediate = { { { 0, 0 } } };
+ pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+
+ physical_rs->rs_start = intermediate.rs_start;
+ physical_rs->rs_end = intermediate.rs_end;
+}
+
+/*
+ * Callback to fill each ABD chunk with zfs_initialize_value. len must be
+ * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
+ * allocation will guarantee these for us.
+ */
+/* ARGSUSED */
+static int
+vdev_initialize_block_fill(void *buf, size_t len, void *unused)
+{
+ ASSERT0(len % sizeof (uint64_t));
+#ifdef _ILP32
+ for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) {
+ *(uint32_t *)((char *)(buf) + i) = zfs_initialize_value;
+ }
+#else
+ for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
+ *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
+ }
+#endif
+ return (0);
+}
+
+static abd_t *
+vdev_initialize_block_alloc(void)
+{
+ /* Allocate ABD for filler data */
+ abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
+
+ ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
+ (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
+ vdev_initialize_block_fill, NULL);
+
+ return (data);
+}
+
+static void
+vdev_initialize_block_free(abd_t *data)
+{
+ abd_free(data);
+}
+
+static int
+vdev_initialize_ranges(vdev_t *vd, abd_t *data)
+{
+ avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root;
+
+ for (range_seg_t *rs = avl_first(rt); rs != NULL;
+ rs = AVL_NEXT(rt, rs)) {
+ uint64_t size = rs->rs_end - rs->rs_start;
+
+ /* Split range into legally-sized physical chunks */
+ uint64_t writes_required =
+ ((size - 1) / zfs_initialize_chunk_size) + 1;
+
+ for (uint64_t w = 0; w < writes_required; w++) {
+ int error;
+
+ error = vdev_initialize_write(vd,
+ VDEV_LABEL_START_SIZE + rs->rs_start +
+ (w * zfs_initialize_chunk_size),
+ MIN(size - (w * zfs_initialize_chunk_size),
+ zfs_initialize_chunk_size), data);
+ if (error != 0)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+static void
+vdev_initialize_ms_load(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ metaslab_load_wait(msp);
+ if (!msp->ms_loaded)
+ VERIFY0(metaslab_load(msp));
+}
+
+static void
+vdev_initialize_mg_wait(metaslab_group_t *mg)
+{
+ ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
+ while (mg->mg_initialize_updating) {
+ cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
+ }
+}
+
+static void
+vdev_initialize_mg_mark(metaslab_group_t *mg)
+{
+ ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
+ ASSERT(mg->mg_initialize_updating);
+
+ while (mg->mg_ms_initializing >= max_initialize_ms) {
+ cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
+ }
+ mg->mg_ms_initializing++;
+ ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms);
+}
+
+/*
+ * Mark the metaslab as being initialized to prevent any allocations
+ * on this metaslab. We must also track how many metaslabs are currently
+ * being initialized within a metaslab group and limit them to prevent
+ * allocation failures from occurring because all metaslabs are being
+ * initialized.
+ */
+static void
+vdev_initialize_ms_mark(metaslab_t *msp)
+{
+ ASSERT(!MUTEX_HELD(&msp->ms_lock));
+ metaslab_group_t *mg = msp->ms_group;
+
+ mutex_enter(&mg->mg_ms_initialize_lock);
+
+ /*
+ * To keep an accurate count of how many threads are initializing
+ * a specific metaslab group, we only allow one thread to mark
+ * the metaslab group at a time. This ensures that the value of
+ * ms_initializing will be accurate when we decide to mark a metaslab
+ * group as being initialized. To do this we force all other threads
+ * to wait till the metaslab's mg_initialize_updating flag is no
+ * longer set.
+ */
+ vdev_initialize_mg_wait(mg);
+ mg->mg_initialize_updating = B_TRUE;
+ if (msp->ms_initializing == 0) {
+ vdev_initialize_mg_mark(mg);
+ }
+ mutex_enter(&msp->ms_lock);
+ msp->ms_initializing++;
+ mutex_exit(&msp->ms_lock);
+
+ mg->mg_initialize_updating = B_FALSE;
+ cv_broadcast(&mg->mg_ms_initialize_cv);
+ mutex_exit(&mg->mg_ms_initialize_lock);
+}
+
+static void
+vdev_initialize_ms_unmark(metaslab_t *msp)
+{
+ ASSERT(!MUTEX_HELD(&msp->ms_lock));
+ metaslab_group_t *mg = msp->ms_group;
+ mutex_enter(&mg->mg_ms_initialize_lock);
+ mutex_enter(&msp->ms_lock);
+ if (--msp->ms_initializing == 0) {
+ mg->mg_ms_initializing--;
+ cv_broadcast(&mg->mg_ms_initialize_cv);
+ }
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&mg->mg_ms_initialize_lock);
+}
+
+static void
+vdev_initialize_calculate_progress(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+ spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+ ASSERT(vd->vdev_leaf_zap != 0);
+
+ vd->vdev_initialize_bytes_est = 0;
+ vd->vdev_initialize_bytes_done = 0;
+
+ for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+ mutex_enter(&msp->ms_lock);
+
+ uint64_t ms_free = msp->ms_size -
+ space_map_allocated(msp->ms_sm);
+
+ if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
+ ms_free /= vd->vdev_top->vdev_children;
+
+ /*
+ * Convert the metaslab range to a physical range
+ * on our vdev. We use this to determine if we are
+ * in the middle of this metaslab range.
+ */
+ range_seg_t logical_rs, physical_rs;
+ logical_rs.rs_start = msp->ms_start;
+ logical_rs.rs_end = msp->ms_start + msp->ms_size;
+ vdev_xlate(vd, &logical_rs, &physical_rs);
+
+ if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
+ vd->vdev_initialize_bytes_est += ms_free;
+ mutex_exit(&msp->ms_lock);
+ continue;
+ } else if (vd->vdev_initialize_last_offset >
+ physical_rs.rs_end) {
+ vd->vdev_initialize_bytes_done += ms_free;
+ vd->vdev_initialize_bytes_est += ms_free;
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /*
+ * If we get here, we're in the middle of initializing this
+ * metaslab. Load it and walk the free tree for more accurate
+ * progress estimation.
+ */
+ vdev_initialize_ms_load(msp);
+
+ for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
+ rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
+ logical_rs.rs_start = rs->rs_start;
+ logical_rs.rs_end = rs->rs_end;
+ vdev_xlate(vd, &logical_rs, &physical_rs);
+
+ uint64_t size = physical_rs.rs_end -
+ physical_rs.rs_start;
+ vd->vdev_initialize_bytes_est += size;
+ if (vd->vdev_initialize_last_offset >
+ physical_rs.rs_end) {
+ vd->vdev_initialize_bytes_done += size;
+ } else if (vd->vdev_initialize_last_offset >
+ physical_rs.rs_start &&
+ vd->vdev_initialize_last_offset <
+ physical_rs.rs_end) {
+ vd->vdev_initialize_bytes_done +=
+ vd->vdev_initialize_last_offset -
+ physical_rs.rs_start;
+ }
+ }
+ mutex_exit(&msp->ms_lock);
+ }
+}
+
+static int
+vdev_initialize_load(vdev_t *vd)
+{
+ int err = 0;
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+ spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+ ASSERT(vd->vdev_leaf_zap != 0);
+
+ if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
+ vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+ sizeof (vd->vdev_initialize_last_offset), 1,
+ &vd->vdev_initialize_last_offset);
+ if (err == ENOENT) {
+ vd->vdev_initialize_last_offset = 0;
+ err = 0;
+ }
+ }
+
+ vdev_initialize_calculate_progress(vd);
+ return (err);
+}
+
+
+/*
+ * Convert the logical range into a physcial range and add it to our
+ * avl tree.
+ */
+void
+vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
+{
+ vdev_t *vd = arg;
+ range_seg_t logical_rs, physical_rs;
+ logical_rs.rs_start = start;
+ logical_rs.rs_end = start + size;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ vdev_xlate(vd, &logical_rs, &physical_rs);
+
+ IMPLY(vd->vdev_top == vd,
+ logical_rs.rs_start == physical_rs.rs_start);
+ IMPLY(vd->vdev_top == vd,
+ logical_rs.rs_end == physical_rs.rs_end);
+
+ /* Only add segments that we have not visited yet */
+ if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
+ return;
+
+ /* Pick up where we left off mid-range. */
+ if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
+ zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
+ "(%llu, %llu)", vd->vdev_path,
+ (u_longlong_t)physical_rs.rs_start,
+ (u_longlong_t)physical_rs.rs_end,
+ (u_longlong_t)vd->vdev_initialize_last_offset,
+ (u_longlong_t)physical_rs.rs_end);
+ ASSERT3U(physical_rs.rs_end, >,
+ vd->vdev_initialize_last_offset);
+ physical_rs.rs_start = vd->vdev_initialize_last_offset;
+ }
+ ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
+
+ /*
+ * With raidz, it's possible that the logical range does not live on
+ * this leaf vdev. We only add the physical range to this vdev's if it
+ * has a length greater than 0.
+ */
+ if (physical_rs.rs_end > physical_rs.rs_start) {
+ range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
+ physical_rs.rs_end - physical_rs.rs_start);
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
+ }
+}
+
+static void
+vdev_initialize_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+ uint64_t ms_count = 0;
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ vd->vdev_initialize_last_offset = 0;
+ VERIFY0(vdev_initialize_load(vd));
+
+ abd_t *deadbeef = vdev_initialize_block_alloc();
+
+ vd->vdev_initialize_tree = range_tree_create(NULL, NULL);
+
+ for (uint64_t i = 0; !vd->vdev_detached &&
+ i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+
+ /*
+ * If we've expanded the top-level vdev or it's our
+ * first pass, calculate our progress.
+ */
+ if (vd->vdev_top->vdev_ms_count != ms_count) {
+ vdev_initialize_calculate_progress(vd);
+ ms_count = vd->vdev_top->vdev_ms_count;
+ }
+
+ vdev_initialize_ms_mark(msp);
+ mutex_enter(&msp->ms_lock);
+ vdev_initialize_ms_load(msp);
+
+ range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
+ vd);
+ mutex_exit(&msp->ms_lock);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ error = vdev_initialize_ranges(vd, deadbeef);
+ vdev_initialize_ms_unmark(msp);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
+ if (error != 0)
+ break;
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ while (vd->vdev_initialize_inflight > 0) {
+ cv_wait(&vd->vdev_initialize_io_cv,
+ &vd->vdev_initialize_io_lock);
+ }
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ range_tree_destroy(vd->vdev_initialize_tree);
+ vdev_initialize_block_free(deadbeef);
+ vd->vdev_initialize_tree = NULL;
+
+ mutex_enter(&vd->vdev_initialize_lock);
+ if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
+ vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
+ }
+ ASSERT(vd->vdev_initialize_thread != NULL ||
+ vd->vdev_initialize_inflight == 0);
+
+ /*
+ * Drop the vdev_initialize_lock while we sync out the
+ * txg since it's possible that a device might be trying to
+ * come online and must check to see if it needs to restart an
+ * initialization. That thread will be holding the spa_config_lock
+ * which would prevent the txg_wait_synced from completing.
+ */
+ mutex_exit(&vd->vdev_initialize_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&vd->vdev_initialize_lock);
+
+ vd->vdev_initialize_thread = NULL;
+ cv_broadcast(&vd->vdev_initialize_cv);
+ mutex_exit(&vd->vdev_initialize_lock);
+}
+
+/*
+ * Initiates a device. Caller must hold vdev_initialize_lock.
+ * Device must be a leaf and not already be initializing.
+ */
+void
+vdev_initialize(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_initialize_exit_wanted);
+ ASSERT(!vd->vdev_top->vdev_removing);
+
+ vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
+ vd->vdev_initialize_thread = thread_create(NULL, 0,
+ vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+}
+
+/*
+ * Stop initializng a device, with the resultant initialing state being
+ * tgt_state. Blocks until the initializing thread has exited.
+ * Caller must hold vdev_initialize_lock and must not be writing to the spa
+ * config, as the initializing thread may try to enter the config as a reader
+ * before exiting.
+ */
+void
+vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ ASSERTV(spa_t *spa = vd->vdev_spa);
+ ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER));
+
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+
+ /*
+ * Allow cancel requests to proceed even if the initialize thread
+ * has stopped.
+ */
+ if (vd->vdev_initialize_thread == NULL &&
+ tgt_state != VDEV_INITIALIZE_CANCELED) {
+ return;
+ }
+
+ vdev_initialize_change_state(vd, tgt_state);
+ vd->vdev_initialize_exit_wanted = B_TRUE;
+ while (vd->vdev_initialize_thread != NULL)
+ cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
+
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ vd->vdev_initialize_exit_wanted = B_FALSE;
+}
+
+static void
+vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop(vd, tgt_state);
+ mutex_exit(&vd->vdev_initialize_lock);
+ return;
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state);
+ }
+}
+
+/*
+ * Convenience function to stop initializing of a vdev tree and set all
+ * initialize thread pointers to NULL.
+ */
+void
+vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ vdev_initialize_stop_all_impl(vd, tgt_state);
+
+ if (vd->vdev_spa->spa_sync_on) {
+ /* Make sure that our state has been synced to disk */
+ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+ }
+}
+
+void
+vdev_initialize_restart(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_leaf_zap != 0) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ uint64_t initialize_state = VDEV_INITIALIZE_NONE;
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
+ sizeof (initialize_state), 1, &initialize_state);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_initialize_state = initialize_state;
+
+ uint64_t timestamp = 0;
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
+ sizeof (timestamp), 1, &timestamp);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_initialize_action_time = (time_t)timestamp;
+
+ if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
+ vd->vdev_offline) {
+ /* load progress for reporting, but don't resume */
+ VERIFY0(vdev_initialize_load(vd));
+ } else if (vd->vdev_initialize_state ==
+ VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) {
+ vdev_initialize(vd);
+ }
+
+ mutex_exit(&vd->vdev_initialize_lock);
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_initialize_restart(vd->vdev_child[i]);
+ }
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(vdev_initialize_restart);
+EXPORT_SYMBOL(vdev_xlate);
+EXPORT_SYMBOL(vdev_initialize_stop_all);
+EXPORT_SYMBOL(vdev_initialize);
+EXPORT_SYMBOL(vdev_initialize_stop);
+
+/* CSTYLED */
+module_param(zfs_initialize_value, ulong, 0644);
+MODULE_PARM_DESC(zfs_initialize_value,
+ "Value written during zpool initialize");
+#endif
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 65357d841..b45c05db2 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -740,6 +740,7 @@ vdev_ops_t vdev_mirror_ops = {
NULL,
NULL,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_MIRROR, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
@@ -755,6 +756,7 @@ vdev_ops_t vdev_replacing_ops = {
NULL,
NULL,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_REPLACING, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
@@ -770,6 +772,7 @@ vdev_ops_t vdev_spare_ops = {
NULL,
NULL,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_SPARE, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c
index b1c039f16..d85993bff 100644
--- a/module/zfs/vdev_missing.c
+++ b/module/zfs/vdev_missing.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
/*
@@ -90,6 +90,7 @@ vdev_ops_t vdev_missing_ops = {
NULL,
NULL,
NULL,
+ NULL,
VDEV_TYPE_MISSING, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
@@ -105,6 +106,7 @@ vdev_ops_t vdev_hole_ops = {
NULL,
NULL,
NULL,
+ NULL,
VDEV_TYPE_HOLE, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 89cdf7d81..939699cb8 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -154,6 +154,8 @@ uint32_t zfs_vdev_scrub_min_active = 1;
uint32_t zfs_vdev_scrub_max_active = 2;
uint32_t zfs_vdev_removal_min_active = 1;
uint32_t zfs_vdev_removal_max_active = 2;
+uint32_t zfs_vdev_initializing_min_active = 1;
+uint32_t zfs_vdev_initializing_max_active = 1;
/*
* When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -261,6 +263,8 @@ vdev_queue_class_min_active(zio_priority_t p)
return (zfs_vdev_scrub_min_active);
case ZIO_PRIORITY_REMOVAL:
return (zfs_vdev_removal_min_active);
+ case ZIO_PRIORITY_INITIALIZING:
+ return (zfs_vdev_initializing_min_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -331,6 +335,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
return (zfs_vdev_scrub_max_active);
case ZIO_PRIORITY_REMOVAL:
return (zfs_vdev_removal_max_active);
+ case ZIO_PRIORITY_INITIALIZING:
+ return (zfs_vdev_initializing_max_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -718,8 +724,8 @@ again:
}
/*
- * For LBA-ordered queues (async / scrub), issue the i/o which follows
- * the most recently issued i/o in LBA (offset) order.
+ * For LBA-ordered queues (async / scrub / initializing), issue the
+ * i/o which follows the most recently issued i/o in LBA (offset) order.
*
* For FIFO queues (sync), issue the i/o with the lowest timestamp.
*/
@@ -775,13 +781,15 @@ vdev_queue_io(zio_t *zio)
if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
zio->io_priority != ZIO_PRIORITY_SCRUB &&
- zio->io_priority != ZIO_PRIORITY_REMOVAL)
+ zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING)
zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
} else {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
- zio->io_priority != ZIO_PRIORITY_REMOVAL)
+ zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING)
zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
}
@@ -938,11 +946,29 @@ module_param(zfs_vdev_async_write_min_active, int, 0644);
MODULE_PARM_DESC(zfs_vdev_async_write_min_active,
"Min active async write I/Os per vdev");
+module_param(zfs_vdev_initializing_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_initializing_max_active,
+ "Max active initializing I/Os per vdev");
+
+module_param(zfs_vdev_initializing_min_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_initializing_min_active,
+ "Min active initializing I/Os per vdev");
+
+module_param(zfs_vdev_removal_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_removal_max_active,
+ "Max active removal I/Os per vdev");
+
+module_param(zfs_vdev_removal_min_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_removal_min_active,
+ "Min active removal I/Os per vdev");
+
module_param(zfs_vdev_scrub_max_active, int, 0644);
-MODULE_PARM_DESC(zfs_vdev_scrub_max_active, "Max active scrub I/Os per vdev");
+MODULE_PARM_DESC(zfs_vdev_scrub_max_active,
+ "Max active scrub I/Os per vdev");
module_param(zfs_vdev_scrub_min_active, int, 0644);
-MODULE_PARM_DESC(zfs_vdev_scrub_min_active, "Min active scrub I/Os per vdev");
+MODULE_PARM_DESC(zfs_vdev_scrub_min_active,
+ "Min active scrub I/Os per vdev");
module_param(zfs_vdev_sync_read_max_active, int, 0644);
MODULE_PARM_DESC(zfs_vdev_sync_read_max_active,
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index a21baf9c2..d10d89f3e 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -36,6 +36,10 @@
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
+#ifdef ZFS_DEBUG
+#include <sys/vdev_initialize.h> /* vdev_xlate testing */
+#endif
+
/*
* Virtual device vector for RAID-Z.
*
@@ -1627,6 +1631,39 @@ vdev_raidz_child_done(zio_t *zio)
rc->rc_skipped = 0;
}
+static void
+vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
+{
+#ifdef ZFS_DEBUG
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
+
+ range_seg_t logical_rs, physical_rs;
+ logical_rs.rs_start = zio->io_offset;
+ logical_rs.rs_end = logical_rs.rs_start +
+ vdev_raidz_asize(zio->io_vd, zio->io_size);
+
+ raidz_col_t *rc = &rm->rm_col[col];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ vdev_xlate(cvd, &logical_rs, &physical_rs);
+ ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
+ ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
+ /*
+ * It would be nice to assert that rs_end is equal
+ * to rc_offset + rc_size but there might be an
+ * optional I/O at the end that is not accounted in
+ * rc_size.
+ */
+ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
+ rc->rc_size + (1 << tvd->vdev_ashift));
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
+ }
+#endif
+}
+
/*
* Start an IO operation on a RAIDZ VDev
*
@@ -1665,6 +1702,12 @@ vdev_raidz_io_start(zio_t *zio)
for (c = 0; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
+
+ /*
+ * Verify physical to logical translation.
+ */
+ vdev_raidz_io_verify(zio, rm, c);
+
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
@@ -2323,6 +2366,37 @@ vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
return (B_FALSE);
}
+static void
+vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
+{
+ vdev_t *raidvd = cvd->vdev_parent;
+ ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
+
+ uint64_t width = raidvd->vdev_children;
+ uint64_t tgt_col = cvd->vdev_id;
+ uint64_t ashift = raidvd->vdev_top->vdev_ashift;
+
+ /* make sure the offsets are block-aligned */
+ ASSERT0(in->rs_start % (1 << ashift));
+ ASSERT0(in->rs_end % (1 << ashift));
+ uint64_t b_start = in->rs_start >> ashift;
+ uint64_t b_end = in->rs_end >> ashift;
+
+ uint64_t start_row = 0;
+ if (b_start > tgt_col) /* avoid underflow */
+ start_row = ((b_start - tgt_col - 1) / width) + 1;
+
+ uint64_t end_row = 0;
+ if (b_end > tgt_col)
+ end_row = ((b_end - tgt_col - 1) / width) + 1;
+
+ res->rs_start = start_row << ashift;
+ res->rs_end = end_row << ashift;
+
+ ASSERT3U(res->rs_start, <=, in->rs_start);
+ ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
+}
+
vdev_ops_t vdev_raidz_ops = {
vdev_raidz_open,
vdev_raidz_close,
@@ -2334,6 +2408,7 @@ vdev_ops_t vdev_raidz_ops = {
NULL,
NULL,
NULL,
+ vdev_raidz_xlate,
VDEV_TYPE_RAIDZ, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index a706bc2a4..d0824aa84 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -44,6 +44,7 @@
#include <sys/vdev_indirect_births.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
#include <sys/trace_vdev.h>
/*
@@ -1186,6 +1187,7 @@ vdev_remove_complete(spa_t *spa)
txg_wait_synced(spa->spa_dsl_pool, 0);
txg = spa_vdev_enter(spa);
vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
sysevent_t *ev = spa_event_create(spa, vd, NULL,
ESC_ZFS_VDEV_REMOVE_DEV);
@@ -1896,6 +1898,9 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
+ /* Stop initializing */
+ (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
+
*txg = spa_vdev_config_enter(spa);
sysevent_t *ev = spa_event_create(spa, vd, NULL,
@@ -2072,6 +2077,13 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
*/
error = spa_reset_logs(spa);
+ /*
+ * We stop any initializing that is currently in progress but leave
+ * the state as "active". This will allow the initializing to resume
+ * if the removal is canceled sometime later.
+ */
+ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
+
*txg = spa_vdev_config_enter(spa);
/*
@@ -2083,6 +2095,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
if (error != 0) {
metaslab_group_activate(mg);
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
return (error);
}
diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c
index 9f86cbfa4..e40b7ce8e 100644
--- a/module/zfs/vdev_root.c
+++ b/module/zfs/vdev_root.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -150,6 +150,7 @@ vdev_ops_t vdev_root_ops = {
NULL,
NULL,
NULL,
+ NULL,
VDEV_TYPE_ROOT, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index a71da2837..3c36502d8 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -202,6 +202,8 @@
#include <sys/zio_checksum.h>
#include <sys/vdev_removal.h>
#include <sys/zfs_sysfs.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
@@ -3843,6 +3845,85 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
}
/*
+ * innvl: {
+ * vdevs: {
+ * guid 1, guid 2, ...
+ * },
+ * func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND}
+ * }
+ *
+ * outnvl: {
+ * [func: EINVAL (if provided command type didn't make sense)],
+ * [vdevs: {
+ * guid1: errno, (see function body for possible errnos)
+ * ...
+ * }]
+ * }
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_pool_initialize[] = {
+ {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0},
+ {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0}
+};
+
+static int
+zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(poolname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ uint64_t cmd_type;
+ if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
+ &cmd_type) != 0) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
+ cmd_type == POOL_INITIALIZE_DO ||
+ cmd_type == POOL_INITIALIZE_SUSPEND)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ nvlist_t *vdev_guids;
+ if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS,
+ &vdev_guids) != 0) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ nvlist_t *vdev_errlist = fnvlist_alloc();
+ int total_errors = 0;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
+ uint64_t vdev_guid = fnvpair_value_uint64(pair);
+
+ error = spa_vdev_initialize(spa, vdev_guid, cmd_type);
+ if (error != 0) {
+ char guid_as_str[MAXNAMELEN];
+
+ (void) snprintf(guid_as_str, sizeof (guid_as_str),
+ "%llu", (unsigned long long)vdev_guid);
+ fnvlist_add_int64(vdev_errlist, guid_as_str, error);
+ total_errors++;
+ }
+ }
+ if (fnvlist_size(vdev_errlist) > 0) {
+ fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS,
+ vdev_errlist);
+ }
+ fnvlist_free(vdev_errlist);
+
+ spa_close(spa, FTAG);
+ return (total_errors > 0 ? EINVAL : 0);
+}
+
+/*
* fsname is name of dataset to rollback (to most recent snapshot)
*
* innvl may contain name of expected target snapshot
@@ -6453,6 +6534,11 @@ zfs_ioctl_init(void)
zfs_keys_pool_discard_checkpoint,
ARRAY_SIZE(zfs_keys_pool_discard_checkpoint));
+ zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE,
+ zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize));
+
/* IOCTLS that use the legacy function signature */
zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
diff --git a/module/zfs/zfs_sysfs.c b/module/zfs/zfs_sysfs.c
index b17c91f65..87c4ac117 100644
--- a/module/zfs/zfs_sysfs.c
+++ b/module/zfs/zfs_sysfs.c
@@ -358,6 +358,7 @@ pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
*/
static const char *zfs_features[] = {
/* --> Add new kernel features here (post ZoL 0.8.0) */
+ "vdev_initialize"
};
#define ZFS_FEATURE_COUNT ARRAY_SIZE(zfs_features)
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 4463bfa1e..38f040126 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -397,6 +397,21 @@ pre =
post =
tags = ['functional', 'cli_root', 'zpool_labelclear']
+[tests/functional/cli_root/zpool_initialize]
+tests = ['zpool_initialize_attach_detach_add_remove',
+ 'zpool_initialize_import_export',
+ 'zpool_initialize_offline_export_import_online',
+ 'zpool_initialize_online_offline',
+ 'zpool_initialize_split',
+ 'zpool_initialize_start_and_cancel_neg',
+ 'zpool_initialize_start_and_cancel_pos',
+ 'zpool_initialize_suspend_resume',
+ 'zpool_initialize_unsupported_vdevs',
+ 'zpool_initialize_verify_checksums',
+ 'zpool_initialize_verify_initialized']
+pre =
+tags = ['functional', 'cli_root', 'zpool_initialize']
+
[tests/functional/cli_root/zpool_offline]
tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg',
'zpool_offline_003_pos']
diff --git a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c
index 88a18d8f0..61ac2fecc 100644
--- a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c
+++ b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c
@@ -642,6 +642,22 @@ test_unload_key(const char *dataset)
IOC_INPUT_TEST(ZFS_IOC_UNLOAD_KEY, dataset, NULL, NULL, EACCES);
}
+static void
+test_vdev_initialize(const char *pool)
+{
+ nvlist_t *required = fnvlist_alloc();
+ nvlist_t *vdev_guids = fnvlist_alloc();
+
+ fnvlist_add_uint64(vdev_guids, "path", 0xdeadbeefdeadbeef);
+ fnvlist_add_uint64(required, ZPOOL_INITIALIZE_COMMAND,
+ POOL_INITIALIZE_DO);
+ fnvlist_add_nvlist(required, ZPOOL_INITIALIZE_VDEVS, vdev_guids);
+
+ IOC_INPUT_TEST(ZFS_IOC_POOL_INITIALIZE, pool, required, NULL, EINVAL);
+ nvlist_free(vdev_guids);
+ nvlist_free(required);
+}
+
static int
zfs_destroy(const char *dataset)
{
@@ -732,6 +748,8 @@ zfs_ioc_input_tests(const char *pool)
test_change_key(dataset);
test_unload_key(dataset);
+ test_vdev_initialize(pool);
+
/*
* cleanup
*/
@@ -869,6 +887,7 @@ validate_ioc_values(void)
ZFS_IOC_BASE + 76 == ZFS_IOC_REMAP &&
ZFS_IOC_BASE + 77 == ZFS_IOC_POOL_CHECKPOINT &&
ZFS_IOC_BASE + 78 == ZFS_IOC_POOL_DISCARD_CHECKPOINT &&
+ ZFS_IOC_BASE + 79 == ZFS_IOC_POOL_INITIALIZE &&
LINUX_IOC_BASE + 1 == ZFS_IOC_EVENTS_NEXT &&
LINUX_IOC_BASE + 2 == ZFS_IOC_EVENTS_CLEAR &&
LINUX_IOC_BASE + 3 == ZFS_IOC_EVENTS_SEEK);
diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg
index 8ced03e93..78ba2488d 100644
--- a/tests/zfs-tests/include/commands.cfg
+++ b/tests/zfs-tests/include/commands.cfg
@@ -82,6 +82,7 @@ export SYSTEM_FILES='arp
mv
net
nproc
+ od
openssl
parted
pax
diff --git a/tests/zfs-tests/tests/functional/cli_root/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/Makefile.am
index 13ff889d8..625cf8579 100644
--- a/tests/zfs-tests/tests/functional/cli_root/Makefile.am
+++ b/tests/zfs-tests/tests/functional/cli_root/Makefile.am
@@ -46,6 +46,7 @@ SUBDIRS = \
zpool_get \
zpool_history \
zpool_import \
+ zpool_initialize \
zpool_labelclear \
zpool_offline \
zpool_online \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh
index 0cd9b5959..79ceaabd0 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh
@@ -150,7 +150,7 @@ function do_testing #<clear type> <vdevs>
#
# Make errors to the testing pool by overwrite the vdev device with
- # /usr/bin/dd command. We do not want to have a full overwrite. That
+ # dd command. We do not want to have a full overwrite. That
# may cause the system panic. So, we should skip the vdev label space.
#
(( i = $RANDOM % 3 ))
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile.am
new file mode 100644
index 000000000..a0a0e0b5c
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile.am
@@ -0,0 +1,18 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_initialize
+dist_pkgdata_SCRIPTS = \
+ cleanup.ksh \
+ zpool_initialize_attach_detach_add_remove.ksh \
+ zpool_initialize_import_export.ksh \
+ zpool_initialize.kshlib \
+ zpool_initialize_offline_export_import_online.ksh \
+ zpool_initialize_online_offline.ksh \
+ zpool_initialize_split.ksh \
+ zpool_initialize_start_and_cancel_neg.ksh \
+ zpool_initialize_start_and_cancel_pos.ksh \
+ zpool_initialize_suspend_resume.ksh \
+ zpool_initialize_unsupported_vdevs.ksh \
+ zpool_initialize_verify_checksums.ksh \
+ zpool_initialize_verify_initialized.ksh
+
+dist_pkgdata_DATA = \
+ zpool_initialize.kshlib
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh
new file mode 100755
index 000000000..3c7dbd317
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh
@@ -0,0 +1,31 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
new file mode 100644
index 000000000..0f4e7f0fa
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
@@ -0,0 +1,43 @@
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+function initialize_prog_line # pool disk
+{
+ typeset pool="$1"
+ typeset disk="$2"
+ zpool status "$pool" | grep "$disk" | grep "initialized"
+}
+
+function initialize_progress # pool disk
+{
+ initialize_prog_line "$1" "$2" | \
+ sed 's/.*(\([0-9]\{1,\}\)% initialized.*/\1/g'
+}
+
+function cleanup
+{
+ if poolexists $TESTPOOL; then
+ log_must zpool destroy -f $TESTPOOL
+ fi
+
+ if poolexists $TESTPOOL1; then
+ log_must zpool destroy -f $TESTPOOL1
+ fi
+}
+log_onexit cleanup
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh
new file mode 100755
index 000000000..2a695025d
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh
@@ -0,0 +1,68 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Detaching/attaching, adding/removing data devices works with initializing.
+#
+# STRATEGY:
+# 1. Create a single-disk pool.
+# 2. Start initializing.
+# 3. Attach a second disk, ensure initializing continues.
+# 4. Detach the second disk, ensure initializing continues.
+# 5. Add a second disk, ensure initializing continues.
+# 6. Remove the first disk, ensure initializing stops.
+#
+
+DISK1="$(echo $DISKS | cut -d' ' -f1)"
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+
+log_must zpool create -f $TESTPOOL $DISK1
+
+log_must zpool initialize $TESTPOOL $DISK1
+progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ -z "$progress" ]] && log_fail "Initializing did not start"
+
+log_must zpool attach $TESTPOOL $DISK1 $DISK2
+new_progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ "$progress" -le "$new_progress" ]] || \
+ log_fail "Lost initializing progress on demotion to child vdev"
+progress="$new_progress"
+
+log_must zpool detach $TESTPOOL $DISK2
+new_progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ "$progress" -le "$new_progress" ]] || \
+ log_fail "Lost initializing progress on promotion to top vdev"
+progress="$new_progress"
+
+log_must zpool add $TESTPOOL $DISK2
+log_must zpool remove $TESTPOOL $DISK1
+[[ -z "$(initialize_prog_line $TESTPOOL $DISK1)" ]] || \
+ log_fail "Initializing continued after initiating removal"
+
+log_pass "Initializing worked as expected across attach/detach and add/remove"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh
new file mode 100755
index 000000000..386d2a5dc
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh
@@ -0,0 +1,78 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Initializing automatically resumes across import/export.
+#
+# STRATEGY:
+# 1. Create a one-disk pool.
+# 2. Start initializing and verify that initializing is active.
+# 3. Export the pool.
+# 4. Import the pool.
+# 5. Verify that initializing resumes and progress does not regress.
+# 6. Suspend initializing.
+# 7. Repeat steps 3-4.
+# 8. Verify that progress does not regress but initializing is still suspended.
+#
+
+DISK1=${DISKS%% *}
+
+log_must zpool create -f $TESTPOOL $DISK1
+log_must zpool initialize $TESTPOOL
+
+sleep 2
+
+progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ -z "$progress" ]] && log_fail "Initializing did not start"
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+
+new_progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ -z "$new_progress" ]] && log_fail "Initializing did not restart after import"
+[[ "$progress" -le "$new_progress" ]] || \
+ log_fail "Initializing lost progress after import"
+log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_must zpool initialize -s $TESTPOOL $DISK1
+action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \
+ sed 's/.*ed at \(.*\)).*/\1/g')"
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \
+ sed 's/.*ed at \(.*\)).*/\1/g')
+[[ "$action_date" != "$new_action_date" ]] && \
+ log_fail "Initializing action date did not persist across export/import"
+
+[[ "$new_progress" -le "$(initialize_progress $TESTPOOL $DISK1)" ]] || \
+ log_fail "Initializing lost progress after import"
+
+log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_pass "Initializing retains state as expected across export/import"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh
new file mode 100755
index 000000000..dedd466e4
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh
@@ -0,0 +1,66 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Miscellaneous complex sequences of operations function as expected.
+#
+# STRATEGY:
+# 1. Create a pool with a two-way mirror.
+# 2. Start initializing, offline, export, import, online and verify that
+# initializing state is preserved / initializing behaves as expected
+# at each step.
+#
+
+DISK1="$(echo $DISKS | cut -d' ' -f1)"
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+
+log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2
+
+log_must zpool initialize $TESTPOOL $DISK1
+log_must zpool offline $TESTPOOL $DISK1
+progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ -z "$progress" ]] && log_fail "Initializing did not start"
+log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+
+new_progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ -z "$new_progress" ]] && log_fail "Initializing did not start after import"
+[[ "$new_progress" -ge "$progress" ]] || \
+ log_fail "Initializing lost progress after import"
+log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_must zpool online $TESTPOOL $DISK1
+new_progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ "$new_progress" -ge "$progress" ]] || \
+ log_fail "Initializing lost progress after online"
+
+log_pass "Initializing behaves as expected at each step of:" \
+ "initialize + offline + export + import + online"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh
new file mode 100755
index 000000000..55bd3188c
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh
@@ -0,0 +1,74 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Initializing automatically resumes across offline/online.
+#
+# STRATEGY:
+# 1. Create a pool with a two-way mirror.
+# 2. Start initializing one of the disks and verify that initializing is active.
+# 3. Offline the disk.
+# 4. Online the disk.
+# 5. Verify that initializing resumes and progress does not regress.
+# 6. Suspend initializing.
+# 7. Repeat steps 3-4 and verify that initializing does not resume.
+#
+
+DISK1=${DISKS%% *}
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+
+log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2
+log_must zpool initialize $TESTPOOL $DISK1
+
+log_must zpool offline $TESTPOOL $DISK1
+
+progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ -z "$progress" ]] && log_fail "Initializing did not start"
+
+log_must zpool online $TESTPOOL $DISK1
+
+new_progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ -z "$new_progress" ]] && \
+ log_fail "Initializing did not restart after onlining"
+[[ "$progress" -le "$new_progress" ]] || \
+ log_fail "Initializing lost progress after onlining"
+log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_must zpool initialize -s $TESTPOOL $DISK1
+action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \
+ sed 's/.*ed at \(.*\)).*/\1/g')"
+log_must zpool offline $TESTPOOL $DISK1
+log_must zpool online $TESTPOOL $DISK1
+new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \
+ sed 's/.*ed at \(.*\)).*/\1/g')
+[[ "$action_date" != "$new_action_date" ]] && \
+ log_fail "Initializing action date did not persist across offline/online"
+log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_pass "Initializing performs as expected across offline/online"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh
new file mode 100755
index 000000000..69b27c26c
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh
@@ -0,0 +1,64 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Initializing state is preserved across zpool split.
+#
+# STRATEGY:
+# 1. Create a pool with a two-way mirror.
+# 2. Start initializing both devices.
+# 3. Split the pool. Ensure initializing continues on the original.
+# 4. Import the new pool. Ensure initializing resumes on it.
+#
+
+DISK1="$(echo $DISKS | cut -d' ' -f1)"
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+POOL2="${TESTPOOL}_split"
+
+log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2
+
+log_must zpool initialize $TESTPOOL $DISK1 $DISK2
+orig_prog1="$(initialize_progress $TESTPOOL $DISK1)"
+orig_prog2="$(initialize_progress $TESTPOOL $DISK2)"
+[[ -z "$orig_prog1" ]] && log_fail "Initializing did not start"
+
+log_must zpool split $TESTPOOL $TESTPOOL1 $DISK2
+
+# Ensure initializing continued as expected on the original pool.
+[[ "$(initialize_progress $TESTPOOL $DISK1)" -ge "$orig_prog1" ]] || \
+ log_fail "Initializing lost progress on original pool"
+log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_must zpool import $TESTPOOL1
+
+[[ "$(initialize_progress $TESTPOOL1 $DISK2)" -ge "$orig_prog2" ]] || \
+ log_fail "Initializing lost progress on split pool"
+log_mustnot eval "initialize_prog_line $TESTPOOL1 $DISK1 | grep suspended"
+
+log_pass "Initializing behaves as expected on zpool split"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh
new file mode 100755
index 000000000..59b266d32
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh
@@ -0,0 +1,60 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Cancelling and suspending initialize doesn't work if not all specified vdevs
+# are being initialized.
+#
+# STRATEGY:
+# 1. Create a three-disk pool.
+# 2. Start initializing and verify that initializing is active.
+# 3. Try to cancel and suspend initializing on the non-initializing disks.
+# 4. Try to re-initialize the currently initializing disk.
+#
+
+DISK1=${DISKS%% *}
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+DISK3="$(echo $DISKS | cut -d' ' -f3)"
+
+log_must zpool list -v
+log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3
+log_must zpool initialize $TESTPOOL $DISK1
+
+[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \
+ log_fail "Initialize did not start"
+
+log_mustnot zpool initialize -c $TESTPOOL $DISK2
+log_mustnot zpool initialize -c $TESTPOOL $DISK2 $DISK3
+
+log_mustnot zpool initialize -s $TESTPOOL $DISK2
+log_mustnot zpool initialize -s $TESTPOOL $DISK2 $DISK3
+
+log_mustnot zpool initialize $TESTPOOL $DISK1
+
+log_pass "Nonsensical initialize operations fail"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh
new file mode 100755
index 000000000..5003b5f10
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh
@@ -0,0 +1,52 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Starting and stopping an initialize works.
+#
+# STRATEGY:
+# 1. Create a one-disk pool.
+# 2. Start initializing and verify that initializing is active.
+# 3. Cancel initializing and verify that initializing is not active.
+#
+
+DISK1=${DISKS%% *}
+
+log_must zpool create -f $TESTPOOL $DISK1
+log_must zpool initialize $TESTPOOL
+
+[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \
+ log_fail "Initialize did not start"
+
+log_must zpool initialize -c $TESTPOOL
+
+[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] || \
+ log_fail "Initialize did not stop"
+
+log_pass "Initialize start + cancel works"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh
new file mode 100755
index 000000000..bce3da526
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh
@@ -0,0 +1,63 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Suspending and resuming initializing works.
+#
+# STRATEGY:
+# 1. Create a one-disk pool.
+# 2. Start initializing and verify that initializing is active.
+# 3. Wait 3 seconds, then suspend initializing and verify that the progress
+# reporting says so.
+# 4. Wait 5 seconds and ensure initializing progress doesn't advance.
+# 5. Restart initializing and verify that the progress doesn't regress.
+#
+
+DISK1=${DISKS%% *}
+
+log_must zpool create -f $TESTPOOL $DISK1
+log_must zpool initialize $TESTPOOL
+
+[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \
+ log_fail "Initializing did not start"
+
+sleep 5
+log_must zpool initialize -s $TESTPOOL
+log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+progress="$(initialize_progress $TESTPOOL $DISK1)"
+
+sleep 3
+[[ "$progress" -eq "$(initialize_progress $TESTPOOL $DISK1)" ]] || \
+ log_fail "Initializing progress advanced while suspended"
+
+log_must zpool initialize $TESTPOOL $DISK1
+[[ "$progress" -le "$(initialize_progress $TESTPOOL $DISK1)" ]] ||
+ log_fail "Initializing progress regressed after resuming"
+
+log_pass "Suspend + resume initializing works as expected"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh
new file mode 100755
index 000000000..bd4ca069c
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh
@@ -0,0 +1,74 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Attempting to initialize unsupported vdevs should fail.
+#
+# STRATEGY:
+# 1. Create a pool with the following configuration:
+# root
+# mirror
+# vdev0
+# vdev1 (offline)
+# cache
+# vdev2
+# spare
+# vdev3
+# 2. Try to initialize vdev1, vdev2, and vdev3. Ensure that all 3 fail.
+#
+function cleanup
+{
+ if datasetexists $TESTPOOL; then
+ log_must zpool destroy -f $TESTPOOL
+ fi
+ if [[ -d $TESTDIR ]]; then
+ log_must rm -rf $TESTDIR
+ fi
+}
+log_onexit cleanup
+
+log_must mkdir $TESTDIR
+set -A FDISKS
+for n in {0..2}; do
+ log_must mkfile $MINVDEVSIZE $TESTDIR/vdev$n
+ FDISKS+=("$TESTDIR/vdev$n")
+done
+FDISKS+=("${DISKS%% *}")
+
+log_must zpool create $TESTPOOL mirror ${FDISKS[0]} ${FDISKS[1]} \
+ spare ${FDISKS[2]} cache ${FDISKS[3]}
+
+log_must zpool offline $TESTPOOL ${FDISKS[1]}
+
+log_mustnot zpool initialize $TESTPOOL mirror-0
+for n in {1..3}; do
+ log_mustnot zpool initialize $TESTPOOL ${FDISKS[$n]}
+done
+
+log_pass "Attempting to initialize failed on unsupported devices"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh
new file mode 100755
index 000000000..9be752ff8
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh
@@ -0,0 +1,59 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Initializing does not cause file corruption.
+#
+# STRATEGY:
+# 1. Create a one-disk pool.
+# 2. Write data to the pool.
+# 3. Start initializing and verify that initializing is active.
+# 4. Write more data to the pool.
+# 5. Run zdb to validate checksums.
+#
+
+DISK1=${DISKS%% *}
+
+log_must zpool create -f $TESTPOOL $DISK1
+log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1M count=30
+log_must sync
+
+log_must zpool initialize $TESTPOOL
+
+log_must zdb -cc $TESTPOOL
+
+[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \
+ log_fail "Initializing did not start"
+
+log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=1M count=30
+log_must sync
+
+log_must zdb -cc $TESTPOOL
+
+log_pass "Initializing does not corrupt existing or new data"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh
new file mode 100755
index 000000000..0fa6a0be9
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh
@@ -0,0 +1,89 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# After initializing, the disk is actually initialized.
+#
+# STRATEGY:
+# 1. Create a one-disk pool.
+# 2. Initialize the disk to completion.
+# 3. Load all metaslabs that don't have a spacemap, and make sure the entire
+# metaslab has been filled with the initializing pattern (deadbeef).
+#
+
+function cleanup
+{
+ set_tunable64 zfs_initialize_value $ORIG_PATTERN
+ zpool import -d $TESTDIR $TESTPOOL
+
+ if datasetexists $TESTPOOL ; then
+ zpool destroy -f $TESTPOOL
+ fi
+ if [[ -d "$TESTDIR" ]]; then
+ rm -rf "$TESTDIR"
+ fi
+}
+log_onexit cleanup
+
+PATTERN="deadbeefdeadbeef"
+SMALLFILE="$TESTDIR/smallfile"
+
+ORIG_PATTERN=$(get_tunable zfs_initialize_value)
+log_must set_tunable64 zfs_initialize_value $(printf %llu 0x$PATTERN)
+
+log_must mkdir "$TESTDIR"
+log_must mkfile $MINVDEVSIZE "$SMALLFILE"
+log_must zpool create $TESTPOOL "$SMALLFILE"
+log_must zpool initialize $TESTPOOL
+
+while [[ "$(initialize_progress $TESTPOOL $SMALLFILE)" -lt "100" ]]; do
+ sleep 0.5
+done
+
+log_must zpool export $TESTPOOL
+
+spacemaps=0
+bs=512
+while read -r sm; do
+ typeset offset="$(echo $sm | cut -d ' ' -f1)"
+ typeset size="$(echo $sm | cut -d ' ' -f2)"
+
+ spacemaps=$((spacemaps + 1))
+ offset=$(((4 * 1024 * 1024) + 16#$offset))
+ out=$(dd if=$SMALLFILE skip=$(($offset / $bs)) \
+ count=$(($size / $bs)) bs=$bs 2>/dev/null | od -t x8 -Ad)
+ echo "$out" | log_must egrep "$PATTERN|\*|$size"
+done <<< "$(zdb -p $TESTDIR -Pme $TESTPOOL | egrep 'spacemap[ ]+0 ' | \
+ awk '{print $4, $8}')"
+
+if [[ $spacemaps -eq 0 ]];then
+ log_fail "Did not find any empty space maps to check"
+else
+ log_pass "Initializing wrote appropriate amount to disk"
+fi