diff options
54 files changed, 2743 insertions, 30 deletions
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 8a4ac35d5..4e6a699c4 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -97,6 +97,7 @@ static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); static int zpool_do_split(int, char **); +static int zpool_do_initialize(int, char **); static int zpool_do_scrub(int, char **); static int zpool_do_resilver(int, char **); @@ -150,6 +151,7 @@ typedef enum { HELP_ONLINE, HELP_REPLACE, HELP_REMOVE, + HELP_INITIALIZE, HELP_SCRUB, HELP_RESILVER, HELP_STATUS, @@ -278,6 +280,7 @@ static zpool_command_t command_table[] = { { "replace", zpool_do_replace, HELP_REPLACE }, { "split", zpool_do_split, HELP_SPLIT }, { NULL }, + { "initialize", zpool_do_initialize, HELP_INITIALIZE }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { "resilver", zpool_do_resilver, HELP_RESILVER }, { NULL }, @@ -360,6 +363,8 @@ get_usage(zpool_help_t idx) return (gettext("\tremove [-nps] <pool> <device> ...\n")); case HELP_REOPEN: return (gettext("\treopen [-n] <pool>\n")); + case HELP_INITIALIZE: + return (gettext("\tinitialize [-cs] <pool> [<device> ...]\n")); case HELP_SCRUB: return (gettext("\tscrub [-s | -p] <pool> ...\n")); case HELP_RESILVER: @@ -393,6 +398,27 @@ get_usage(zpool_help_t idx) /* NOTREACHED */ } +static void +zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) +{ + uint_t children = 0; + nvlist_t **child; + uint_t i; + + (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children); + + if (children == 0) { + char *path = zpool_vdev_name(g_zfs, zhp, nvroot, B_FALSE); + fnvlist_add_boolean(res, path); + free(path); + return; + } + + for (i = 0; i < children; i++) { + zpool_collect_leaves(zhp, child[i], res); + } +} /* * Callback routine that will print out a pool property value. @@ -480,6 +506,97 @@ usage(boolean_t requested) } /* + * zpool initialize [-cs] <pool> [<vdev> ...] + * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool + * if none specified. + * + * -c Cancel. Ends active initializing. + * -s Suspend. Initializing can then be restarted with no flags. + */ +int +zpool_do_initialize(int argc, char **argv) +{ + int c; + char *poolname; + zpool_handle_t *zhp; + nvlist_t *vdevs; + int err = 0; + + struct option long_options[] = { + {"cancel", no_argument, NULL, 'c'}, + {"suspend", no_argument, NULL, 's'}, + {0, 0, 0, 0} + }; + + pool_initialize_func_t cmd_type = POOL_INITIALIZE_DO; + while ((c = getopt_long(argc, argv, "cs", long_options, NULL)) != -1) { + switch (c) { + case 'c': + if (cmd_type != POOL_INITIALIZE_DO) { + (void) fprintf(stderr, gettext("-c cannot be " + "combined with other options\n")); + usage(B_FALSE); + } + cmd_type = POOL_INITIALIZE_CANCEL; + break; + case 's': + if (cmd_type != POOL_INITIALIZE_DO) { + (void) fprintf(stderr, gettext("-s cannot be " + "combined with other options\n")); + usage(B_FALSE); + } + cmd_type = POOL_INITIALIZE_SUSPEND; + break; + case '?': + if (optopt != 0) { + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } else { + (void) fprintf(stderr, + gettext("invalid option '%s'\n"), + argv[optind - 1]); + } + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + return (-1); + } + + poolname = argv[0]; + zhp = zpool_open(g_zfs, poolname); + if (zhp == NULL) + return (-1); + + vdevs = fnvlist_alloc(); + if (argc == 1) { + /* no individual leaf vdevs specified, so add them all */ + nvlist_t *config = zpool_get_config(zhp, NULL); + nvlist_t *nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + zpool_collect_leaves(zhp, nvroot, vdevs); + } else { + int i; + for (i = 1; i < argc; i++) { + fnvlist_add_boolean(vdevs, argv[i]); + } + } + + err = zpool_initialize(zhp, cmd_type, vdevs); + + fnvlist_free(vdevs); + zpool_close(zhp); + + return (err); +} + +/* * print a pool vdev config for dry runs */ static void @@ -1923,6 +2040,43 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } } + if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE || + vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) && + !vs->vs_scan_removing) { + char zbuf[1024]; + char tbuf[256]; + struct tm zaction_ts; + + time_t t = vs->vs_initialize_action_time; + int initialize_pct = 100; + if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) { + initialize_pct = (vs->vs_initialize_bytes_done * 100 / + (vs->vs_initialize_bytes_est + 1)); + } + + (void) localtime_r(&t, &zaction_ts); + (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); + + switch (vs->vs_initialize_state) { + case VDEV_INITIALIZE_SUSPENDED: + (void) snprintf(zbuf, sizeof (zbuf), + ", suspended, started at %s", tbuf); + break; + case VDEV_INITIALIZE_ACTIVE: + (void) snprintf(zbuf, sizeof (zbuf), + ", started at %s", tbuf); + break; + case VDEV_INITIALIZE_COMPLETE: + (void) snprintf(zbuf, sizeof (zbuf), + ", completed at %s", tbuf); + break; + } + + (void) printf(gettext(" (%d%% initialized%s)"), + initialize_pct, zbuf); + } + (void) printf("\n"); for (c = 0; c < children; c++) { diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index f9ba9b6d0..385984f84 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -106,6 +106,7 @@ #include <sys/zil_impl.h> #include <sys/vdev_impl.h> #include <sys/vdev_file.h> +#include <sys/vdev_initialize.h> #include <sys/spa_impl.h> #include <sys/metaslab_impl.h> #include <sys/dsl_prop.h> @@ -374,6 +375,7 @@ ztest_func_t ztest_spa_upgrade; ztest_func_t ztest_device_removal; ztest_func_t ztest_remap_blocks; ztest_func_t ztest_spa_checkpoint_create_discard; +ztest_func_t ztest_initialize; ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; @@ -427,6 +429,7 @@ ztest_info_t ztest_info[] = { ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes), ZTI_INIT(ztest_remap_blocks, 1, &zopt_sometimes), ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely), + ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), @@ -6343,6 +6346,97 @@ ztest_get_zdb_bin(char *bin, int len) strcpy(bin, "zdb"); } +static vdev_t * +ztest_random_concrete_vdev_leaf(vdev_t *vd) +{ + if (vd == NULL) + return (NULL); + + if (vd->vdev_children == 0) + return (vd); + + vdev_t *eligible[vd->vdev_children]; + int eligible_idx = 0, i; + for (i = 0; i < vd->vdev_children; i++) { + vdev_t *cvd = vd->vdev_child[i]; + if (cvd->vdev_top->vdev_removing) + continue; + if (cvd->vdev_children > 0 || + (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { + eligible[eligible_idx++] = cvd; + } + } + VERIFY(eligible_idx > 0); + + uint64_t child_no = ztest_random(eligible_idx); + return (ztest_random_concrete_vdev_leaf(eligible[child_no])); +} + +/* ARGSUSED */ +void +ztest_initialize(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + int error = 0; + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + /* Random leaf vdev */ + vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); + if (rand_vd == NULL) { + spa_config_exit(spa, SCL_VDEV, FTAG); + mutex_exit(&ztest_vdev_lock); + return; + } + + /* + * The random vdev we've selected may change as soon as we + * drop the spa_config_lock. We create local copies of things + * we're interested in. + */ + uint64_t guid = rand_vd->vdev_guid; + char *path = strdup(rand_vd->vdev_path); + boolean_t active = rand_vd->vdev_initialize_thread != NULL; + + zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid); + spa_config_exit(spa, SCL_VDEV, FTAG); + + uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); + error = spa_vdev_initialize(spa, guid, cmd); + switch (cmd) { + case POOL_INITIALIZE_CANCEL: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Cancel initialize %s", path); + if (!active) + (void) printf(" failed (no initialize active)"); + (void) printf("\n"); + } + break; + case POOL_INITIALIZE_DO: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Start initialize %s", path); + if (active && error == 0) + (void) printf(" failed (already active)"); + else if (error != 0) + (void) printf(" failed (error %d)", error); + (void) printf("\n"); + } + break; + case POOL_INITIALIZE_SUSPEND: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Suspend initialize %s", path); + if (!active) + (void) printf(" failed (no initialize active)"); + (void) printf("\n"); + } + break; + } + free(path); + mutex_exit(&ztest_vdev_lock); +} + /* * Verify pool integrity by running zdb. */ diff --git a/configure.ac b/configure.ac index 028cae338..be3e556c1 100644 --- a/configure.ac +++ b/configure.ac @@ -247,6 +247,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/cli_root/zpool_history/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_import/blockfiles/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/Makefile tests/zfs-tests/tests/functional/cli_root/zpool/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_offline/Makefile diff --git a/include/libzfs.h b/include/libzfs.h index c764e92dd..85b0bc0dd 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -139,6 +139,9 @@ typedef enum zfs_error { EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */ EZFS_VDEV_TOO_BIG, /* a device is too big to be used */ EZFS_IOC_NOTSUPPORTED, /* operation not supported by zfs module */ + EZFS_TOOMANY, /* argument list too long */ + EZFS_INITIALIZING, /* currently initializing */ + EZFS_NO_INITIALIZE, /* no active initialize */ EZFS_UNKNOWN } zfs_error_t; @@ -253,6 +256,8 @@ typedef struct splitflags { * Functions to manipulate pool and vdev state */ extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); +extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, + nvlist_t *); extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); extern int zpool_reguid(zpool_handle_t *); extern int zpool_reopen_one(zpool_handle_t *, void *); diff --git a/include/libzfs_core.h b/include/libzfs_core.h index f84270d7e..264ce3fa0 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -62,6 +62,8 @@ int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **); int lzc_load_key(const char *, boolean_t, uint8_t *, uint_t); int lzc_unload_key(const char *); int lzc_change_key(const char *, uint64_t, nvlist_t *, uint8_t *, uint_t); +int lzc_initialize(const char *, pool_initialize_func_t, nvlist_t *, + nvlist_t **); int lzc_snaprange_space(const char *, const char *, uint64_t *); diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 8bf376998..e6c82d113 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -96,6 +96,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/vdev_impl.h \ $(top_srcdir)/include/sys/vdev_indirect_births.h \ $(top_srcdir)/include/sys/vdev_indirect_mapping.h \ + $(top_srcdir)/include/sys/vdev_initialize.h \ $(top_srcdir)/include/sys/vdev_raidz.h \ $(top_srcdir)/include/sys/vdev_raidz_impl.h \ $(top_srcdir)/include/sys/vdev_removal.h \ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 85512618c..945853739 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -775,6 +775,13 @@ typedef struct zpool_load_policy { #define VDEV_ALLOC_BIAS_SPECIAL "special" #define VDEV_ALLOC_BIAS_DEDUP "dedup" +#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ + "com.delphix:next_offset_to_initialize" +#define VDEV_LEAF_ZAP_INITIALIZE_STATE \ + "com.delphix:vdev_initialize_state" +#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \ + "com.delphix:vdev_initialize_action_time" + /* * This is needed in userland to report the minimum necessary device size. */ @@ -988,10 +995,15 @@ typedef struct vdev_stat { uint64_t vs_read_errors; /* read errors */ uint64_t vs_write_errors; /* write errors */ uint64_t vs_checksum_errors; /* checksum errors */ + uint64_t vs_initialize_errors; /* initializing errors */ uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_fragmentation; /* device fragmentation */ + uint64_t vs_initialize_bytes_done; /* bytes initialized */ + uint64_t vs_initialize_bytes_est; /* total bytes to initialize */ + uint64_t vs_initialize_state; /* vdev_initialzing_state_t */ + uint64_t vs_initialize_action_time; /* time_t */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ uint64_t vs_resilver_deferred; /* resilver deferred */ uint64_t vs_slow_ios; /* slow IOs */ @@ -1023,7 +1035,6 @@ typedef struct vdev_stat_ex { #define VDEV_L_HISTO_BUCKETS 37 /* Latency histo buckets */ #define VDEV_RQ_HISTO_BUCKETS 25 /* Request size histo buckets */ - /* Amount of time in ZIO queue (ns) */ uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_L_HISTO_BUCKETS]; @@ -1051,6 +1062,16 @@ typedef struct vdev_stat_ex { } vdev_stat_ex_t; /* + * Initialize functions. + */ +typedef enum pool_initialize_func { + POOL_INITIALIZE_DO, + POOL_INITIALIZE_CANCEL, + POOL_INITIALIZE_SUSPEND, + POOL_INITIALIZE_FUNCS +} pool_initialize_func_t; + +/* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ @@ -1094,6 +1115,14 @@ typedef struct ddt_histogram { #define ZVOL_PROP_NAME "name" #define ZVOL_DEFAULT_BLOCKSIZE 8192 +typedef enum { + VDEV_INITIALIZE_NONE, + VDEV_INITIALIZE_ACTIVE, + VDEV_INITIALIZE_CANCELED, + VDEV_INITIALIZE_SUSPENDED, + VDEV_INITIALIZE_COMPLETE +} vdev_initializing_state_t; + /* * /dev/zfs ioctl numbers. * @@ -1184,6 +1213,7 @@ typedef enum zfs_ioc { ZFS_IOC_REMAP, /* 0x5a4c */ ZFS_IOC_POOL_CHECKPOINT, /* 0x5a4d */ ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */ + ZFS_IOC_POOL_INITIALIZE, /* 0x5a4f */ /* * Linux - 3/64 numbers reserved. @@ -1278,6 +1308,12 @@ typedef enum { #define ZPOOL_HIDDEN_ARGS "hidden_args" /* + * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. + */ +#define ZPOOL_INITIALIZE_COMMAND "initialize_command" +#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" + +/* * Flags for ZFS_IOC_VDEV_SET_STATE */ #define ZFS_ONLINE_CHECKREMOVE 0x1 diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index aa1c82a02..3e32eace6 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -68,7 +68,8 @@ typedef enum trace_alloc_type { TRACE_GROUP_FAILURE = -5ULL, TRACE_ENOSPC = -6ULL, TRACE_CONDENSING = -7ULL, - TRACE_VDEV_ERROR = -8ULL + TRACE_VDEV_ERROR = -8ULL, + TRACE_INITIALIZING = -9ULL } trace_alloc_type_t; #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) @@ -270,6 +271,11 @@ struct metaslab_group { uint64_t mg_failed_allocations; uint64_t mg_fragmentation; uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; + + int mg_ms_initializing; + boolean_t mg_initialize_updating; + kmutex_t mg_ms_initialize_lock; + kcondvar_t mg_ms_initialize_cv; }; /* @@ -360,6 +366,8 @@ struct metaslab { boolean_t ms_condense_wanted; uint64_t ms_condense_checked_txg; + uint64_t ms_initializing; /* leaves initializing this ms */ + /* * We must hold both ms_lock and ms_group->mg_lock in order to * modify ms_loaded. diff --git a/include/sys/spa.h b/include/sys/spa.h index 5dc27e334..4a66260ef 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -772,6 +772,7 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_AUTOEXPAND 0x20 #define SPA_ASYNC_REMOVE_DONE 0x40 #define SPA_ASYNC_REMOVE_STOP 0x80 +#define SPA_ASYNC_INITIALIZE_RESTART 0x100 /* * Controls the behavior of spa_vdev_remove(). @@ -787,6 +788,7 @@ extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern boolean_t spa_vdev_remove_active(spa_t *spa); +extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 6c13a548f..ae21e037e 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -82,6 +82,12 @@ typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg); typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, vdev_remap_cb_t callback, void *arg); +/* + * Given a target vdev, translates the logical range "in" to the physical + * range "res" + */ +typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in, + range_seg_t *res); typedef const struct vdev_ops { vdev_open_func_t *vdev_op_open; @@ -94,6 +100,11 @@ typedef const struct vdev_ops { vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; + /* + * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves. + * Used when initializing vdevs. Isn't used by leaf ops. + */ + vdev_xlation_func_t *vdev_op_xlate; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -250,6 +261,24 @@ struct vdev { /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ + boolean_t vdev_initialize_exit_wanted; + vdev_initializing_state_t vdev_initialize_state; + kthread_t *vdev_initialize_thread; + /* Protects vdev_initialize_thread and vdev_initialize_state. */ + kmutex_t vdev_initialize_lock; + kcondvar_t vdev_initialize_cv; + uint64_t vdev_initialize_offset[TXG_SIZE]; + uint64_t vdev_initialize_last_offset; + range_tree_t *vdev_initialize_tree; /* valid while initializing */ + uint64_t vdev_initialize_bytes_est; + uint64_t vdev_initialize_bytes_done; + time_t vdev_initialize_action_time; /* start and end time */ + + /* for limiting outstanding I/Os */ + kmutex_t vdev_initialize_io_lock; + kcondvar_t vdev_initialize_io_cv; + uint64_t vdev_initialize_inflight; + /* * Values stored in the config for an indirect or removing vdev. */ @@ -478,6 +507,8 @@ extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ +extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, + range_seg_t *out); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); diff --git a/include/sys/vdev_initialize.h b/include/sys/vdev_initialize.h new file mode 100644 index 000000000..db4b0572c --- /dev/null +++ b/include/sys/vdev_initialize.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_INITIALIZE_H +#define _SYS_VDEV_INITIALIZE_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vdev_initialize(vdev_t *vd); +extern void vdev_initialize_stop(vdev_t *vd, + vdev_initializing_state_t tgt_state); +extern void vdev_initialize_stop_all(vdev_t *vd, + vdev_initializing_state_t tgt_state); +extern void vdev_initialize_restart(vdev_t *vd); +extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, + range_seg_t *physical_rs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_INITIALIZE_H */ diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h index c2cc8b2d5..d8e6a1745 100644 --- a/include/sys/zio_priority.h +++ b/include/sys/zio_priority.h @@ -13,7 +13,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. */ #ifndef _ZIO_PRIORITY_H #define _ZIO_PRIORITY_H @@ -29,6 +29,7 @@ typedef enum zio_priority { ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ + ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ ZIO_PRIORITY_NUM_QUEUEABLE, ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */ } zio_priority_t; diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index bc320e516..f799471e4 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2092,6 +2092,100 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, return (ret); } +static int +xlate_init_err(int err) +{ + switch (err) { + case ENODEV: + return (EZFS_NODEVICE); + case EINVAL: + case EROFS: + return (EZFS_BADDEV); + case EBUSY: + return (EZFS_INITIALIZING); + case ESRCH: + return (EZFS_NO_INITIALIZE); + } + return (err); +} + +/* + * Begin, suspend, or cancel the initialization (initializing of all free + * blocks) for the given vdevs in the given pool. + */ +int +zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, + nvlist_t *vds) +{ + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + nvlist_t *errlist; + + /* translate vdev names to guids */ + nvlist_t *vdev_guids = fnvlist_alloc(); + nvlist_t *guids_to_paths = fnvlist_alloc(); + boolean_t spare, cache; + nvlist_t *tgt; + nvpair_t *elem; + + for (elem = nvlist_next_nvpair(vds, NULL); elem != NULL; + elem = nvlist_next_nvpair(vds, elem)) { + char *vd_path = nvpair_name(elem); + tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache, NULL); + + if ((tgt == NULL) || cache || spare) { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot initialize '%s'"), + vd_path); + int err = (tgt == NULL) ? EZFS_NODEVICE : + (spare ? EZFS_ISSPARE : EZFS_ISL2CACHE); + fnvlist_free(vdev_guids); + fnvlist_free(guids_to_paths); + return (zfs_error(hdl, err, msg)); + } + + uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); + fnvlist_add_uint64(vdev_guids, vd_path, guid); + + (void) snprintf(msg, sizeof (msg), "%llu", (u_longlong_t)guid); + fnvlist_add_string(guids_to_paths, msg, vd_path); + } + + int err = lzc_initialize(zhp->zpool_name, cmd_type, vdev_guids, + &errlist); + fnvlist_free(vdev_guids); + + if (err == 0) { + fnvlist_free(guids_to_paths); + return (0); + } + + nvlist_t *vd_errlist = NULL; + if (errlist != NULL) { + vd_errlist = fnvlist_lookup_nvlist(errlist, + ZPOOL_INITIALIZE_VDEVS); + } + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "operation failed")); + + for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL; + elem = nvlist_next_nvpair(vd_errlist, elem)) { + int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem)); + char *path = fnvlist_lookup_string(guids_to_paths, + nvpair_name(elem)); + (void) zfs_error_fmt(hdl, vd_error, "cannot initialize '%s'", + path); + } + + fnvlist_free(guids_to_paths); + if (vd_errlist != NULL) + return (-1); + + return (zpool_standard_error(hdl, err, msg)); +} + /* * Scan the pool. */ diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 776d887a2..d7401cdf4 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -283,6 +283,13 @@ libzfs_error_description(libzfs_handle_t *hdl) "different host")); case EZFS_CRYPTOFAILED: return (dgettext(TEXT_DOMAIN, "encryption failure")); + case EZFS_TOOMANY: + return (dgettext(TEXT_DOMAIN, "argument list too long")); + case EZFS_INITIALIZING: + return (dgettext(TEXT_DOMAIN, "currently initializing")); + case EZFS_NO_INITIALIZE: + return (dgettext(TEXT_DOMAIN, "there is no active " + "initialization")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index 1baec04a4..524a637e4 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -1397,3 +1397,40 @@ lzc_reopen(const char *pool_name, boolean_t scrub_restart) nvlist_free(args); return (error); } + +/* + * Changes initializing state. + * + * vdevs should be a list of (<key>, guid) where guid is a uint64 vdev GUID. + * The key is ignored. + * + * If there are errors related to vdev arguments, per-vdev errors are returned + * in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where + * guid is stringified with PRIu64, and errno is one of the following as + * an int64_t: + * - ENODEV if the device was not found + * - EINVAL if the devices is not a leaf or is not concrete (e.g. missing) + * - EROFS if the device is not writeable + * - EBUSY start requested but the device is already being initialized + * - ESRCH cancel/suspend requested but device is not being initialized + * + * If the errlist is empty, then return value will be: + * - EINVAL if one or more arguments was invalid + * - Other spa_open failures + * - 0 if the operation succeeded + */ +int +lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type, + nvlist_t *vdevs, nvlist_t **errlist) +{ + int error; + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_uint64(args, ZPOOL_INITIALIZE_COMMAND, (uint64_t)cmd_type); + fnvlist_add_nvlist(args, ZPOOL_INITIALIZE_VDEVS, vdevs); + + error = lzc_ioctl(ZFS_IOC_POOL_INITIALIZE, poolname, args, errlist); + + fnvlist_free(args); + + return (error); +} diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index efc44b27e..e13bb0f58 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -113,6 +113,7 @@ KERNEL_C = \ vdev_indirect_births.c \ vdev_indirect.c \ vdev_indirect_mapping.c \ + vdev_initialize.c \ vdev_label.c \ vdev_mirror.c \ vdev_missing.c \ diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 55115c266..1dbf865f4 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1375,6 +1375,30 @@ Default value: \fB2\fR. .sp .ne 2 .na +\fBzfs_vdev_initializing_max_active\fR (int) +.ad +.RS 12n +Maximum initializing I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_initializing_min_active\fR (int) +.ad +.RS 12n +Minimum initializing I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na \fBzfs_vdev_max_active\fR (int) .ad .RS 12n @@ -1388,6 +1412,30 @@ Default value: \fB1,000\fR. .sp .ne 2 .na +\fBzfs_vdev_removal_max_active\fR (int) +.ad +.RS 12n +Maximum removal I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB2\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_removal_min_active\fR (int) +.ad +.RS 12n +Minimum removal I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na \fBzfs_vdev_scrub_max_active\fR (int) .ad .RS 12n @@ -1615,6 +1663,17 @@ Default value: \fB32,768\fR. .sp .ne 2 .na +\fBzfs_initialize_value\fR (ulong) +.ad +.RS 12n +Pattern written to vdev free space by \fBzpool initialize\fR. +.sp +Default value: \fB16,045,690,984,833,335,022\fR (0xdeadbeefdeadbeee). +.RE + +.sp +.ne 2 +.na \fBzfs_lua_max_instrlimit\fR (ulong) .ad .RS 12n diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 83e50bd01..a05d4d1f3 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -115,6 +115,11 @@ .Ar pool Ns | Ns Ar id .Op Ar newpool Oo Fl t Oc .Nm +.Cm initialize +.Op Fl cs +.Ar pool +.Op Ar device Ns ... +.Nm .Cm iostat .Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw .Op Fl T Sy u Ns | Ns Sy d @@ -1597,6 +1602,32 @@ Will also set -o cachefile=none when not explicitly specified. .El .It Xo .Nm +.Cm initialize +.Op Fl cs +.Ar pool +.Op Ar device Ns ... +.Xc +Begins initializing by writing to all unallocated regions on the specified +devices, or all eligible devices in the pool if no individual devices are +specified. +Only leaf data or log devices may be initialized. +.Bl -tag -width Ds +.It Fl c, -cancel +Cancel initializing on the specified devices, or all eligible devices if none +are specified. +If one or more target devices are invalid or are not currently being +initialized, the command will fail and no cancellation will occur on any device. +.It Fl s -suspend +Suspend initializing on the specified devices, or all eligible devices if none +are specified. +If one or more target devices are invalid or are not currently being +initialized, the command will fail and no suspension will occur on any device. +Initializing can then be resumed by running +.Nm zpool Cm initialize +with no flags on the relevant target devices. +.El +.It Xo +.Nm .Cm iostat .Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw .Op Fl T Sy u Ns | Ns Sy d diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index a243f51d8..193bdc510 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -89,6 +89,7 @@ $(MODULE)-objs += vdev_file.o $(MODULE)-objs += vdev_indirect.o $(MODULE)-objs += vdev_indirect_births.o $(MODULE)-objs += vdev_indirect_mapping.o +$(MODULE)-objs += vdev_initialize.o $(MODULE)-objs += vdev_label.o $(MODULE)-objs += vdev_mirror.o $(MODULE)-objs += vdev_missing.o diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 4b5baf6a6..71688b420 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -635,6 +635,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL); mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), KM_SLEEP); mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), @@ -681,6 +683,8 @@ metaslab_group_destroy(metaslab_group_t *mg) kmem_free(mg->mg_secondaries, mg->mg_allocators * sizeof (metaslab_t *)); mutex_destroy(&mg->mg_lock); + mutex_destroy(&mg->mg_ms_initialize_lock); + cv_destroy(&mg->mg_ms_initialize_cv); for (int i = 0; i < mg->mg_allocators; i++) { zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]); @@ -1502,6 +1506,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); + ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; @@ -2686,6 +2691,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * from it in 'metaslab_unload_delay' txgs, then unload it. */ if (msp->ms_loaded && + msp->ms_initializing == 0 && msp->ms_selected_txg + metaslab_unload_delay < txg) { for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { @@ -2967,6 +2973,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) metaslab_class_t *mc = msp->ms_group->mg_class; VERIFY(!msp->ms_condensing); + VERIFY0(msp->ms_initializing); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { @@ -3027,9 +3034,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, } /* - * If the selected metaslab is condensing, skip it. + * If the selected metaslab is condensing or being + * initialized, skip it. */ - if (msp->ms_condensing) + if (msp->ms_condensing || msp->ms_initializing > 0) continue; *was_active = msp->ms_allocator != -1; @@ -3190,7 +3198,9 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, /* * If this metaslab is currently condensing then pick again as * we can't manipulate this metaslab until it's committed - * to disk. + * to disk. If this metaslab is being initialized, we shouldn't + * allocate from it since the allocated region might be + * overwritten after allocation. */ if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, @@ -3199,6 +3209,13 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, ~METASLAB_ACTIVE_MASK); mutex_exit(&msp->ms_lock); continue; + } else if (msp->ms_initializing > 0) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_INITIALIZING, allocator); + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + mutex_exit(&msp->ms_lock); + continue; } offset = metaslab_block_alloc(msp, asize, txg); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index f0683b0b8..622be75f9 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -56,6 +56,7 @@ #include <sys/vdev_removal.h> #include <sys/vdev_indirect_mapping.h> #include <sys/vdev_indirect_births.h> +#include <sys/vdev_initialize.h> #include <sys/vdev_disk.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> @@ -434,8 +435,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); - if ((err = dsl_dataset_hold_obj(dp, - za.za_first_integer, FTAG, &ds))) { + err = dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &ds); + if (err != 0) { dsl_pool_config_exit(dp, FTAG); break; } @@ -601,7 +603,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) } error = dmu_objset_hold(strval, FTAG, &os); - if (error) + if (error != 0) break; /* @@ -1218,8 +1220,10 @@ spa_activate(spa_t *spa, int mode) spa_create_zio_taskqs(spa); } - for (size_t i = 0; i < TXG_SIZE; i++) - spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0); + for (size_t i = 0; i < TXG_SIZE; i++) { + spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); @@ -1437,6 +1441,11 @@ spa_unload(spa_t *spa) */ spa_async_suspend(spa); + if (spa->spa_root_vdev) { + vdev_initialize_stop_all(spa->spa_root_vdev, + VDEV_INITIALIZE_ACTIVE); + } + /* * Stop syncing. */ @@ -1452,10 +1461,10 @@ spa_unload(spa_t *spa) * calling taskq_wait(mg_taskq). */ if (spa->spa_root_vdev != NULL) { - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); - spa_config_exit(spa, SCL_ALL, FTAG); + spa_config_exit(spa, SCL_ALL, spa); } if (spa->spa_mmp.mmp_thread) @@ -1492,7 +1501,7 @@ spa_unload(spa_t *spa) bpobj_close(&spa->spa_deferred_bpobj); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); /* * Close all vdevs. @@ -1554,7 +1563,7 @@ spa_unload(spa_t *spa) spa->spa_comment = NULL; } - spa_config_exit(spa, SCL_ALL, FTAG); + spa_config_exit(spa, SCL_ALL, spa); } /* @@ -4246,6 +4255,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) */ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_initialize_restart(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_load_note(spa, "LOADED"); @@ -5654,6 +5666,18 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, } /* + * We're about to export or destroy this pool. Make sure + * we stop all initializtion activity here before we + * set the spa_final_txg. This will ensure that all + * dirty data resulting from the initialization is + * committed to disk before we unload the pool. + */ + if (spa->spa_root_vdev != NULL) { + vdev_initialize_stop_all(spa->spa_root_vdev, + VDEV_INITIALIZE_ACTIVE); + } + + /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. @@ -6357,6 +6381,86 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) return (error); } +int +spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type) +{ + /* + * We hold the namespace lock through the whole function + * to prevent any changes to the pool while we're starting or + * stopping initialization. The config and state locks are held so that + * we can properly assess the vdev state before we commit to + * the initializing operation. + */ + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + + /* Look up vdev and ensure it's a leaf. */ + vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_detached) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ENODEV)); + } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EINVAL)); + } else if (!vdev_writeable(vd)) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EROFS)); + } + mutex_enter(&vd->vdev_initialize_lock); + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + + /* + * When we activate an initialize action we check to see + * if the vdev_initialize_thread is NULL. We do this instead + * of using the vdev_initialize_state since there might be + * a previous initialization process which has completed but + * the thread is not exited. + */ + if (cmd_type == POOL_INITIALIZE_DO && + (vd->vdev_initialize_thread != NULL || + vd->vdev_top->vdev_removing)) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EBUSY)); + } else if (cmd_type == POOL_INITIALIZE_CANCEL && + (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && + vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ESRCH)); + } else if (cmd_type == POOL_INITIALIZE_SUSPEND && + vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ESRCH)); + } + + switch (cmd_type) { + case POOL_INITIALIZE_DO: + vdev_initialize(vd); + break; + case POOL_INITIALIZE_CANCEL: + vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); + break; + case POOL_INITIALIZE_SUSPEND: + vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED); + break; + default: + panic("invalid cmd_type %llu", (unsigned long long)cmd_type); + } + mutex_exit(&vd->vdev_initialize_lock); + + /* Sync out the initializing state */ + txg_wait_synced(spa->spa_dsl_pool, 0); + mutex_exit(&spa_namespace_lock); + + return (0); +} + + /* * Split a set of devices from their mirrors, and create a new pool from them. */ @@ -6565,6 +6669,19 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); + for (c = 0; c < children; c++) { + if (vml[c] != NULL) { + /* + * Temporarily stop the initializing activity. We set + * the state to ACTIVE so that we know to resume + * the initializing once the split has completed. + */ + mutex_enter(&vml[c]->vdev_initialize_lock); + vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE); + mutex_exit(&vml[c]->vdev_initialize_lock); + } + } + newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; /* create the new pool from the disks of the original pool */ @@ -6652,6 +6769,10 @@ out: if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } + + /* restart initializing disks as necessary */ + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); @@ -7025,6 +7146,14 @@ spa_async_thread(void *arg) !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) dsl_resilver_restart(dp, 0); + if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_initialize_restart(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_exit(&spa_namespace_lock); + } + /* * Let the world know that we're done. */ @@ -7677,8 +7806,9 @@ spa_sync(spa_t *spa, uint64_t txg) * Wait for i/os issued in open context that need to complete * before this txg syncs. */ - VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK])); - spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0); + (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); + spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); /* * Lock out configuration changes. @@ -7983,7 +8113,8 @@ spa_sync(spa_t *spa, uint64_t txg) /* * Update usable space statistics. */ - while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))) + while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) + != NULL) vdev_sync_done(vd, txg); spa_update_dspace(spa); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index a3ac70f07..dfac92d45 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -38,6 +38,7 @@ #include <sys/zap.h> #include <sys/zil.h> #include <sys/vdev_impl.h> +#include <sys/vdev_initialize.h> #include <sys/vdev_file.h> #include <sys/vdev_raidz.h> #include <sys/metaslab.h> @@ -1194,6 +1195,12 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); + if (vd->vdev_ops->vdev_op_leaf) { + mutex_enter(&vd->vdev_initialize_lock); + vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); + mutex_exit(&vd->vdev_initialize_lock); + } + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); vdev_free(vd); spa_config_exit(spa, SCL_ALL, spa); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 8273e7907..f808f8ee7 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -50,6 +50,7 @@ #include <sys/zil.h> #include <sys/dsl_scan.h> #include <sys/abd.h> +#include <sys/vdev_initialize.h> #include <sys/zvol.h> #include <sys/zfs_ratelimit.h> @@ -212,6 +213,14 @@ vdev_getops(const char *type) return (ops); } +/* ARGSUSED */ +void +vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) +{ + res->rs_start = in->rs_start; + res->rs_end = in->rs_end; +} + /* * Derive the enumerated alloction bias from string input. * String origin is either the per-vdev zap or zpool(1M). @@ -526,6 +535,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL); @@ -850,6 +863,7 @@ void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the @@ -880,6 +894,7 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + ASSERT(vd->vdev_initialize_thread == NULL); /* * Discard allocation state. @@ -957,6 +972,10 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); + mutex_destroy(&vd->vdev_initialize_lock); + mutex_destroy(&vd->vdev_initialize_io_lock); + cv_destroy(&vd->vdev_initialize_io_cv); + cv_destroy(&vd->vdev_initialize_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); @@ -3207,7 +3226,8 @@ vdev_sync_done(vdev_t *vd, uint64_t txg) ASSERT(vdev_is_concrete(vd)); - while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))) + while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) + != NULL) metaslab_sync_done(msp, txg); if (reassess) @@ -3458,6 +3478,15 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } + /* Restart initializing if necessary */ + mutex_enter(&vd->vdev_initialize_lock); + if (vdev_writeable(vd) && + vd->vdev_initialize_thread == NULL && + vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { + (void) vdev_initialize(vd); + } + mutex_exit(&vd->vdev_initialize_lock); + if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) @@ -3848,9 +3877,22 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf) { vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + /* + * Report intializing progress. Since we don't + * have the initializing locks held, this is only + * an estimate (although a fairly accurate one). + */ + vs->vs_initialize_bytes_done = + vd->vdev_initialize_bytes_done; + vs->vs_initialize_bytes_est = + vd->vdev_initialize_bytes_est; + vs->vs_initialize_state = vd->vdev_initialize_state; + vs->vs_initialize_action_time = + vd->vdev_initialize_action_time; + } /* * Report expandable space on top-level, non-auxillary devices * only. The expandable space is reported in terms of metaslab diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 9c44ba12a..d13f365dd 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -890,6 +890,7 @@ vdev_ops_t vdev_disk_ops = { vdev_disk_hold, vdev_disk_rele, NULL, + vdev_default_xlate, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index bd7e0bc2e..3551898e0 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -254,6 +254,7 @@ vdev_ops_t vdev_file_ops = { vdev_file_hold, vdev_file_rele, NULL, + vdev_default_xlate, VDEV_TYPE_FILE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -289,6 +290,7 @@ vdev_ops_t vdev_disk_ops = { vdev_file_hold, vdev_file_rele, NULL, + vdev_default_xlate, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 724457df4..070d1b8d9 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1857,6 +1857,7 @@ vdev_ops_t vdev_indirect_ops = { NULL, NULL, vdev_indirect_remap, + NULL, VDEV_TYPE_INDIRECT, /* name of this vdev type */ B_FALSE /* leaf vdev */ }; diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c new file mode 100644 index 000000000..fcd2c76f9 --- /dev/null +++ b/module/zfs/vdev_initialize.c @@ -0,0 +1,819 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/txg.h> +#include <sys/vdev_impl.h> +#include <sys/refcount.h> +#include <sys/metaslab_impl.h> +#include <sys/dsl_synctask.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> + +/* + * Maximum number of metaslabs per group that can be initialized + * simultaneously. + */ +int max_initialize_ms = 3; + +/* + * Value that is written to disk during initialization. + */ +#ifdef _ILP32 +unsigned long zfs_initialize_value = 0xdeadbeefUL; +#else +unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL; +#endif + +/* maximum number of I/Os outstanding per leaf vdev */ +int zfs_initialize_limit = 1; + +/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ +uint64_t zfs_initialize_chunk_size = 1024 * 1024; + +static boolean_t +vdev_initialize_should_stop(vdev_t *vd) +{ + return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || + vd->vdev_detached || vd->vdev_top->vdev_removing); +} + +static void +vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) +{ + /* + * We pass in the guid instead of the vdev_t since the vdev may + * have been freed prior to the sync task being processed. This + * happens when a vdev is detached as we call spa_config_vdev_exit(), + * stop the intializing thread, schedule the sync task, and free + * the vdev. Later when the scheduled sync task is invoked, it would + * find that the vdev has been freed. + */ + uint64_t guid = *(uint64_t *)arg; + uint64_t txg = dmu_tx_get_txg(tx); + kmem_free(arg, sizeof (uint64_t)); + + vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + return; + + uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; + vd->vdev_initialize_offset[txg & TXG_MASK] = 0; + + VERIFY(vd->vdev_leaf_zap != 0); + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + + if (last_offset > 0) { + vd->vdev_initialize_last_offset = last_offset; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, + sizeof (last_offset), 1, &last_offset, tx)); + } + if (vd->vdev_initialize_action_time > 0) { + uint64_t val = (uint64_t)vd->vdev_initialize_action_time; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val), + 1, &val, tx)); + } + + uint64_t initialize_state = vd->vdev_initialize_state; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1, + &initialize_state, tx)); +} + +static void +vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + spa_t *spa = vd->vdev_spa; + + if (new_state == vd->vdev_initialize_state) + return; + + /* + * Copy the vd's guid, this will be freed by the sync task. + */ + uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + *guid = vd->vdev_guid; + + /* + * If we're suspending, then preserving the original start time. + */ + if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { + vd->vdev_initialize_action_time = gethrestime_sec(); + } + vd->vdev_initialize_state = new_state; + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, + guid, 2, ZFS_SPACE_CHECK_RESERVED, tx); + + switch (new_state) { + case VDEV_INITIALIZE_ACTIVE: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s activated", vd->vdev_path); + break; + case VDEV_INITIALIZE_SUSPENDED: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s suspended", vd->vdev_path); + break; + case VDEV_INITIALIZE_CANCELED: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s canceled", vd->vdev_path); + break; + case VDEV_INITIALIZE_COMPLETE: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s complete", vd->vdev_path); + break; + default: + panic("invalid state %llu", (unsigned long long)new_state); + } + + dmu_tx_commit(tx); +} + +static void +vdev_initialize_cb(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + mutex_enter(&vd->vdev_initialize_io_lock); + if (zio->io_error == ENXIO && !vdev_writeable(vd)) { + /* + * The I/O failed because the vdev was unavailable; roll the + * last offset back. (This works because spa_sync waits on + * spa_txg_zio before it runs sync tasks.) + */ + uint64_t *off = + &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK]; + *off = MIN(*off, zio->io_offset); + } else { + /* + * Since initializing is best-effort, we ignore I/O errors and + * rely on vdev_probe to determine if the errors are more + * critical. + */ + if (zio->io_error != 0) + vd->vdev_stat.vs_initialize_errors++; + + vd->vdev_initialize_bytes_done += zio->io_orig_size; + } + ASSERT3U(vd->vdev_initialize_inflight, >, 0); + vd->vdev_initialize_inflight--; + cv_broadcast(&vd->vdev_initialize_io_cv); + mutex_exit(&vd->vdev_initialize_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} + +/* Takes care of physical writing and limiting # of concurrent ZIOs. */ +static int +vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) +{ + spa_t *spa = vd->vdev_spa; + + /* Limit inflight initializing I/Os */ + mutex_enter(&vd->vdev_initialize_io_lock); + while (vd->vdev_initialize_inflight >= zfs_initialize_limit) { + cv_wait(&vd->vdev_initialize_io_cv, + &vd->vdev_initialize_io_lock); + } + vd->vdev_initialize_inflight++; + mutex_exit(&vd->vdev_initialize_io_lock); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); + mutex_enter(&vd->vdev_initialize_lock); + + if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) { + uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + *guid = vd->vdev_guid; + + /* This is the first write of this txg. */ + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_initialize_zap_update_sync, guid, 2, + ZFS_SPACE_CHECK_RESERVED, tx); + } + + /* + * We know the vdev struct will still be around since all + * consumers of vdev_free must stop the initialization first. + */ + if (vdev_initialize_should_stop(vd)) { + mutex_enter(&vd->vdev_initialize_io_lock); + ASSERT3U(vd->vdev_initialize_inflight, >, 0); + vd->vdev_initialize_inflight--; + mutex_exit(&vd->vdev_initialize_io_lock); + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); + mutex_exit(&vd->vdev_initialize_lock); + dmu_tx_commit(tx); + return (SET_ERROR(EINTR)); + } + mutex_exit(&vd->vdev_initialize_lock); + + vd->vdev_initialize_offset[txg & TXG_MASK] = start + size; + zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start, + size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL, + ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE)); + /* vdev_initialize_cb releases SCL_STATE_ALL */ + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Translate a logical range to the physical range for the specified vdev_t. + * This function is initially called with a leaf vdev and will walk each + * parent vdev until it reaches a top-level vdev. Once the top-level is + * reached the physical range is initialized and the recursive function + * begins to unwind. As it unwinds it calls the parent's vdev specific + * translation function to do the real conversion. + */ +void +vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) +{ + /* + * Walk up the vdev tree + */ + if (vd != vd->vdev_top) { + vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); + } else { + /* + * We've reached the top-level vdev, initialize the + * physical range to the logical range and start to + * unwind. + */ + physical_rs->rs_start = logical_rs->rs_start; + physical_rs->rs_end = logical_rs->rs_end; + return; + } + + vdev_t *pvd = vd->vdev_parent; + ASSERT3P(pvd, !=, NULL); + ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); + + /* + * As this recursive function unwinds, translate the logical + * range into its physical components by calling the + * vdev specific translate function. + */ + range_seg_t intermediate = { { { 0, 0 } } }; + pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); + + physical_rs->rs_start = intermediate.rs_start; + physical_rs->rs_end = intermediate.rs_end; +} + +/* + * Callback to fill each ABD chunk with zfs_initialize_value. len must be + * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD + * allocation will guarantee these for us. + */ +/* ARGSUSED */ +static int +vdev_initialize_block_fill(void *buf, size_t len, void *unused) +{ + ASSERT0(len % sizeof (uint64_t)); +#ifdef _ILP32 + for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) { + *(uint32_t *)((char *)(buf) + i) = zfs_initialize_value; + } +#else + for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) { + *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value; + } +#endif + return (0); +} + +static abd_t * +vdev_initialize_block_alloc(void) +{ + /* Allocate ABD for filler data */ + abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE); + + ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t)); + (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size, + vdev_initialize_block_fill, NULL); + + return (data); +} + +static void +vdev_initialize_block_free(abd_t *data) +{ + abd_free(data); +} + +static int +vdev_initialize_ranges(vdev_t *vd, abd_t *data) +{ + avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root; + + for (range_seg_t *rs = avl_first(rt); rs != NULL; + rs = AVL_NEXT(rt, rs)) { + uint64_t size = rs->rs_end - rs->rs_start; + + /* Split range into legally-sized physical chunks */ + uint64_t writes_required = + ((size - 1) / zfs_initialize_chunk_size) + 1; + + for (uint64_t w = 0; w < writes_required; w++) { + int error; + + error = vdev_initialize_write(vd, + VDEV_LABEL_START_SIZE + rs->rs_start + + (w * zfs_initialize_chunk_size), + MIN(size - (w * zfs_initialize_chunk_size), + zfs_initialize_chunk_size), data); + if (error != 0) + return (error); + } + } + return (0); +} + +static void +vdev_initialize_ms_load(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + metaslab_load_wait(msp); + if (!msp->ms_loaded) + VERIFY0(metaslab_load(msp)); +} + +static void +vdev_initialize_mg_wait(metaslab_group_t *mg) +{ + ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); + while (mg->mg_initialize_updating) { + cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); + } +} + +static void +vdev_initialize_mg_mark(metaslab_group_t *mg) +{ + ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); + ASSERT(mg->mg_initialize_updating); + + while (mg->mg_ms_initializing >= max_initialize_ms) { + cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); + } + mg->mg_ms_initializing++; + ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms); +} + +/* + * Mark the metaslab as being initialized to prevent any allocations + * on this metaslab. We must also track how many metaslabs are currently + * being initialized within a metaslab group and limit them to prevent + * allocation failures from occurring because all metaslabs are being + * initialized. + */ +static void +vdev_initialize_ms_mark(metaslab_t *msp) +{ + ASSERT(!MUTEX_HELD(&msp->ms_lock)); + metaslab_group_t *mg = msp->ms_group; + + mutex_enter(&mg->mg_ms_initialize_lock); + + /* + * To keep an accurate count of how many threads are initializing + * a specific metaslab group, we only allow one thread to mark + * the metaslab group at a time. This ensures that the value of + * ms_initializing will be accurate when we decide to mark a metaslab + * group as being initialized. To do this we force all other threads + * to wait till the metaslab's mg_initialize_updating flag is no + * longer set. + */ + vdev_initialize_mg_wait(mg); + mg->mg_initialize_updating = B_TRUE; + if (msp->ms_initializing == 0) { + vdev_initialize_mg_mark(mg); + } + mutex_enter(&msp->ms_lock); + msp->ms_initializing++; + mutex_exit(&msp->ms_lock); + + mg->mg_initialize_updating = B_FALSE; + cv_broadcast(&mg->mg_ms_initialize_cv); + mutex_exit(&mg->mg_ms_initialize_lock); +} + +static void +vdev_initialize_ms_unmark(metaslab_t *msp) +{ + ASSERT(!MUTEX_HELD(&msp->ms_lock)); + metaslab_group_t *mg = msp->ms_group; + mutex_enter(&mg->mg_ms_initialize_lock); + mutex_enter(&msp->ms_lock); + if (--msp->ms_initializing == 0) { + mg->mg_ms_initializing--; + cv_broadcast(&mg->mg_ms_initialize_cv); + } + mutex_exit(&msp->ms_lock); + mutex_exit(&mg->mg_ms_initialize_lock); +} + +static void +vdev_initialize_calculate_progress(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || + spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); + ASSERT(vd->vdev_leaf_zap != 0); + + vd->vdev_initialize_bytes_est = 0; + vd->vdev_initialize_bytes_done = 0; + + for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_top->vdev_ms[i]; + mutex_enter(&msp->ms_lock); + + uint64_t ms_free = msp->ms_size - + space_map_allocated(msp->ms_sm); + + if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) + ms_free /= vd->vdev_top->vdev_children; + + /* + * Convert the metaslab range to a physical range + * on our vdev. We use this to determine if we are + * in the middle of this metaslab range. + */ + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = msp->ms_start; + logical_rs.rs_end = msp->ms_start + msp->ms_size; + vdev_xlate(vd, &logical_rs, &physical_rs); + + if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { + vd->vdev_initialize_bytes_est += ms_free; + mutex_exit(&msp->ms_lock); + continue; + } else if (vd->vdev_initialize_last_offset > + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += ms_free; + vd->vdev_initialize_bytes_est += ms_free; + mutex_exit(&msp->ms_lock); + continue; + } + + /* + * If we get here, we're in the middle of initializing this + * metaslab. Load it and walk the free tree for more accurate + * progress estimation. + */ + vdev_initialize_ms_load(msp); + + for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); + rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { + logical_rs.rs_start = rs->rs_start; + logical_rs.rs_end = rs->rs_end; + vdev_xlate(vd, &logical_rs, &physical_rs); + + uint64_t size = physical_rs.rs_end - + physical_rs.rs_start; + vd->vdev_initialize_bytes_est += size; + if (vd->vdev_initialize_last_offset > + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += size; + } else if (vd->vdev_initialize_last_offset > + physical_rs.rs_start && + vd->vdev_initialize_last_offset < + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += + vd->vdev_initialize_last_offset - + physical_rs.rs_start; + } + } + mutex_exit(&msp->ms_lock); + } +} + +static int +vdev_initialize_load(vdev_t *vd) +{ + int err = 0; + ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || + spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); + ASSERT(vd->vdev_leaf_zap != 0); + + if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE || + vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) { + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, + sizeof (vd->vdev_initialize_last_offset), 1, + &vd->vdev_initialize_last_offset); + if (err == ENOENT) { + vd->vdev_initialize_last_offset = 0; + err = 0; + } + } + + vdev_initialize_calculate_progress(vd); + return (err); +} + + +/* + * Convert the logical range into a physcial range and add it to our + * avl tree. + */ +void +vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) +{ + vdev_t *vd = arg; + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = start; + logical_rs.rs_end = start + size; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + vdev_xlate(vd, &logical_rs, &physical_rs); + + IMPLY(vd->vdev_top == vd, + logical_rs.rs_start == physical_rs.rs_start); + IMPLY(vd->vdev_top == vd, + logical_rs.rs_end == physical_rs.rs_end); + + /* Only add segments that we have not visited yet */ + if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) + return; + + /* Pick up where we left off mid-range. */ + if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { + zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " + "(%llu, %llu)", vd->vdev_path, + (u_longlong_t)physical_rs.rs_start, + (u_longlong_t)physical_rs.rs_end, + (u_longlong_t)vd->vdev_initialize_last_offset, + (u_longlong_t)physical_rs.rs_end); + ASSERT3U(physical_rs.rs_end, >, + vd->vdev_initialize_last_offset); + physical_rs.rs_start = vd->vdev_initialize_last_offset; + } + ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); + + /* + * With raidz, it's possible that the logical range does not live on + * this leaf vdev. We only add the physical range to this vdev's if it + * has a length greater than 0. + */ + if (physical_rs.rs_end > physical_rs.rs_start) { + range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, + physical_rs.rs_end - physical_rs.rs_start); + } else { + ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); + } +} + +static void +vdev_initialize_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + int error = 0; + uint64_t ms_count = 0; + + ASSERT(vdev_is_concrete(vd)); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + vd->vdev_initialize_last_offset = 0; + VERIFY0(vdev_initialize_load(vd)); + + abd_t *deadbeef = vdev_initialize_block_alloc(); + + vd->vdev_initialize_tree = range_tree_create(NULL, NULL); + + for (uint64_t i = 0; !vd->vdev_detached && + i < vd->vdev_top->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_top->vdev_ms[i]; + + /* + * If we've expanded the top-level vdev or it's our + * first pass, calculate our progress. + */ + if (vd->vdev_top->vdev_ms_count != ms_count) { + vdev_initialize_calculate_progress(vd); + ms_count = vd->vdev_top->vdev_ms_count; + } + + vdev_initialize_ms_mark(msp); + mutex_enter(&msp->ms_lock); + vdev_initialize_ms_load(msp); + + range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, + vd); + mutex_exit(&msp->ms_lock); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + error = vdev_initialize_ranges(vd, deadbeef); + vdev_initialize_ms_unmark(msp); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); + if (error != 0) + break; + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_enter(&vd->vdev_initialize_io_lock); + while (vd->vdev_initialize_inflight > 0) { + cv_wait(&vd->vdev_initialize_io_cv, + &vd->vdev_initialize_io_lock); + } + mutex_exit(&vd->vdev_initialize_io_lock); + + range_tree_destroy(vd->vdev_initialize_tree); + vdev_initialize_block_free(deadbeef); + vd->vdev_initialize_tree = NULL; + + mutex_enter(&vd->vdev_initialize_lock); + if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) { + vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE); + } + ASSERT(vd->vdev_initialize_thread != NULL || + vd->vdev_initialize_inflight == 0); + + /* + * Drop the vdev_initialize_lock while we sync out the + * txg since it's possible that a device might be trying to + * come online and must check to see if it needs to restart an + * initialization. That thread will be holding the spa_config_lock + * which would prevent the txg_wait_synced from completing. + */ + mutex_exit(&vd->vdev_initialize_lock); + txg_wait_synced(spa_get_dsl(spa), 0); + mutex_enter(&vd->vdev_initialize_lock); + + vd->vdev_initialize_thread = NULL; + cv_broadcast(&vd->vdev_initialize_cv); + mutex_exit(&vd->vdev_initialize_lock); +} + +/* + * Initiates a device. Caller must hold vdev_initialize_lock. + * Device must be a leaf and not already be initializing. + */ +void +vdev_initialize(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_initialize_exit_wanted); + ASSERT(!vd->vdev_top->vdev_removing); + + vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); + vd->vdev_initialize_thread = thread_create(NULL, 0, + vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri); +} + +/* + * Stop initializng a device, with the resultant initialing state being + * tgt_state. Blocks until the initializing thread has exited. + * Caller must hold vdev_initialize_lock and must not be writing to the spa + * config, as the initializing thread may try to enter the config as a reader + * before exiting. + */ +void +vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state) +{ + ASSERTV(spa_t *spa = vd->vdev_spa); + ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER)); + + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + + /* + * Allow cancel requests to proceed even if the initialize thread + * has stopped. + */ + if (vd->vdev_initialize_thread == NULL && + tgt_state != VDEV_INITIALIZE_CANCELED) { + return; + } + + vdev_initialize_change_state(vd, tgt_state); + vd->vdev_initialize_exit_wanted = B_TRUE; + while (vd->vdev_initialize_thread != NULL) + cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock); + + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + vd->vdev_initialize_exit_wanted = B_FALSE; +} + +static void +vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state) +{ + if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { + mutex_enter(&vd->vdev_initialize_lock); + vdev_initialize_stop(vd, tgt_state); + mutex_exit(&vd->vdev_initialize_lock); + return; + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state); + } +} + +/* + * Convenience function to stop initializing of a vdev tree and set all + * initialize thread pointers to NULL. + */ +void +vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) +{ + vdev_initialize_stop_all_impl(vd, tgt_state); + + if (vd->vdev_spa->spa_sync_on) { + /* Make sure that our state has been synced to disk */ + txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); + } +} + +void +vdev_initialize_restart(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + + if (vd->vdev_leaf_zap != 0) { + mutex_enter(&vd->vdev_initialize_lock); + uint64_t initialize_state = VDEV_INITIALIZE_NONE; + int err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE, + sizeof (initialize_state), 1, &initialize_state); + ASSERT(err == 0 || err == ENOENT); + vd->vdev_initialize_state = initialize_state; + + uint64_t timestamp = 0; + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, + sizeof (timestamp), 1, ×tamp); + ASSERT(err == 0 || err == ENOENT); + vd->vdev_initialize_action_time = (time_t)timestamp; + + if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vd->vdev_offline) { + /* load progress for reporting, but don't resume */ + VERIFY0(vdev_initialize_load(vd)); + } else if (vd->vdev_initialize_state == + VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) { + vdev_initialize(vd); + } + + mutex_exit(&vd->vdev_initialize_lock); + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_initialize_restart(vd->vdev_child[i]); + } +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(vdev_initialize_restart); +EXPORT_SYMBOL(vdev_xlate); +EXPORT_SYMBOL(vdev_initialize_stop_all); +EXPORT_SYMBOL(vdev_initialize); +EXPORT_SYMBOL(vdev_initialize_stop); + +/* CSTYLED */ +module_param(zfs_initialize_value, ulong, 0644); +MODULE_PARM_DESC(zfs_initialize_value, + "Value written during zpool initialize"); +#endif diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 65357d841..b45c05db2 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -740,6 +740,7 @@ vdev_ops_t vdev_mirror_ops = { NULL, NULL, NULL, + vdev_default_xlate, VDEV_TYPE_MIRROR, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -755,6 +756,7 @@ vdev_ops_t vdev_replacing_ops = { NULL, NULL, NULL, + vdev_default_xlate, VDEV_TYPE_REPLACING, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -770,6 +772,7 @@ vdev_ops_t vdev_spare_ops = { NULL, NULL, NULL, + vdev_default_xlate, VDEV_TYPE_SPARE, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c index b1c039f16..d85993bff 100644 --- a/module/zfs/vdev_missing.c +++ b/module/zfs/vdev_missing.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ /* @@ -90,6 +90,7 @@ vdev_ops_t vdev_missing_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_MISSING, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -105,6 +106,7 @@ vdev_ops_t vdev_hole_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_HOLE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 89cdf7d81..939699cb8 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -154,6 +154,8 @@ uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 2; uint32_t zfs_vdev_removal_min_active = 1; uint32_t zfs_vdev_removal_max_active = 2; +uint32_t zfs_vdev_initializing_min_active = 1; +uint32_t zfs_vdev_initializing_max_active = 1; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent @@ -261,6 +263,8 @@ vdev_queue_class_min_active(zio_priority_t p) return (zfs_vdev_scrub_min_active); case ZIO_PRIORITY_REMOVAL: return (zfs_vdev_removal_min_active); + case ZIO_PRIORITY_INITIALIZING: + return (zfs_vdev_initializing_min_active); default: panic("invalid priority %u", p); return (0); @@ -331,6 +335,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) return (zfs_vdev_scrub_max_active); case ZIO_PRIORITY_REMOVAL: return (zfs_vdev_removal_max_active); + case ZIO_PRIORITY_INITIALIZING: + return (zfs_vdev_initializing_max_active); default: panic("invalid priority %u", p); return (0); @@ -718,8 +724,8 @@ again: } /* - * For LBA-ordered queues (async / scrub), issue the i/o which follows - * the most recently issued i/o in LBA (offset) order. + * For LBA-ordered queues (async / scrub / initializing), issue the + * i/o which follows the most recently issued i/o in LBA (offset) order. * * For FIFO queues (sync), issue the i/o with the lowest timestamp. */ @@ -775,13 +781,15 @@ vdev_queue_io(zio_t *zio) if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB && - zio->io_priority != ZIO_PRIORITY_REMOVAL) + zio->io_priority != ZIO_PRIORITY_REMOVAL && + zio->io_priority != ZIO_PRIORITY_INITIALIZING) zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } else { ASSERT(zio->io_type == ZIO_TYPE_WRITE); if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && - zio->io_priority != ZIO_PRIORITY_REMOVAL) + zio->io_priority != ZIO_PRIORITY_REMOVAL && + zio->io_priority != ZIO_PRIORITY_INITIALIZING) zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } @@ -938,11 +946,29 @@ module_param(zfs_vdev_async_write_min_active, int, 0644); MODULE_PARM_DESC(zfs_vdev_async_write_min_active, "Min active async write I/Os per vdev"); +module_param(zfs_vdev_initializing_max_active, int, 0644); +MODULE_PARM_DESC(zfs_vdev_initializing_max_active, + "Max active initializing I/Os per vdev"); + +module_param(zfs_vdev_initializing_min_active, int, 0644); +MODULE_PARM_DESC(zfs_vdev_initializing_min_active, + "Min active initializing I/Os per vdev"); + +module_param(zfs_vdev_removal_max_active, int, 0644); +MODULE_PARM_DESC(zfs_vdev_removal_max_active, + "Max active removal I/Os per vdev"); + +module_param(zfs_vdev_removal_min_active, int, 0644); +MODULE_PARM_DESC(zfs_vdev_removal_min_active, + "Min active removal I/Os per vdev"); + module_param(zfs_vdev_scrub_max_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_scrub_max_active, "Max active scrub I/Os per vdev"); +MODULE_PARM_DESC(zfs_vdev_scrub_max_active, + "Max active scrub I/Os per vdev"); module_param(zfs_vdev_scrub_min_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_scrub_min_active, "Min active scrub I/Os per vdev"); +MODULE_PARM_DESC(zfs_vdev_scrub_min_active, + "Min active scrub I/Os per vdev"); module_param(zfs_vdev_sync_read_max_active, int, 0644); MODULE_PARM_DESC(zfs_vdev_sync_read_max_active, diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index a21baf9c2..d10d89f3e 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -36,6 +36,10 @@ #include <sys/vdev_raidz.h> #include <sys/vdev_raidz_impl.h> +#ifdef ZFS_DEBUG +#include <sys/vdev_initialize.h> /* vdev_xlate testing */ +#endif + /* * Virtual device vector for RAID-Z. * @@ -1627,6 +1631,39 @@ vdev_raidz_child_done(zio_t *zio) rc->rc_skipped = 0; } +static void +vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) +{ +#ifdef ZFS_DEBUG + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = zio->io_offset; + logical_rs.rs_end = logical_rs.rs_start + + vdev_raidz_asize(zio->io_vd, zio->io_size); + + raidz_col_t *rc = &rm->rm_col[col]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + vdev_xlate(cvd, &logical_rs, &physical_rs); + ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); + ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); + /* + * It would be nice to assert that rs_end is equal + * to rc_offset + rc_size but there might be an + * optional I/O at the end that is not accounted in + * rc_size. + */ + if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + + rc->rc_size + (1 << tvd->vdev_ashift)); + } else { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); + } +#endif +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1665,6 +1702,12 @@ vdev_raidz_io_start(zio_t *zio) for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; + + /* + * Verify physical to logical translation. + */ + vdev_raidz_io_verify(zio, rm, c); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, @@ -2323,6 +2366,37 @@ vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) return (B_FALSE); } +static void +vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) +{ + vdev_t *raidvd = cvd->vdev_parent; + ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); + + uint64_t width = raidvd->vdev_children; + uint64_t tgt_col = cvd->vdev_id; + uint64_t ashift = raidvd->vdev_top->vdev_ashift; + + /* make sure the offsets are block-aligned */ + ASSERT0(in->rs_start % (1 << ashift)); + ASSERT0(in->rs_end % (1 << ashift)); + uint64_t b_start = in->rs_start >> ashift; + uint64_t b_end = in->rs_end >> ashift; + + uint64_t start_row = 0; + if (b_start > tgt_col) /* avoid underflow */ + start_row = ((b_start - tgt_col - 1) / width) + 1; + + uint64_t end_row = 0; + if (b_end > tgt_col) + end_row = ((b_end - tgt_col - 1) / width) + 1; + + res->rs_start = start_row << ashift; + res->rs_end = end_row << ashift; + + ASSERT3U(res->rs_start, <=, in->rs_start); + ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); +} + vdev_ops_t vdev_raidz_ops = { vdev_raidz_open, vdev_raidz_close, @@ -2334,6 +2408,7 @@ vdev_ops_t vdev_raidz_ops = { NULL, NULL, NULL, + vdev_raidz_xlate, VDEV_TYPE_RAIDZ, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index a706bc2a4..d0824aa84 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -44,6 +44,7 @@ #include <sys/vdev_indirect_births.h> #include <sys/vdev_indirect_mapping.h> #include <sys/abd.h> +#include <sys/vdev_initialize.h> #include <sys/trace_vdev.h> /* @@ -1186,6 +1187,7 @@ vdev_remove_complete(spa_t *spa) txg_wait_synced(spa->spa_dsl_pool, 0); txg = spa_vdev_enter(spa); vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); @@ -1896,6 +1898,9 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); + /* Stop initializing */ + (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); + *txg = spa_vdev_config_enter(spa); sysevent_t *ev = spa_event_create(spa, vd, NULL, @@ -2072,6 +2077,13 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) */ error = spa_reset_logs(spa); + /* + * We stop any initializing that is currently in progress but leave + * the state as "active". This will allow the initializing to resume + * if the removal is canceled sometime later. + */ + vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); + *txg = spa_vdev_config_enter(spa); /* @@ -2083,6 +2095,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) if (error != 0) { metaslab_group_activate(mg); + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); return (error); } diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index 9f86cbfa4..e40b7ce8e 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -150,6 +150,7 @@ vdev_ops_t vdev_root_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_ROOT, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index a71da2837..3c36502d8 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -202,6 +202,8 @@ #include <sys/zio_checksum.h> #include <sys/vdev_removal.h> #include <sys/zfs_sysfs.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_initialize.h> #include <linux/miscdevice.h> #include <linux/slab.h> @@ -3843,6 +3845,85 @@ zfs_ioc_destroy(zfs_cmd_t *zc) } /* + * innvl: { + * vdevs: { + * guid 1, guid 2, ... + * }, + * func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND} + * } + * + * outnvl: { + * [func: EINVAL (if provided command type didn't make sense)], + * [vdevs: { + * guid1: errno, (see function body for possible errnos) + * ... + * }] + * } + * + */ +static const zfs_ioc_key_t zfs_keys_pool_initialize[] = { + {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0}, + {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0} +}; + +static int +zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + + error = spa_open(poolname, &spa, FTAG); + if (error != 0) + return (error); + + uint64_t cmd_type; + if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, + &cmd_type) != 0) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + if (!(cmd_type == POOL_INITIALIZE_CANCEL || + cmd_type == POOL_INITIALIZE_DO || + cmd_type == POOL_INITIALIZE_SUSPEND)) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + nvlist_t *vdev_guids; + if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, + &vdev_guids) != 0) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + nvlist_t *vdev_errlist = fnvlist_alloc(); + int total_errors = 0; + + for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); + pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { + uint64_t vdev_guid = fnvpair_value_uint64(pair); + + error = spa_vdev_initialize(spa, vdev_guid, cmd_type); + if (error != 0) { + char guid_as_str[MAXNAMELEN]; + + (void) snprintf(guid_as_str, sizeof (guid_as_str), + "%llu", (unsigned long long)vdev_guid); + fnvlist_add_int64(vdev_errlist, guid_as_str, error); + total_errors++; + } + } + if (fnvlist_size(vdev_errlist) > 0) { + fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, + vdev_errlist); + } + fnvlist_free(vdev_errlist); + + spa_close(spa, FTAG); + return (total_errors > 0 ? EINVAL : 0); +} + +/* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot @@ -6453,6 +6534,11 @@ zfs_ioctl_init(void) zfs_keys_pool_discard_checkpoint, ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); + zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, + zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/module/zfs/zfs_sysfs.c b/module/zfs/zfs_sysfs.c index b17c91f65..87c4ac117 100644 --- a/module/zfs/zfs_sysfs.c +++ b/module/zfs/zfs_sysfs.c @@ -358,6 +358,7 @@ pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf) */ static const char *zfs_features[] = { /* --> Add new kernel features here (post ZoL 0.8.0) */ + "vdev_initialize" }; #define ZFS_FEATURE_COUNT ARRAY_SIZE(zfs_features) diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 4463bfa1e..38f040126 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -397,6 +397,21 @@ pre = post = tags = ['functional', 'cli_root', 'zpool_labelclear'] +[tests/functional/cli_root/zpool_initialize] +tests = ['zpool_initialize_attach_detach_add_remove', + 'zpool_initialize_import_export', + 'zpool_initialize_offline_export_import_online', + 'zpool_initialize_online_offline', + 'zpool_initialize_split', + 'zpool_initialize_start_and_cancel_neg', + 'zpool_initialize_start_and_cancel_pos', + 'zpool_initialize_suspend_resume', + 'zpool_initialize_unsupported_vdevs', + 'zpool_initialize_verify_checksums', + 'zpool_initialize_verify_initialized'] +pre = +tags = ['functional', 'cli_root', 'zpool_initialize'] + [tests/functional/cli_root/zpool_offline] tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg', 'zpool_offline_003_pos'] diff --git a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c index 88a18d8f0..61ac2fecc 100644 --- a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c +++ b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c @@ -642,6 +642,22 @@ test_unload_key(const char *dataset) IOC_INPUT_TEST(ZFS_IOC_UNLOAD_KEY, dataset, NULL, NULL, EACCES); } +static void +test_vdev_initialize(const char *pool) +{ + nvlist_t *required = fnvlist_alloc(); + nvlist_t *vdev_guids = fnvlist_alloc(); + + fnvlist_add_uint64(vdev_guids, "path", 0xdeadbeefdeadbeef); + fnvlist_add_uint64(required, ZPOOL_INITIALIZE_COMMAND, + POOL_INITIALIZE_DO); + fnvlist_add_nvlist(required, ZPOOL_INITIALIZE_VDEVS, vdev_guids); + + IOC_INPUT_TEST(ZFS_IOC_POOL_INITIALIZE, pool, required, NULL, EINVAL); + nvlist_free(vdev_guids); + nvlist_free(required); +} + static int zfs_destroy(const char *dataset) { @@ -732,6 +748,8 @@ zfs_ioc_input_tests(const char *pool) test_change_key(dataset); test_unload_key(dataset); + test_vdev_initialize(pool); + /* * cleanup */ @@ -869,6 +887,7 @@ validate_ioc_values(void) ZFS_IOC_BASE + 76 == ZFS_IOC_REMAP && ZFS_IOC_BASE + 77 == ZFS_IOC_POOL_CHECKPOINT && ZFS_IOC_BASE + 78 == ZFS_IOC_POOL_DISCARD_CHECKPOINT && + ZFS_IOC_BASE + 79 == ZFS_IOC_POOL_INITIALIZE && LINUX_IOC_BASE + 1 == ZFS_IOC_EVENTS_NEXT && LINUX_IOC_BASE + 2 == ZFS_IOC_EVENTS_CLEAR && LINUX_IOC_BASE + 3 == ZFS_IOC_EVENTS_SEEK); diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 8ced03e93..78ba2488d 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -82,6 +82,7 @@ export SYSTEM_FILES='arp mv net nproc + od openssl parted pax diff --git a/tests/zfs-tests/tests/functional/cli_root/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/Makefile.am index 13ff889d8..625cf8579 100644 --- a/tests/zfs-tests/tests/functional/cli_root/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/Makefile.am @@ -46,6 +46,7 @@ SUBDIRS = \ zpool_get \ zpool_history \ zpool_import \ + zpool_initialize \ zpool_labelclear \ zpool_offline \ zpool_online \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh index 0cd9b5959..79ceaabd0 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh @@ -150,7 +150,7 @@ function do_testing #<clear type> <vdevs> # # Make errors to the testing pool by overwrite the vdev device with - # /usr/bin/dd command. We do not want to have a full overwrite. That + # dd command. We do not want to have a full overwrite. That # may cause the system panic. So, we should skip the vdev label space. # (( i = $RANDOM % 3 )) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile.am new file mode 100644 index 000000000..a0a0e0b5c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile.am @@ -0,0 +1,18 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_initialize +dist_pkgdata_SCRIPTS = \ + cleanup.ksh \ + zpool_initialize_attach_detach_add_remove.ksh \ + zpool_initialize_import_export.ksh \ + zpool_initialize.kshlib \ + zpool_initialize_offline_export_import_online.ksh \ + zpool_initialize_online_offline.ksh \ + zpool_initialize_split.ksh \ + zpool_initialize_start_and_cancel_neg.ksh \ + zpool_initialize_start_and_cancel_pos.ksh \ + zpool_initialize_suspend_resume.ksh \ + zpool_initialize_unsupported_vdevs.ksh \ + zpool_initialize_verify_checksums.ksh \ + zpool_initialize_verify_initialized.ksh + +dist_pkgdata_DATA = \ + zpool_initialize.kshlib diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh new file mode 100755 index 000000000..3c7dbd317 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh @@ -0,0 +1,31 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib new file mode 100644 index 000000000..0f4e7f0fa --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib @@ -0,0 +1,43 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +function initialize_prog_line # pool disk +{ + typeset pool="$1" + typeset disk="$2" + zpool status "$pool" | grep "$disk" | grep "initialized" +} + +function initialize_progress # pool disk +{ + initialize_prog_line "$1" "$2" | \ + sed 's/.*(\([0-9]\{1,\}\)% initialized.*/\1/g' +} + +function cleanup +{ + if poolexists $TESTPOOL; then + log_must zpool destroy -f $TESTPOOL + fi + + if poolexists $TESTPOOL1; then + log_must zpool destroy -f $TESTPOOL1 + fi +} +log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh new file mode 100755 index 000000000..2a695025d --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh @@ -0,0 +1,68 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Detaching/attaching, adding/removing data devices works with initializing. +# +# STRATEGY: +# 1. Create a single-disk pool. +# 2. Start initializing. +# 3. Attach a second disk, ensure initializing continues. +# 4. Detach the second disk, ensure initializing continues. +# 5. Add a second disk, ensure initializing continues. +# 6. Remove the first disk, ensure initializing stops. +# + +DISK1="$(echo $DISKS | cut -d' ' -f1)" +DISK2="$(echo $DISKS | cut -d' ' -f2)" + +log_must zpool create -f $TESTPOOL $DISK1 + +log_must zpool initialize $TESTPOOL $DISK1 +progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$progress" ]] && log_fail "Initializing did not start" + +log_must zpool attach $TESTPOOL $DISK1 $DISK2 +new_progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ "$progress" -le "$new_progress" ]] || \ + log_fail "Lost initializing progress on demotion to child vdev" +progress="$new_progress" + +log_must zpool detach $TESTPOOL $DISK2 +new_progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ "$progress" -le "$new_progress" ]] || \ + log_fail "Lost initializing progress on promotion to top vdev" +progress="$new_progress" + +log_must zpool add $TESTPOOL $DISK2 +log_must zpool remove $TESTPOOL $DISK1 +[[ -z "$(initialize_prog_line $TESTPOOL $DISK1)" ]] || \ + log_fail "Initializing continued after initiating removal" + +log_pass "Initializing worked as expected across attach/detach and add/remove" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh new file mode 100755 index 000000000..386d2a5dc --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Initializing automatically resumes across import/export. +# +# STRATEGY: +# 1. Create a one-disk pool. +# 2. Start initializing and verify that initializing is active. +# 3. Export the pool. +# 4. Import the pool. +# 5. Verify that initializing resumes and progress does not regress. +# 6. Suspend initializing. +# 7. Repeat steps 3-4. +# 8. Verify that progress does not regress but initializing is still suspended. +# + +DISK1=${DISKS%% *} + +log_must zpool create -f $TESTPOOL $DISK1 +log_must zpool initialize $TESTPOOL + +sleep 2 + +progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$progress" ]] && log_fail "Initializing did not start" + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +new_progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$new_progress" ]] && log_fail "Initializing did not restart after import" +[[ "$progress" -le "$new_progress" ]] || \ + log_fail "Initializing lost progress after import" +log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + +log_must zpool initialize -s $TESTPOOL $DISK1 +action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g')" +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g') +[[ "$action_date" != "$new_action_date" ]] && \ + log_fail "Initializing action date did not persist across export/import" + +[[ "$new_progress" -le "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ + log_fail "Initializing lost progress after import" + +log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + +log_pass "Initializing retains state as expected across export/import" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh new file mode 100755 index 000000000..dedd466e4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh @@ -0,0 +1,66 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Miscellaneous complex sequences of operations function as expected. +# +# STRATEGY: +# 1. Create a pool with a two-way mirror. +# 2. Start initializing, offline, export, import, online and verify that +# initializing state is preserved / initializing behaves as expected +# at each step. +# + +DISK1="$(echo $DISKS | cut -d' ' -f1)" +DISK2="$(echo $DISKS | cut -d' ' -f2)" + +log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 + +log_must zpool initialize $TESTPOOL $DISK1 +log_must zpool offline $TESTPOOL $DISK1 +progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$progress" ]] && log_fail "Initializing did not start" +log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +new_progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$new_progress" ]] && log_fail "Initializing did not start after import" +[[ "$new_progress" -ge "$progress" ]] || \ + log_fail "Initializing lost progress after import" +log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + +log_must zpool online $TESTPOOL $DISK1 +new_progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ "$new_progress" -ge "$progress" ]] || \ + log_fail "Initializing lost progress after online" + +log_pass "Initializing behaves as expected at each step of:" \ + "initialize + offline + export + import + online" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh new file mode 100755 index 000000000..55bd3188c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh @@ -0,0 +1,74 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Initializing automatically resumes across offline/online. +# +# STRATEGY: +# 1. Create a pool with a two-way mirror. +# 2. Start initializing one of the disks and verify that initializing is active. +# 3. Offline the disk. +# 4. Online the disk. +# 5. Verify that initializing resumes and progress does not regress. +# 6. Suspend initializing. +# 7. Repeat steps 3-4 and verify that initializing does not resume. +# + +DISK1=${DISKS%% *} +DISK2="$(echo $DISKS | cut -d' ' -f2)" + +log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 +log_must zpool initialize $TESTPOOL $DISK1 + +log_must zpool offline $TESTPOOL $DISK1 + +progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$progress" ]] && log_fail "Initializing did not start" + +log_must zpool online $TESTPOOL $DISK1 + +new_progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$new_progress" ]] && \ + log_fail "Initializing did not restart after onlining" +[[ "$progress" -le "$new_progress" ]] || \ + log_fail "Initializing lost progress after onlining" +log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + +log_must zpool initialize -s $TESTPOOL $DISK1 +action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g')" +log_must zpool offline $TESTPOOL $DISK1 +log_must zpool online $TESTPOOL $DISK1 +new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g') +[[ "$action_date" != "$new_action_date" ]] && \ + log_fail "Initializing action date did not persist across offline/online" +log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + +log_pass "Initializing performs as expected across offline/online" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh new file mode 100755 index 000000000..69b27c26c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh @@ -0,0 +1,64 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Initializing state is preserved across zpool split. +# +# STRATEGY: +# 1. Create a pool with a two-way mirror. +# 2. Start initializing both devices. +# 3. Split the pool. Ensure initializing continues on the original. +# 4. Import the new pool. Ensure initializing resumes on it. +# + +DISK1="$(echo $DISKS | cut -d' ' -f1)" +DISK2="$(echo $DISKS | cut -d' ' -f2)" +POOL2="${TESTPOOL}_split" + +log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 + +log_must zpool initialize $TESTPOOL $DISK1 $DISK2 +orig_prog1="$(initialize_progress $TESTPOOL $DISK1)" +orig_prog2="$(initialize_progress $TESTPOOL $DISK2)" +[[ -z "$orig_prog1" ]] && log_fail "Initializing did not start" + +log_must zpool split $TESTPOOL $TESTPOOL1 $DISK2 + +# Ensure initializing continued as expected on the original pool. +[[ "$(initialize_progress $TESTPOOL $DISK1)" -ge "$orig_prog1" ]] || \ + log_fail "Initializing lost progress on original pool" +log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + +log_must zpool import $TESTPOOL1 + +[[ "$(initialize_progress $TESTPOOL1 $DISK2)" -ge "$orig_prog2" ]] || \ + log_fail "Initializing lost progress on split pool" +log_mustnot eval "initialize_prog_line $TESTPOOL1 $DISK1 | grep suspended" + +log_pass "Initializing behaves as expected on zpool split" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh new file mode 100755 index 000000000..59b266d32 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh @@ -0,0 +1,60 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Cancelling and suspending initialize doesn't work if not all specified vdevs +# are being initialized. +# +# STRATEGY: +# 1. Create a three-disk pool. +# 2. Start initializing and verify that initializing is active. +# 3. Try to cancel and suspend initializing on the non-initializing disks. +# 4. Try to re-initialize the currently initializing disk. +# + +DISK1=${DISKS%% *} +DISK2="$(echo $DISKS | cut -d' ' -f2)" +DISK3="$(echo $DISKS | cut -d' ' -f3)" + +log_must zpool list -v +log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3 +log_must zpool initialize $TESTPOOL $DISK1 + +[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "Initialize did not start" + +log_mustnot zpool initialize -c $TESTPOOL $DISK2 +log_mustnot zpool initialize -c $TESTPOOL $DISK2 $DISK3 + +log_mustnot zpool initialize -s $TESTPOOL $DISK2 +log_mustnot zpool initialize -s $TESTPOOL $DISK2 $DISK3 + +log_mustnot zpool initialize $TESTPOOL $DISK1 + +log_pass "Nonsensical initialize operations fail" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh new file mode 100755 index 000000000..5003b5f10 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh @@ -0,0 +1,52 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Starting and stopping an initialize works. +# +# STRATEGY: +# 1. Create a one-disk pool. +# 2. Start initializing and verify that initializing is active. +# 3. Cancel initializing and verify that initializing is not active. +# + +DISK1=${DISKS%% *} + +log_must zpool create -f $TESTPOOL $DISK1 +log_must zpool initialize $TESTPOOL + +[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "Initialize did not start" + +log_must zpool initialize -c $TESTPOOL + +[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ + log_fail "Initialize did not stop" + +log_pass "Initialize start + cancel works" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh new file mode 100755 index 000000000..bce3da526 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh @@ -0,0 +1,63 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Suspending and resuming initializing works. +# +# STRATEGY: +# 1. Create a one-disk pool. +# 2. Start initializing and verify that initializing is active. +# 3. Wait 3 seconds, then suspend initializing and verify that the progress +# reporting says so. +# 4. Wait 5 seconds and ensure initializing progress doesn't advance. +# 5. Restart initializing and verify that the progress doesn't regress. +# + +DISK1=${DISKS%% *} + +log_must zpool create -f $TESTPOOL $DISK1 +log_must zpool initialize $TESTPOOL + +[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "Initializing did not start" + +sleep 5 +log_must zpool initialize -s $TESTPOOL +log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" +progress="$(initialize_progress $TESTPOOL $DISK1)" + +sleep 3 +[[ "$progress" -eq "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ + log_fail "Initializing progress advanced while suspended" + +log_must zpool initialize $TESTPOOL $DISK1 +[[ "$progress" -le "$(initialize_progress $TESTPOOL $DISK1)" ]] || + log_fail "Initializing progress regressed after resuming" + +log_pass "Suspend + resume initializing works as expected" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh new file mode 100755 index 000000000..bd4ca069c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh @@ -0,0 +1,74 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Attempting to initialize unsupported vdevs should fail. +# +# STRATEGY: +# 1. Create a pool with the following configuration: +# root +# mirror +# vdev0 +# vdev1 (offline) +# cache +# vdev2 +# spare +# vdev3 +# 2. Try to initialize vdev1, vdev2, and vdev3. Ensure that all 3 fail. +# +function cleanup +{ + if datasetexists $TESTPOOL; then + log_must zpool destroy -f $TESTPOOL + fi + if [[ -d $TESTDIR ]]; then + log_must rm -rf $TESTDIR + fi +} +log_onexit cleanup + +log_must mkdir $TESTDIR +set -A FDISKS +for n in {0..2}; do + log_must mkfile $MINVDEVSIZE $TESTDIR/vdev$n + FDISKS+=("$TESTDIR/vdev$n") +done +FDISKS+=("${DISKS%% *}") + +log_must zpool create $TESTPOOL mirror ${FDISKS[0]} ${FDISKS[1]} \ + spare ${FDISKS[2]} cache ${FDISKS[3]} + +log_must zpool offline $TESTPOOL ${FDISKS[1]} + +log_mustnot zpool initialize $TESTPOOL mirror-0 +for n in {1..3}; do + log_mustnot zpool initialize $TESTPOOL ${FDISKS[$n]} +done + +log_pass "Attempting to initialize failed on unsupported devices" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh new file mode 100755 index 000000000..9be752ff8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh @@ -0,0 +1,59 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Initializing does not cause file corruption. +# +# STRATEGY: +# 1. Create a one-disk pool. +# 2. Write data to the pool. +# 3. Start initializing and verify that initializing is active. +# 4. Write more data to the pool. +# 5. Run zdb to validate checksums. +# + +DISK1=${DISKS%% *} + +log_must zpool create -f $TESTPOOL $DISK1 +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1M count=30 +log_must sync + +log_must zpool initialize $TESTPOOL + +log_must zdb -cc $TESTPOOL + +[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "Initializing did not start" + +log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=1M count=30 +log_must sync + +log_must zdb -cc $TESTPOOL + +log_pass "Initializing does not corrupt existing or new data" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh new file mode 100755 index 000000000..0fa6a0be9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh @@ -0,0 +1,89 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# After initializing, the disk is actually initialized. +# +# STRATEGY: +# 1. Create a one-disk pool. +# 2. Initialize the disk to completion. +# 3. Load all metaslabs that don't have a spacemap, and make sure the entire +# metaslab has been filled with the initializing pattern (deadbeef). +# + +function cleanup +{ + set_tunable64 zfs_initialize_value $ORIG_PATTERN + zpool import -d $TESTDIR $TESTPOOL + + if datasetexists $TESTPOOL ; then + zpool destroy -f $TESTPOOL + fi + if [[ -d "$TESTDIR" ]]; then + rm -rf "$TESTDIR" + fi +} +log_onexit cleanup + +PATTERN="deadbeefdeadbeef" +SMALLFILE="$TESTDIR/smallfile" + +ORIG_PATTERN=$(get_tunable zfs_initialize_value) +log_must set_tunable64 zfs_initialize_value $(printf %llu 0x$PATTERN) + +log_must mkdir "$TESTDIR" +log_must mkfile $MINVDEVSIZE "$SMALLFILE" +log_must zpool create $TESTPOOL "$SMALLFILE" +log_must zpool initialize $TESTPOOL + +while [[ "$(initialize_progress $TESTPOOL $SMALLFILE)" -lt "100" ]]; do + sleep 0.5 +done + +log_must zpool export $TESTPOOL + +spacemaps=0 +bs=512 +while read -r sm; do + typeset offset="$(echo $sm | cut -d ' ' -f1)" + typeset size="$(echo $sm | cut -d ' ' -f2)" + + spacemaps=$((spacemaps + 1)) + offset=$(((4 * 1024 * 1024) + 16#$offset)) + out=$(dd if=$SMALLFILE skip=$(($offset / $bs)) \ + count=$(($size / $bs)) bs=$bs 2>/dev/null | od -t x8 -Ad) + echo "$out" | log_must egrep "$PATTERN|\*|$size" +done <<< "$(zdb -p $TESTDIR -Pme $TESTPOOL | egrep 'spacemap[ ]+0 ' | \ + awk '{print $4, $8}')" + +if [[ $spacemaps -eq 0 ]];then + log_fail "Did not find any empty space maps to check" +else + log_pass "Initializing wrote appropriate amount to disk" +fi |