aboutsummaryrefslogtreecommitdiffstats
path: root/include/sys
diff options
context:
space:
mode:
authorGeorge Wilson <[email protected]>2018-12-19 07:54:59 -0700
committerBrian Behlendorf <[email protected]>2019-01-07 10:37:26 -0800
commit619f09769393d4e0cbaa5f662362138e1c699159 (patch)
tree6a94855931e5bf9e353270c72fba43316a44baa6 /include/sys
parentc87db591967507de027d6bb0c683ffd09dd70105 (diff)
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM ======== The first access to a block incurs a performance penalty on some platforms (e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are "thick provisioned", where supported by the platform (VMware). This can create a large delay in getting new virtual machines up and running (or adding storage to an existing Engine). If the thick provision step is omitted, write performance will be suboptimal until all blocks on the LUN have been written. SOLUTION ========= This feature introduces a way to 'initialize' the disks at install or in the background to make sure we don't incur this first-access penalty. When an entire LUN is added to ZFS, we make all space available immediately, and allow ZFS to find unallocated space and zero it out. This works with concurrent writes to arbitrary offsets, ensuring that we don't zero out something that has been (or is in the middle of being) written. This scheme can also be applied to existing pools (affecting only free regions on the vdev). Detailed design: - new subcommand: zpool initialize [-cs] <pool> [<vdev> ...] - start, suspend, or cancel initialization - Creates new open-context thread for each vdev - Thread iterates through all metaslabs in this vdev - Each metaslab: - select a metaslab - load the metaslab - mark the metaslab as being zeroed - walk all free ranges within that metaslab and translate them to ranges on the leaf vdev - issue a "zeroing" I/O on the leaf vdev that corresponds to a free range on the metaslab we're working on - continue until all free ranges for this metaslab have been "zeroed" - reset/unmark the metaslab being zeroed - if more metaslabs exist, then repeat above tasks. - if no more metaslabs, then we're done. - progress for the initialization is stored on-disk in the vdev’s leaf zap object. The following information is stored: - the last offset that has been initialized - the state of the initialization process (i.e. 
active, suspended, or canceled) - the start time for the initialization - progress is reported via the zpool status command and shows information for each of the vdevs that are initializing Porting notes: - Added zfs_initialize_value module parameter to set the pattern written by "zpool initialize". - Added zfs_vdev_{initializing,removal}_{min,max}_active module options. Authored by: George Wilson <[email protected]> Reviewed by: John Wren Kennedy <[email protected]> Reviewed by: Matthew Ahrens <[email protected]> Reviewed by: Pavel Zakharov <[email protected]> Reviewed by: Prakash Surya <[email protected]> Reviewed by: loli10K <[email protected]> Reviewed by: Brian Behlendorf <[email protected]> Approved by: Richard Lowe <[email protected]> Signed-off-by: Tim Chase <[email protected]> Ported-by: Tim Chase <[email protected]> OpenZFS-issue: https://www.illumos.org/issues/9102 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb Closes #8230
Diffstat (limited to 'include/sys')
-rw-r--r--include/sys/Makefile.am1
-rw-r--r--include/sys/fs/zfs.h38
-rw-r--r--include/sys/metaslab_impl.h10
-rw-r--r--include/sys/spa.h2
-rw-r--r--include/sys/vdev_impl.h31
-rw-r--r--include/sys/vdev_initialize.h46
-rw-r--r--include/sys/zio_priority.h3
7 files changed, 128 insertions, 3 deletions
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index 8bf376998..e6c82d113 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -96,6 +96,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/vdev_impl.h \
$(top_srcdir)/include/sys/vdev_indirect_births.h \
$(top_srcdir)/include/sys/vdev_indirect_mapping.h \
+ $(top_srcdir)/include/sys/vdev_initialize.h \
$(top_srcdir)/include/sys/vdev_raidz.h \
$(top_srcdir)/include/sys/vdev_raidz_impl.h \
$(top_srcdir)/include/sys/vdev_removal.h \
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 85512618c..945853739 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -775,6 +775,13 @@ typedef struct zpool_load_policy {
#define VDEV_ALLOC_BIAS_SPECIAL "special"
#define VDEV_ALLOC_BIAS_DEDUP "dedup"
+#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \
+ "com.delphix:next_offset_to_initialize"
+#define VDEV_LEAF_ZAP_INITIALIZE_STATE \
+ "com.delphix:vdev_initialize_state"
+#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \
+ "com.delphix:vdev_initialize_action_time"
+
/*
* This is needed in userland to report the minimum necessary device size.
*/
@@ -988,10 +995,15 @@ typedef struct vdev_stat {
uint64_t vs_read_errors; /* read errors */
uint64_t vs_write_errors; /* write errors */
uint64_t vs_checksum_errors; /* checksum errors */
+ uint64_t vs_initialize_errors; /* initializing errors */
uint64_t vs_self_healed; /* self-healed bytes */
uint64_t vs_scan_removing; /* removing? */
uint64_t vs_scan_processed; /* scan processed bytes */
uint64_t vs_fragmentation; /* device fragmentation */
+ uint64_t vs_initialize_bytes_done; /* bytes initialized */
+ uint64_t vs_initialize_bytes_est; /* total bytes to initialize */
+ uint64_t vs_initialize_state; /* vdev_initializing_state_t */
+ uint64_t vs_initialize_action_time; /* time_t */
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
uint64_t vs_resilver_deferred; /* resilver deferred */
uint64_t vs_slow_ios; /* slow IOs */
@@ -1023,7 +1035,6 @@ typedef struct vdev_stat_ex {
#define VDEV_L_HISTO_BUCKETS 37 /* Latency histo buckets */
#define VDEV_RQ_HISTO_BUCKETS 25 /* Request size histo buckets */
-
/* Amount of time in ZIO queue (ns) */
uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE]
[VDEV_L_HISTO_BUCKETS];
@@ -1051,6 +1062,16 @@ typedef struct vdev_stat_ex {
} vdev_stat_ex_t;
/*
+ * Initialize functions.
+ */
+typedef enum pool_initialize_func {
+ POOL_INITIALIZE_DO,
+ POOL_INITIALIZE_CANCEL,
+ POOL_INITIALIZE_SUSPEND,
+ POOL_INITIALIZE_FUNCS
+} pool_initialize_func_t;
+
+/*
* DDT statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
*/
@@ -1094,6 +1115,14 @@ typedef struct ddt_histogram {
#define ZVOL_PROP_NAME "name"
#define ZVOL_DEFAULT_BLOCKSIZE 8192
+typedef enum {
+ VDEV_INITIALIZE_NONE,
+ VDEV_INITIALIZE_ACTIVE,
+ VDEV_INITIALIZE_CANCELED,
+ VDEV_INITIALIZE_SUSPENDED,
+ VDEV_INITIALIZE_COMPLETE
+} vdev_initializing_state_t;
+
/*
* /dev/zfs ioctl numbers.
*
@@ -1184,6 +1213,7 @@ typedef enum zfs_ioc {
ZFS_IOC_REMAP, /* 0x5a4c */
ZFS_IOC_POOL_CHECKPOINT, /* 0x5a4d */
ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */
+ ZFS_IOC_POOL_INITIALIZE, /* 0x5a4f */
/*
* Linux - 3/64 numbers reserved.
@@ -1278,6 +1308,12 @@ typedef enum {
#define ZPOOL_HIDDEN_ARGS "hidden_args"
/*
+ * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE.
+ */
+#define ZPOOL_INITIALIZE_COMMAND "initialize_command"
+#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs"
+
+/*
* Flags for ZFS_IOC_VDEV_SET_STATE
*/
#define ZFS_ONLINE_CHECKREMOVE 0x1
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index aa1c82a02..3e32eace6 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -68,7 +68,8 @@ typedef enum trace_alloc_type {
TRACE_GROUP_FAILURE = -5ULL,
TRACE_ENOSPC = -6ULL,
TRACE_CONDENSING = -7ULL,
- TRACE_VDEV_ERROR = -8ULL
+ TRACE_VDEV_ERROR = -8ULL,
+ TRACE_INITIALIZING = -9ULL
} trace_alloc_type_t;
#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
@@ -270,6 +271,11 @@ struct metaslab_group {
uint64_t mg_failed_allocations;
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+ int mg_ms_initializing;
+ boolean_t mg_initialize_updating;
+ kmutex_t mg_ms_initialize_lock;
+ kcondvar_t mg_ms_initialize_cv;
};
/*
@@ -360,6 +366,8 @@ struct metaslab {
boolean_t ms_condense_wanted;
uint64_t ms_condense_checked_txg;
+ uint64_t ms_initializing; /* leaves initializing this ms */
+
/*
* We must hold both ms_lock and ms_group->mg_lock in order to
* modify ms_loaded.
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 5dc27e334..4a66260ef 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -772,6 +772,7 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
#define SPA_ASYNC_AUTOEXPAND 0x20
#define SPA_ASYNC_REMOVE_DONE 0x40
#define SPA_ASYNC_REMOVE_STOP 0x80
+#define SPA_ASYNC_INITIALIZE_RESTART 0x100
/*
* Controls the behavior of spa_vdev_remove().
@@ -787,6 +788,7 @@ extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
int replace_done);
extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
extern boolean_t spa_vdev_remove_active(spa_t *spa);
+extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 6c13a548f..ae21e037e 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -82,6 +82,12 @@ typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd,
uint64_t offset, uint64_t size, void *arg);
typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
vdev_remap_cb_t callback, void *arg);
+/*
+ * Given a target vdev, translates the logical range "in" to the physical
+ * range "res"
+ */
+typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in,
+ range_seg_t *res);
typedef const struct vdev_ops {
vdev_open_func_t *vdev_op_open;
@@ -94,6 +100,11 @@ typedef const struct vdev_ops {
vdev_hold_func_t *vdev_op_hold;
vdev_rele_func_t *vdev_op_rele;
vdev_remap_func_t *vdev_op_remap;
+ /*
+ * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves.
+ * Used when initializing vdevs. Isn't used by leaf ops.
+ */
+ vdev_xlation_func_t *vdev_op_xlate;
char vdev_op_type[16];
boolean_t vdev_op_leaf;
} vdev_ops_t;
@@ -250,6 +261,24 @@ struct vdev {
/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
+ boolean_t vdev_initialize_exit_wanted;
+ vdev_initializing_state_t vdev_initialize_state;
+ kthread_t *vdev_initialize_thread;
+ /* Protects vdev_initialize_thread and vdev_initialize_state. */
+ kmutex_t vdev_initialize_lock;
+ kcondvar_t vdev_initialize_cv;
+ uint64_t vdev_initialize_offset[TXG_SIZE];
+ uint64_t vdev_initialize_last_offset;
+ range_tree_t *vdev_initialize_tree; /* valid while initializing */
+ uint64_t vdev_initialize_bytes_est;
+ uint64_t vdev_initialize_bytes_done;
+ time_t vdev_initialize_action_time; /* start and end time */
+
+ /* for limiting outstanding I/Os */
+ kmutex_t vdev_initialize_io_lock;
+ kcondvar_t vdev_initialize_io_cv;
+ uint64_t vdev_initialize_inflight;
+
/*
* Values stored in the config for an indirect or removing vdev.
*/
@@ -478,6 +507,8 @@ extern vdev_ops_t vdev_indirect_ops;
/*
* Common size functions
*/
+extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in,
+ range_seg_t *out);
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
extern uint64_t vdev_get_min_asize(vdev_t *vd);
extern void vdev_set_min_asize(vdev_t *vd);
diff --git a/include/sys/vdev_initialize.h b/include/sys/vdev_initialize.h
new file mode 100644
index 000000000..db4b0572c
--- /dev/null
+++ b/include/sys/vdev_initialize.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_INITIALIZE_H
+#define _SYS_VDEV_INITIALIZE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void vdev_initialize(vdev_t *vd);
+extern void vdev_initialize_stop(vdev_t *vd,
+ vdev_initializing_state_t tgt_state);
+extern void vdev_initialize_stop_all(vdev_t *vd,
+ vdev_initializing_state_t tgt_state);
+extern void vdev_initialize_restart(vdev_t *vd);
+extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
+ range_seg_t *physical_rs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_INITIALIZE_H */
diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h
index c2cc8b2d5..d8e6a1745 100644
--- a/include/sys/zio_priority.h
+++ b/include/sys/zio_priority.h
@@ -13,7 +13,7 @@
* CDDL HEADER END
*/
/*
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
*/
#ifndef _ZIO_PRIORITY_H
#define _ZIO_PRIORITY_H
@@ -29,6 +29,7 @@ typedef enum zio_priority {
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */
+ ZIO_PRIORITY_INITIALIZING, /* initializing I/O */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */
} zio_priority_t;