summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorSerapheim Dimitropoulos <serapheim.dimitro@delphix.com>2016-12-16 14:11:29 -0800
committerBrian Behlendorf <behlendorf1@llnl.gov>2018-06-26 10:07:42 -0700
commitd2734cce68cf740e015312314415f9034c67851c (patch)
treeb7a140a3cf2a19bb7c88f2d277f3b5a33c121cea /include
parent88eaf610d9c7056f0946e5090cba1e6288ff2b70 (diff)
OpenZFS 9166 - zfs storage pool checkpoint
Details about the motivation of this feature and its usage can be found in this blogpost: https://sdimitro.github.io/post/zpool-checkpoint/ A lightning talk of this feature can be found here: https://www.youtube.com/watch?v=fPQA8K40jAM Implementation details can be found in the big block comment of spa_checkpoint.c Side-changes that are relevant to this commit but not explained elsewhere: * renames members of "struct metaslab" trees to be shorter without losing meaning * space_map_{alloc,truncate}() accept a block size as a parameter. The reason is that in the current state all space maps that we allocate through the DMU use a global tunable (space_map_blksz) which defaults to 4KB. This is ok for metaslab space maps in terms of bandwidth since they are scattered all over the disk. But for other space maps this default is probably not what we want. Examples are device removal's vdev_obsolete_sm or vdev_checkpoint_sm from this review. Both of these have a 1:1 relationship with each vdev and could benefit from a bigger block size. Porting notes: * The part of dsl_scan_sync() which handles async destroys has been moved into the new dsl_process_async_destroys() function. * Remove "VERIFY(!(flags & FWRITE))" in "kernel.c" so zhack can write to block device backed pools. * ZTS: * Fix get_txg() in zpool_sync_001_pos due to "checkpoint_txg". * Don't use large dd block sizes on /dev/urandom under Linux in checkpoint_capacity. * Adopt Delphix-OS's setting of 4 (spa_asize_inflation = SPA_DVAS_PER_BP + 1) for the checkpoint_capacity test to speed its attempts to fill the pool * Create the base and nested pools with sync=disabled to speed up the "setup" phase. * Clear labels in test pool between checkpoint tests to avoid duplicate pool issues. * The import_rewind_device_replaced test has been marked as "known to fail" for the reasons listed in its DISCLAIMER. 
* New module parameters: zfs_spa_discard_memory_limit, zfs_remove_max_bytes_pause (not documented - debugging only) vdev_max_ms_count (formerly metaslabs_per_vdev) vdev_min_ms_count Authored by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: John Kennedy <john.kennedy@delphix.com> Reviewed by: Dan Kimmel <dan.kimmel@delphix.com> Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Approved by: Richard Lowe <richlowe@richlowe.net> Ported-by: Tim Chase <tim@chase2k.com> Signed-off-by: Tim Chase <tim@chase2k.com> OpenZFS-issue: https://illumos.org/issues/9166 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/7159fdb8 Closes #7570
Diffstat (limited to 'include')
-rw-r--r--include/libzfs.h7
-rw-r--r--include/libzfs_core.h3
-rw-r--r--include/sys/Makefile.am1
-rw-r--r--include/sys/dmu.h1
-rw-r--r--include/sys/dsl_dir.h3
-rw-r--r--include/sys/dsl_pool.h8
-rw-r--r--include/sys/dsl_synctask.h41
-rw-r--r--include/sys/fs/zfs.h37
-rw-r--r--include/sys/metaslab.h6
-rw-r--r--include/sys/metaslab_impl.h53
-rw-r--r--include/sys/range_tree.h2
-rw-r--r--include/sys/spa.h8
-rw-r--r--include/sys/spa_checkpoint.h44
-rw-r--r--include/sys/spa_impl.h5
-rw-r--r--include/sys/space_map.h12
-rw-r--r--include/sys/uberblock_impl.h24
-rw-r--r--include/sys/vdev.h5
-rw-r--r--include/sys/vdev_impl.h11
-rw-r--r--include/sys/vdev_removal.h4
-rw-r--r--include/sys/zio.h1
-rw-r--r--include/sys/zthr.h1
-rw-r--r--include/zfeature_common.h3
22 files changed, 229 insertions, 51 deletions
diff --git a/include/libzfs.h b/include/libzfs.h
index b98963158..c0c0f3c3c 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -152,6 +152,11 @@ typedef enum zfs_error {
EZFS_ACTIVE_POOL, /* pool is imported on a different system */
EZFS_CRYPTOFAILED, /* failed to setup encryption */
EZFS_NO_PENDING, /* cannot cancel, no operation is pending */
+ EZFS_CHECKPOINT_EXISTS, /* checkpoint exists */
+ EZFS_DISCARDING_CHECKPOINT, /* currently discarding a checkpoint */
+ EZFS_NO_CHECKPOINT, /* pool has no checkpoint */
+ EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */
+ EZFS_VDEV_TOO_BIG, /* a device is too big to be used */
EZFS_UNKNOWN
} zfs_error_t;
@@ -457,6 +462,8 @@ extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *);
extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
nvlist_t *);
+extern int zpool_checkpoint(zpool_handle_t *);
+extern int zpool_discard_checkpoint(zpool_handle_t *);
/*
* Basic handle manipulations. These functions do not create or destroy the
diff --git a/include/libzfs_core.h b/include/libzfs_core.h
index 5af0e1e75..4ca9b254c 100644
--- a/include/libzfs_core.h
+++ b/include/libzfs_core.h
@@ -110,6 +110,9 @@ int lzc_channel_program_nosync(const char *, const char *, uint64_t,
int lzc_sync(const char *, nvlist_t *, nvlist_t **);
int lzc_reopen(const char *, boolean_t);
+int lzc_pool_checkpoint(const char *);
+int lzc_pool_checkpoint_discard(const char *);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index 50c21831d..d64133ceb 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -13,6 +13,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/bptree.h \
$(top_srcdir)/include/sys/bqueue.h \
$(top_srcdir)/include/sys/cityhash.h \
+ $(top_srcdir)/include/sys/spa_checkpoint.h \
$(top_srcdir)/include/sys/dbuf.h \
$(top_srcdir)/include/sys/ddt.h \
$(top_srcdir)/include/sys/dmu.h \
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 45259a7fc..d95c09bb9 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -366,6 +366,7 @@ typedef struct dmu_buf {
#define DMU_POOL_REMOVING "com.delphix:removing"
#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj"
#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
+#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
/*
* Allocate an object from this objset. The range of object numbers
diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h
index 8a346e902..86bc2dd87 100644
--- a/include/sys/dsl_dir.h
+++ b/include/sys/dsl_dir.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@@ -138,6 +138,7 @@ uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
const char *name, dmu_tx_t *tx);
uint64_t dsl_dir_get_used(dsl_dir_t *dd);
+uint64_t dsl_dir_get_compressed(dsl_dir_t *dd);
uint64_t dsl_dir_get_quota(dsl_dir_t *dd);
uint64_t dsl_dir_get_reservation(dsl_dir_t *dd);
uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd);
diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h
index c60e4bf9d..01870e867 100644
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -38,6 +38,7 @@
#include <sys/bpobj.h>
#include <sys/bptree.h>
#include <sys/rrwlock.h>
+#include <sys/dsl_synctask.h>
#include <sys/mmp.h>
#ifdef __cplusplus
@@ -128,6 +129,7 @@ typedef struct dsl_pool {
txg_list_t dp_dirty_zilogs;
txg_list_t dp_dirty_dirs;
txg_list_t dp_sync_tasks;
+ txg_list_t dp_early_sync_tasks;
taskq_t *dp_sync_taskq;
taskq_t *dp_zil_clean_taskq;
@@ -151,7 +153,9 @@ dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops,
void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
int dsl_pool_sync_context(dsl_pool_t *dp);
-uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
+uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy);
+uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp,
+ zfs_space_check_t slop_policy);
void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
@@ -162,6 +166,8 @@ void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
int64_t used, int64_t comp, int64_t uncomp);
+void dsl_pool_ckpoint_diduse_space(dsl_pool_t *dp,
+ int64_t used, int64_t comp, int64_t uncomp);
boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag);
diff --git a/include/sys/dsl_synctask.h b/include/sys/dsl_synctask.h
index 6139303c1..da6c7a40d 100644
--- a/include/sys/dsl_synctask.h
+++ b/include/sys/dsl_synctask.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_DSL_SYNCTASK_H
@@ -57,14 +57,41 @@ typedef enum zfs_space_check {
ZFS_SPACE_CHECK_RESERVED,
/*
- * No space check is performed. Only operations which we expect to
- * result in a net reduction in space should use this
- * (e.g. "zfs destroy". Setting quotas & reservations also uses
- * this because it needs to circumvent the quota/reservation checks).
+ * Space check allows use of three quarters of the slop space.
+ * If there is less than 0.8% free space, the operation will
+ * fail.
+ */
+ ZFS_SPACE_CHECK_EXTRA_RESERVED,
+
+ /*
+ * In all cases "zfs destroy" is expected to result in an net
+ * reduction of space, except one. When the pool has a
+ * checkpoint, space freed by "zfs destroy" will not actually
+ * free anything internally. Thus, it starts failing after
+ * three quarters of the slop space is exceeded.
+ */
+ ZFS_SPACE_CHECK_DESTROY = ZFS_SPACE_CHECK_EXTRA_RESERVED,
+
+ /*
+ * A channel program can run a "zfs destroy" as part of its
+ * script and therefore has the same space_check policy when
+ * being evaluated.
+ */
+ ZFS_SPACE_CHECK_ZCP_EVAL = ZFS_SPACE_CHECK_DESTROY,
+
+ /*
+ * No space check is performed. This level of space check should
+ * be used cautiously as operations that use it can even run when
+ * 0.8% capacity is left for use. In this scenario, if there is a
+ * checkpoint, async destroys are suspended and any kind of freeing
+ * can potentially add space instead of freeing it.
*
* See also the comments above spa_slop_shift.
*/
ZFS_SPACE_CHECK_NONE,
+
+ ZFS_SPACE_CHECK_DISCARD_CHECKPOINT = ZFS_SPACE_CHECK_NONE,
+
} zfs_space_check_t;
typedef struct dsl_sync_task {
@@ -85,6 +112,10 @@ int dsl_sync_task(const char *, dsl_checkfunc_t *,
dsl_syncfunc_t *, void *, int, zfs_space_check_t);
void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
void *, int, zfs_space_check_t, dmu_tx_t *);
+int dsl_early_sync_task(const char *, dsl_checkfunc_t *,
+ dsl_syncfunc_t *, void *, int, zfs_space_check_t);
+void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
+ void *, int, zfs_space_check_t, dmu_tx_t *);
#ifdef __cplusplus
}
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 870618ecb..0ee9b00bd 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -239,6 +239,7 @@ typedef enum {
ZPOOL_PROP_TNAME,
ZPOOL_PROP_MAXDNODESIZE,
ZPOOL_PROP_MULTIHOST,
+ ZPOOL_PROP_CHECKPOINT,
ZPOOL_NUM_PROPS
} zpool_prop_t;
@@ -616,6 +617,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_DTL "DTL"
#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */
+#define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */
#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */
@@ -752,6 +754,8 @@ typedef struct zpool_load_policy {
"com.delphix:indirect_obsolete_sm"
#define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \
"com.delphix:obsolete_counts_are_precise"
+#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \
+ "com.delphix:pool_checkpoint_sm"
/*
* This is needed in userland to report the minimum necessary device size.
@@ -861,6 +865,18 @@ typedef enum pool_scrub_cmd {
POOL_SCRUB_FLAGS_END
} pool_scrub_cmd_t;
+typedef enum {
+ CS_NONE,
+ CS_CHECKPOINT_EXISTS,
+ CS_CHECKPOINT_DISCARDING,
+ CS_NUM_STATES
+} checkpoint_state_t;
+
+typedef struct pool_checkpoint_stat {
+ uint64_t pcs_state; /* checkpoint_state_t */
+ uint64_t pcs_start_time; /* time checkpoint/discard started */
+ uint64_t pcs_space; /* checkpointed space */
+} pool_checkpoint_stat_t;
/*
* ZIO types. Needed to interpret vdev statistics below.
@@ -958,7 +974,7 @@ typedef struct vdev_stat {
uint64_t vs_scan_removing; /* removing? */
uint64_t vs_scan_processed; /* scan processed bytes */
uint64_t vs_fragmentation; /* device fragmentation */
-
+ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
} vdev_stat_t;
/*
@@ -1144,6 +1160,8 @@ typedef enum zfs_ioc {
ZFS_IOC_UNLOAD_KEY,
ZFS_IOC_CHANGE_KEY,
ZFS_IOC_REMAP,
+ ZFS_IOC_POOL_CHECKPOINT,
+ ZFS_IOC_POOL_DISCARD_CHECKPOINT,
/*
* Linux - 3/64 numbers reserved.
@@ -1167,6 +1185,22 @@ typedef enum zfs_ioc {
#define BLKZNAME _IOR(0x12, 125, char[ZFS_MAX_DATASET_NAME_LEN])
/*
+ * ZFS-specific error codes used for returning descriptive errors
+ * to the userland through zfs ioctls.
+ *
+ * The enum implicitly includes all the error codes from errno.h.
+ * New code should use and extend this enum for errors that are
+ * not described precisely by generic errno codes.
+ */
+typedef enum {
+ ZFS_ERR_CHECKPOINT_EXISTS = 1024,
+ ZFS_ERR_DISCARDING_CHECKPOINT,
+ ZFS_ERR_NO_CHECKPOINT,
+ ZFS_ERR_DEVRM_IN_PROGRESS,
+ ZFS_ERR_VDEV_TOO_BIG
+} zfs_errno_t;
+
+/*
* Internal SPA load state. Used by FMA diagnosis engine.
*/
typedef enum {
@@ -1235,6 +1269,7 @@ typedef enum {
#define ZFS_IMPORT_TEMP_NAME 0x10
#define ZFS_IMPORT_SKIP_MMP 0x20
#define ZFS_IMPORT_LOAD_KEYS 0x40
+#define ZFS_IMPORT_CHECKPOINT 0x80
/*
* Channel program argument/return nvlist keys and defaults.
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index fdcf6c71b..282ec231c 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@@ -70,8 +70,8 @@ int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
-void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, uint64_t);
-void metaslab_free_dva(spa_t *, const dva_t *, uint64_t);
+void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
+void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *);
void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 76f670a4d..dafd2b231 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
@@ -255,16 +255,16 @@ struct metaslab_group {
/*
* Each metaslab maintains a set of in-core trees to track metaslab
- * operations. The in-core free tree (ms_tree) contains the list of
+ * operations. The in-core free tree (ms_allocatable) contains the list of
* free segments which are eligible for allocation. As blocks are
- * allocated, the allocated segment are removed from the ms_tree and
- * added to a per txg allocation tree (ms_alloctree). As blocks are
- * freed, they are added to the free tree (ms_freeingtree). These trees
+ * allocated, the allocated segment are removed from the ms_allocatable and
+ * added to a per txg allocation tree (ms_allocating). As blocks are
+ * freed, they are added to the free tree (ms_freeing). These trees
* allow us to process all allocations and frees in syncing context
* where it is safe to update the on-disk space maps. An additional set
* of in-core trees is maintained to track deferred frees
- * (ms_defertree). Once a block is freed it will move from the
- * ms_freedtree to the ms_defertree. A deferred free means that a block
+ * (ms_defer). Once a block is freed it will move from the
+ * ms_freed to the ms_defer tree. A deferred free means that a block
* has been freed but cannot be used by the pool until TXG_DEFER_SIZE
* transactions groups later. For example, a block that is freed in txg
* 50 will not be available for reallocation until txg 52 (50 +
@@ -278,14 +278,14 @@ struct metaslab_group {
* ALLOCATE
* |
* V
- * free segment (ms_tree) -----> ms_alloctree[4] ----> (write to space map)
+ * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map)
* ^
- * | ms_freeingtree <--- FREE
- * | |
- * | v
- * | ms_freedtree
- * | |
- * +-------- ms_defertree[2] <-------+---------> (write to space map)
+ * | ms_freeing <--- FREE
+ * | |
+ * | v
+ * | ms_freed
+ * | |
+ * +-------- ms_defer[2] <-------+-------> (write to space map)
*
*
* Each metaslab's space is tracked in a single space map in the MOS,
@@ -296,8 +296,8 @@ struct metaslab_group {
* To load the in-core free tree we read the space map from disk. This
* object contains a series of alloc and free records that are combined
* to make up the list of all free segments in this metaslab. These
- * segments are represented in-core by the ms_tree and are stored in an
- * AVL tree.
+ * segments are represented in-core by the ms_allocatable and are stored
+ * in an AVL tree.
*
* As the space map grows (as a result of the appends) it will
* eventually become space-inefficient. When the metaslab's in-core
@@ -317,20 +317,22 @@ struct metaslab {
uint64_t ms_size;
uint64_t ms_fragmentation;
- range_tree_t *ms_alloctree[TXG_SIZE];
- range_tree_t *ms_tree;
+ range_tree_t *ms_allocating[TXG_SIZE];
+ range_tree_t *ms_allocatable;
/*
* The following range trees are accessed only from syncing context.
* ms_free*tree only have entries while syncing, and are empty
* between syncs.
*/
- range_tree_t *ms_freeingtree; /* to free this syncing txg */
- range_tree_t *ms_freedtree; /* already freed this syncing txg */
- range_tree_t *ms_defertree[TXG_DEFER_SIZE];
+ range_tree_t *ms_freeing; /* to free this syncing txg */
+ range_tree_t *ms_freed; /* already freed this syncing txg */
+ range_tree_t *ms_defer[TXG_DEFER_SIZE];
+ range_tree_t *ms_checkpointing; /* to add to the checkpoint */
boolean_t ms_condensing; /* condensing? */
boolean_t ms_condense_wanted;
+ uint64_t ms_condense_checked_txg;
/*
* We must hold both ms_lock and ms_group->mg_lock in order to
@@ -356,11 +358,12 @@ struct metaslab {
/*
* The metaslab block allocators can optionally use a size-ordered
* range tree and/or an array of LBAs. Not all allocators use
- * this functionality. The ms_size_tree should always contain the
- * same number of segments as the ms_tree. The only difference
- * is that the ms_size_tree is ordered by segment sizes.
+ * this functionality. The ms_allocatable_by_size should always
+ * contain the same number of segments as the ms_allocatable. The
+ * only difference is that the ms_allocatable_by_size is ordered by
+ * segment sizes.
*/
- avl_tree_t ms_size_tree;
+ avl_tree_t ms_allocatable_by_size;
uint64_t ms_lbas[MAX_LBAS];
metaslab_group_t *ms_group; /* metaslab group */
diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h
index 9eef762de..7f79786f5 100644
--- a/include/sys/range_tree.h
+++ b/include/sys/range_tree.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_RANGE_TREE_H
diff --git a/include/sys/spa.h b/include/sys/spa.h
index e8578be9a..b6483e11b 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -747,6 +747,8 @@ extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props,
uint64_t flags);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
extern int spa_destroy(char *pool);
+extern int spa_checkpoint(const char *pool);
+extern int spa_checkpoint_discard(const char *pool);
extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
boolean_t hardforce);
extern int spa_reset(char *pool);
@@ -965,6 +967,7 @@ extern spa_load_state_t spa_load_state(spa_t *spa);
extern uint64_t spa_freeze_txg(spa_t *spa);
extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
extern uint64_t spa_get_dspace(spa_t *spa);
+extern uint64_t spa_get_checkpoint_space(spa_t *spa);
extern uint64_t spa_get_slop_space(spa_t *spa);
extern void spa_update_dspace(spa_t *spa);
extern uint64_t spa_version(spa_t *spa);
@@ -1016,6 +1019,10 @@ extern boolean_t spa_writeable(spa_t *spa);
extern boolean_t spa_has_pending_synctask(spa_t *spa);
extern int spa_maxblocksize(spa_t *spa);
extern int spa_maxdnodesize(spa_t *spa);
+extern boolean_t spa_has_checkpoint(spa_t *spa);
+extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa);
+extern boolean_t spa_suspend_async_destroy(spa_t *spa);
+extern uint64_t spa_min_claim_txg(spa_t *spa);
extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva,
const blkptr_t *bp);
@@ -1027,6 +1034,7 @@ extern uint64_t spa_get_last_removal_txg(spa_t *spa);
extern boolean_t spa_trust_config(spa_t *spa);
extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
+extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
extern boolean_t spa_multihost(spa_t *spa);
extern unsigned long spa_get_hostid(void);
diff --git a/include/sys/spa_checkpoint.h b/include/sys/spa_checkpoint.h
new file mode 100644
index 000000000..a5c856014
--- /dev/null
+++ b/include/sys/spa_checkpoint.h
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_SPA_CHECKPOINT_H
+#define _SYS_SPA_CHECKPOINT_H
+
+#include <sys/zthr.h>
+
+typedef struct spa_checkpoint_info {
+ uint64_t sci_timestamp; /* when checkpointed uberblock was synced */
+ uint64_t sci_dspace; /* disk space used by checkpoint in bytes */
+} spa_checkpoint_info_t;
+
+int spa_checkpoint(const char *);
+int spa_checkpoint_discard(const char *);
+
+boolean_t spa_checkpoint_discard_thread_check(void *, zthr_t *);
+int spa_checkpoint_discard_thread(void *, zthr_t *);
+
+int spa_checkpoint_get_stats(spa_t *, pool_checkpoint_stat_t *);
+
+#endif /* _SYS_SPA_CHECKPOINT_H */
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 6abb63157..8d2a31961 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -32,6 +32,7 @@
#define _SYS_SPA_IMPL_H
#include <sys/spa.h>
+#include <sys/spa_checkpoint.h>
#include <sys/vdev.h>
#include <sys/vdev_removal.h>
#include <sys/metaslab.h>
@@ -284,6 +285,10 @@ struct spa {
spa_condensing_indirect_t *spa_condensing_indirect;
zthr_t *spa_condense_zthr; /* zthr doing condense. */
+ uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */
+ spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */
+ zthr_t *spa_checkpoint_discard_zthr;
+
char *spa_root; /* alternate root directory */
uint64_t spa_ena; /* spa-wide ereport ENA */
int spa_last_open_failed; /* error if last open failed */
diff --git a/include/sys/space_map.h b/include/sys/space_map.h
index 457300d05..98b87269c 100644
--- a/include/sys/space_map.h
+++ b/include/sys/space_map.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPACE_MAP_H
@@ -57,7 +57,7 @@ extern "C" {
typedef struct space_map_phys {
uint64_t smp_object; /* on-disk space map object */
uint64_t smp_objsize; /* size of the object */
- uint64_t smp_alloc; /* space allocated from the map */
+ int64_t smp_alloc; /* space allocated from the map */
uint64_t smp_pad[5]; /* reserved */
/*
@@ -82,7 +82,7 @@ typedef struct space_map {
uint64_t sm_size; /* size of map */
uint8_t sm_shift; /* unit shift */
uint64_t sm_length; /* synced length */
- uint64_t sm_alloc; /* synced space allocated */
+ int64_t sm_alloc; /* synced space allocated */
objset_t *sm_os; /* objset for this map */
uint64_t sm_object; /* object id for this map */
uint32_t sm_blksz; /* block size for space map */
@@ -140,6 +140,8 @@ typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size,
int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg);
+int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
+ dmu_tx_t *tx);
void space_map_histogram_clear(space_map_t *sm);
void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
@@ -153,8 +155,8 @@ uint64_t space_map_length(space_map_t *sm);
void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
dmu_tx_t *tx);
-void space_map_truncate(space_map_t *sm, dmu_tx_t *tx);
-uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx);
+void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx);
+uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx);
void space_map_free(space_map_t *sm, dmu_tx_t *tx);
void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx);
diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h
index 08eeabdda..113df7c61 100644
--- a/include/sys/uberblock_impl.h
+++ b/include/sys/uberblock_impl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_UBERBLOCK_IMPL_H
@@ -61,6 +61,28 @@ struct uberblock {
uint64_t ub_mmp_magic; /* MMP_MAGIC */
uint64_t ub_mmp_delay; /* nanosec since last MMP write */
uint64_t ub_mmp_seq; /* reserved for sequence number */
+
+ /*
+ * ub_checkpoint_txg indicates two things about the current uberblock:
+ *
+ * 1] If it is not zero then this uberblock is a checkpoint. If it is
+ * zero, then this uberblock is not a checkpoint.
+ *
+ * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
+ * the ub_txg that the uberblock had at the time we moved it to
+ * the MOS config.
+ *
+ * The field is set when we checkpoint the uberblock and continues to
+ * hold that value even after we've rewound (unlike the ub_txg that
+ * is reset to a higher value).
+ *
+ * Besides checks used to determine whether we are reopening the
+ * pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
+ * the value of the field is used to determine which ZIL blocks have
+ * been allocated according to the ms_sm when we are rewinding to a
+ * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
+ * the ZIL block is not allocated [see uses of spa_min_claim_txg()].
+ */
uint64_t ub_checkpoint_txg;
};
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 161e30ae7..6d31d61b5 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_H
@@ -81,7 +81,7 @@ extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx);
extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset,
- uint64_t size, uint64_t txg);
+ uint64_t size);
extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev,
uint64_t offset, uint64_t size, dmu_tx_t *tx);
@@ -122,6 +122,7 @@ extern boolean_t vdev_readable(vdev_t *vd);
extern boolean_t vdev_writeable(vdev_t *vd);
extern boolean_t vdev_allocatable(vdev_t *vd);
extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
+extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd);
extern void vdev_cache_init(vdev_t *vd);
extern void vdev_cache_fini(vdev_t *vd);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index e28994613..c22087307 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_IMPL_H
@@ -236,6 +236,9 @@ struct vdev {
kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
+ /* pool checkpoint related */
+ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
+
/*
* Values stored in the config for an indirect or removing vdev.
*/
@@ -469,6 +472,7 @@ extern void vdev_set_min_asize(vdev_t *vd);
/*
* Global variables
*/
+extern int vdev_standard_sm_blksz;
/* zdb uses this tunable, so it must be declared here to make lint happy. */
extern int zfs_vdev_cache_size;
@@ -481,6 +485,11 @@ extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx);
extern int vdev_obsolete_sm_object(vdev_t *vd);
extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd);
+/*
+ * Other miscellaneous functions
+ */
+int vdev_checkpoint_sm_object(vdev_t *vd);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h
index bec2cea33..3962237af 100644
--- a/include/sys/vdev_removal.h
+++ b/include/sys/vdev_removal.h
@@ -14,7 +14,7 @@
*/
/*
- * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_REMOVAL_H
@@ -79,7 +79,7 @@ extern void spa_condense_fini(spa_t *);
extern void spa_start_indirect_condensing_thread(spa_t *);
extern void spa_vdev_condense_suspend(spa_t *);
extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
-extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t);
+extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t);
extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
extern void svr_sync(spa_t *spa, dmu_tx_t *tx);
extern void spa_vdev_remove_suspend(spa_t *);
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 25c12fbcc..6c0c682a8 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -566,7 +566,6 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
blkptr_t *new_bp, uint64_t size, boolean_t *slog);
-extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern void zio_shrink(zio_t *zio, uint64_t size);
diff --git a/include/sys/zthr.h b/include/sys/zthr.h
index 6bfb6b6c0..62da2eea8 100644
--- a/include/sys/zthr.h
+++ b/include/sys/zthr.h
@@ -13,7 +13,6 @@
* CDDL HEADER END
*/
-
/*
* Copyright (c) 2017 by Delphix. All rights reserved.
*/
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index 13670c8e5..c59b800d3 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
@@ -61,6 +61,7 @@ typedef enum spa_feature {
SPA_FEATURE_PROJECT_QUOTA,
SPA_FEATURE_DEVICE_REMOVAL,
SPA_FEATURE_OBSOLETE_COUNTS,
+ SPA_FEATURE_POOL_CHECKPOINT,
SPA_FEATURES
} spa_feature_t;