summary refs log tree commit diff stats
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r-- include/libzfs.h 7
-rw-r--r-- include/libzfs_core.h 3
-rw-r--r-- include/sys/Makefile.am 1
-rw-r--r-- include/sys/dmu.h 1
-rw-r--r-- include/sys/dsl_dir.h 3
-rw-r--r-- include/sys/dsl_pool.h 8
-rw-r--r-- include/sys/dsl_synctask.h 41
-rw-r--r-- include/sys/fs/zfs.h 37
-rw-r--r-- include/sys/metaslab.h 6
-rw-r--r-- include/sys/metaslab_impl.h 53
-rw-r--r-- include/sys/range_tree.h 2
-rw-r--r-- include/sys/spa.h 8
-rw-r--r-- include/sys/spa_checkpoint.h 44
-rw-r--r-- include/sys/spa_impl.h 5
-rw-r--r-- include/sys/space_map.h 12
-rw-r--r-- include/sys/uberblock_impl.h 24
-rw-r--r-- include/sys/vdev.h 5
-rw-r--r-- include/sys/vdev_impl.h 11
-rw-r--r-- include/sys/vdev_removal.h 4
-rw-r--r-- include/sys/zio.h 1
-rw-r--r-- include/sys/zthr.h 1
-rw-r--r-- include/zfeature_common.h 3
22 files changed, 229 insertions, 51 deletions
diff --git a/include/libzfs.h b/include/libzfs.h
index b98963158..c0c0f3c3c 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -152,6 +152,11 @@ typedef enum zfs_error {
EZFS_ACTIVE_POOL, /* pool is imported on a different system */
EZFS_CRYPTOFAILED, /* failed to setup encryption */
EZFS_NO_PENDING, /* cannot cancel, no operation is pending */
+ EZFS_CHECKPOINT_EXISTS, /* checkpoint exists */
+ EZFS_DISCARDING_CHECKPOINT, /* currently discarding a checkpoint */
+ EZFS_NO_CHECKPOINT, /* pool has no checkpoint */
+ EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */
+ EZFS_VDEV_TOO_BIG, /* a device is too big to be used */
EZFS_UNKNOWN
} zfs_error_t;
@@ -457,6 +462,8 @@ extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *);
extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
nvlist_t *);
+extern int zpool_checkpoint(zpool_handle_t *);
+extern int zpool_discard_checkpoint(zpool_handle_t *);
/*
* Basic handle manipulations. These functions do not create or destroy the
diff --git a/include/libzfs_core.h b/include/libzfs_core.h
index 5af0e1e75..4ca9b254c 100644
--- a/include/libzfs_core.h
+++ b/include/libzfs_core.h
@@ -110,6 +110,9 @@ int lzc_channel_program_nosync(const char *, const char *, uint64_t,
int lzc_sync(const char *, nvlist_t *, nvlist_t **);
int lzc_reopen(const char *, boolean_t);
+int lzc_pool_checkpoint(const char *);
+int lzc_pool_checkpoint_discard(const char *);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index 50c21831d..d64133ceb 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -13,6 +13,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/bptree.h \
$(top_srcdir)/include/sys/bqueue.h \
$(top_srcdir)/include/sys/cityhash.h \
+ $(top_srcdir)/include/sys/spa_checkpoint.h \
$(top_srcdir)/include/sys/dbuf.h \
$(top_srcdir)/include/sys/ddt.h \
$(top_srcdir)/include/sys/dmu.h \
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 45259a7fc..d95c09bb9 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -366,6 +366,7 @@ typedef struct dmu_buf {
#define DMU_POOL_REMOVING "com.delphix:removing"
#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj"
#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
+#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
/*
* Allocate an object from this objset. The range of object numbers
diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h
index 8a346e902..86bc2dd87 100644
--- a/include/sys/dsl_dir.h
+++ b/include/sys/dsl_dir.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@@ -138,6 +138,7 @@ uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
const char *name, dmu_tx_t *tx);
uint64_t dsl_dir_get_used(dsl_dir_t *dd);
+uint64_t dsl_dir_get_compressed(dsl_dir_t *dd);
uint64_t dsl_dir_get_quota(dsl_dir_t *dd);
uint64_t dsl_dir_get_reservation(dsl_dir_t *dd);
uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd);
diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h
index c60e4bf9d..01870e867 100644
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -38,6 +38,7 @@
#include <sys/bpobj.h>
#include <sys/bptree.h>
#include <sys/rrwlock.h>
+#include <sys/dsl_synctask.h>
#include <sys/mmp.h>
#ifdef __cplusplus
@@ -128,6 +129,7 @@ typedef struct dsl_pool {
txg_list_t dp_dirty_zilogs;
txg_list_t dp_dirty_dirs;
txg_list_t dp_sync_tasks;
+ txg_list_t dp_early_sync_tasks;
taskq_t *dp_sync_taskq;
taskq_t *dp_zil_clean_taskq;
@@ -151,7 +153,9 @@ dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops,
void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
int dsl_pool_sync_context(dsl_pool_t *dp);
-uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
+uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy);
+uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp,
+ zfs_space_check_t slop_policy);
void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
@@ -162,6 +166,8 @@ void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
int64_t used, int64_t comp, int64_t uncomp);
+void dsl_pool_ckpoint_diduse_space(dsl_pool_t *dp,
+ int64_t used, int64_t comp, int64_t uncomp);
boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag);
diff --git a/include/sys/dsl_synctask.h b/include/sys/dsl_synctask.h
index 6139303c1..da6c7a40d 100644
--- a/include/sys/dsl_synctask.h
+++ b/include/sys/dsl_synctask.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_DSL_SYNCTASK_H
@@ -57,14 +57,41 @@ typedef enum zfs_space_check {
ZFS_SPACE_CHECK_RESERVED,
/*
- * No space check is performed. Only operations which we expect to
- * result in a net reduction in space should use this
- * (e.g. "zfs destroy". Setting quotas & reservations also uses
- * this because it needs to circumvent the quota/reservation checks).
+ * Space check allows use of three quarters of the slop space.
+ * If there is less than 0.8% free space, the operation will
+ * fail.
+ */
+ ZFS_SPACE_CHECK_EXTRA_RESERVED,
+
+ /*
+ * In all cases "zfs destroy" is expected to result in a net
+ * reduction of space, except one. When the pool has a
+ * checkpoint, space freed by "zfs destroy" will not actually
+ * free anything internally. Thus, it starts failing after
+ * three quarters of the slop space is exceeded.
+ */
+ ZFS_SPACE_CHECK_DESTROY = ZFS_SPACE_CHECK_EXTRA_RESERVED,
+
+ /*
+ * A channel program can run a "zfs destroy" as part of its
+ * script and therefore has the same space_check policy when
+ * being evaluated.
+ */
+ ZFS_SPACE_CHECK_ZCP_EVAL = ZFS_SPACE_CHECK_DESTROY,
+
+ /*
+ * No space check is performed. This level of space check should
+ * be used cautiously as operations that use it can even run when
+ * 0.8% capacity is left for use. In this scenario, if there is a
+ * checkpoint, async destroys are suspended and any kind of freeing
+ * can potentially add space instead of freeing it.
*
* See also the comments above spa_slop_shift.
*/
ZFS_SPACE_CHECK_NONE,
+
+ ZFS_SPACE_CHECK_DISCARD_CHECKPOINT = ZFS_SPACE_CHECK_NONE,
+
} zfs_space_check_t;
typedef struct dsl_sync_task {
@@ -85,6 +112,10 @@ int dsl_sync_task(const char *, dsl_checkfunc_t *,
dsl_syncfunc_t *, void *, int, zfs_space_check_t);
void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
void *, int, zfs_space_check_t, dmu_tx_t *);
+int dsl_early_sync_task(const char *, dsl_checkfunc_t *,
+ dsl_syncfunc_t *, void *, int, zfs_space_check_t);
+void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
+ void *, int, zfs_space_check_t, dmu_tx_t *);
#ifdef __cplusplus
}
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 870618ecb..0ee9b00bd 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -239,6 +239,7 @@ typedef enum {
ZPOOL_PROP_TNAME,
ZPOOL_PROP_MAXDNODESIZE,
ZPOOL_PROP_MULTIHOST,
+ ZPOOL_PROP_CHECKPOINT,
ZPOOL_NUM_PROPS
} zpool_prop_t;
@@ -616,6 +617,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_DTL "DTL"
#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */
+#define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */
#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */
@@ -752,6 +754,8 @@ typedef struct zpool_load_policy {
"com.delphix:indirect_obsolete_sm"
#define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \
"com.delphix:obsolete_counts_are_precise"
+#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \
+ "com.delphix:pool_checkpoint_sm"
/*
* This is needed in userland to report the minimum necessary device size.
@@ -861,6 +865,18 @@ typedef enum pool_scrub_cmd {
POOL_SCRUB_FLAGS_END
} pool_scrub_cmd_t;
+typedef enum {
+ CS_NONE,
+ CS_CHECKPOINT_EXISTS,
+ CS_CHECKPOINT_DISCARDING,
+ CS_NUM_STATES
+} checkpoint_state_t;
+
+typedef struct pool_checkpoint_stat {
+ uint64_t pcs_state; /* checkpoint_state_t */
+ uint64_t pcs_start_time; /* time checkpoint/discard started */
+ uint64_t pcs_space; /* checkpointed space */
+} pool_checkpoint_stat_t;
/*
* ZIO types. Needed to interpret vdev statistics below.
@@ -958,7 +974,7 @@ typedef struct vdev_stat {
uint64_t vs_scan_removing; /* removing? */
uint64_t vs_scan_processed; /* scan processed bytes */
uint64_t vs_fragmentation; /* device fragmentation */
-
+ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
} vdev_stat_t;
/*
@@ -1144,6 +1160,8 @@ typedef enum zfs_ioc {
ZFS_IOC_UNLOAD_KEY,
ZFS_IOC_CHANGE_KEY,
ZFS_IOC_REMAP,
+ ZFS_IOC_POOL_CHECKPOINT,
+ ZFS_IOC_POOL_DISCARD_CHECKPOINT,
/*
* Linux - 3/64 numbers reserved.
@@ -1167,6 +1185,22 @@ typedef enum zfs_ioc {
#define BLKZNAME _IOR(0x12, 125, char[ZFS_MAX_DATASET_NAME_LEN])
/*
+ * ZFS-specific error codes used for returning descriptive errors
+ * to the userland through zfs ioctls.
+ *
+ * The enum implicitly includes all the error codes from errno.h.
+ * New code should use and extend this enum for errors that are
+ * not described precisely by generic errno codes.
+ */
+typedef enum {
+ ZFS_ERR_CHECKPOINT_EXISTS = 1024,
+ ZFS_ERR_DISCARDING_CHECKPOINT,
+ ZFS_ERR_NO_CHECKPOINT,
+ ZFS_ERR_DEVRM_IN_PROGRESS,
+ ZFS_ERR_VDEV_TOO_BIG
+} zfs_errno_t;
+
+/*
* Internal SPA load state. Used by FMA diagnosis engine.
*/
typedef enum {
@@ -1235,6 +1269,7 @@ typedef enum {
#define ZFS_IMPORT_TEMP_NAME 0x10
#define ZFS_IMPORT_SKIP_MMP 0x20
#define ZFS_IMPORT_LOAD_KEYS 0x40
+#define ZFS_IMPORT_CHECKPOINT 0x80
/*
* Channel program argument/return nvlist keys and defaults.
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index fdcf6c71b..282ec231c 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@@ -70,8 +70,8 @@ int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
-void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, uint64_t);
-void metaslab_free_dva(spa_t *, const dva_t *, uint64_t);
+void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
+void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *);
void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 76f670a4d..dafd2b231 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
@@ -255,16 +255,16 @@ struct metaslab_group {
/*
* Each metaslab maintains a set of in-core trees to track metaslab
- * operations. The in-core free tree (ms_tree) contains the list of
+ * operations. The in-core free tree (ms_allocatable) contains the list of
* free segments which are eligible for allocation. As blocks are
- * allocated, the allocated segment are removed from the ms_tree and
- * added to a per txg allocation tree (ms_alloctree). As blocks are
- * freed, they are added to the free tree (ms_freeingtree). These trees
+ * allocated, the allocated segments are removed from the ms_allocatable and
+ * added to a per txg allocation tree (ms_allocating). As blocks are
+ * freed, they are added to the free tree (ms_freeing). These trees
* allow us to process all allocations and frees in syncing context
* where it is safe to update the on-disk space maps. An additional set
* of in-core trees is maintained to track deferred frees
- * (ms_defertree). Once a block is freed it will move from the
- * ms_freedtree to the ms_defertree. A deferred free means that a block
+ * (ms_defer). Once a block is freed it will move from the
+ * ms_freed to the ms_defer tree. A deferred free means that a block
* has been freed but cannot be used by the pool until TXG_DEFER_SIZE
* transactions groups later. For example, a block that is freed in txg
* 50 will not be available for reallocation until txg 52 (50 +
@@ -278,14 +278,14 @@ struct metaslab_group {
* ALLOCATE
* |
* V
- * free segment (ms_tree) -----> ms_alloctree[4] ----> (write to space map)
+ * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map)
* ^
- * | ms_freeingtree <--- FREE
- * | |
- * | v
- * | ms_freedtree
- * | |
- * +-------- ms_defertree[2] <-------+---------> (write to space map)
+ * | ms_freeing <--- FREE
+ * | |
+ * | v
+ * | ms_freed
+ * | |
+ * +-------- ms_defer[2] <-------+-------> (write to space map)
*
*
* Each metaslab's space is tracked in a single space map in the MOS,
@@ -296,8 +296,8 @@ struct metaslab_group {
* To load the in-core free tree we read the space map from disk. This
* object contains a series of alloc and free records that are combined
* to make up the list of all free segments in this metaslab. These
- * segments are represented in-core by the ms_tree and are stored in an
- * AVL tree.
+ * segments are represented in-core by the ms_allocatable and are stored
+ * in an AVL tree.
*
* As the space map grows (as a result of the appends) it will
* eventually become space-inefficient. When the metaslab's in-core
@@ -317,20 +317,22 @@ struct metaslab {
uint64_t ms_size;
uint64_t ms_fragmentation;
- range_tree_t *ms_alloctree[TXG_SIZE];
- range_tree_t *ms_tree;
+ range_tree_t *ms_allocating[TXG_SIZE];
+ range_tree_t *ms_allocatable;
/*
* The following range trees are accessed only from syncing context.
* ms_free*tree only have entries while syncing, and are empty
* between syncs.
*/
- range_tree_t *ms_freeingtree; /* to free this syncing txg */
- range_tree_t *ms_freedtree; /* already freed this syncing txg */
- range_tree_t *ms_defertree[TXG_DEFER_SIZE];
+ range_tree_t *ms_freeing; /* to free this syncing txg */
+ range_tree_t *ms_freed; /* already freed this syncing txg */
+ range_tree_t *ms_defer[TXG_DEFER_SIZE];
+ range_tree_t *ms_checkpointing; /* to add to the checkpoint */
boolean_t ms_condensing; /* condensing? */
boolean_t ms_condense_wanted;
+ uint64_t ms_condense_checked_txg;
/*
* We must hold both ms_lock and ms_group->mg_lock in order to
@@ -356,11 +358,12 @@ struct metaslab {
/*
* The metaslab block allocators can optionally use a size-ordered
* range tree and/or an array of LBAs. Not all allocators use
- * this functionality. The ms_size_tree should always contain the
- * same number of segments as the ms_tree. The only difference
- * is that the ms_size_tree is ordered by segment sizes.
+ * this functionality. The ms_allocatable_by_size should always
+ * contain the same number of segments as the ms_allocatable. The
+ * only difference is that the ms_allocatable_by_size is ordered by
+ * segment sizes.
*/
- avl_tree_t ms_size_tree;
+ avl_tree_t ms_allocatable_by_size;
uint64_t ms_lbas[MAX_LBAS];
metaslab_group_t *ms_group; /* metaslab group */
diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h
index 9eef762de..7f79786f5 100644
--- a/include/sys/range_tree.h
+++ b/include/sys/range_tree.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_RANGE_TREE_H
diff --git a/include/sys/spa.h b/include/sys/spa.h
index e8578be9a..b6483e11b 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -747,6 +747,8 @@ extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props,
uint64_t flags);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
extern int spa_destroy(char *pool);
+extern int spa_checkpoint(const char *pool);
+extern int spa_checkpoint_discard(const char *pool);
extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
boolean_t hardforce);
extern int spa_reset(char *pool);
@@ -965,6 +967,7 @@ extern spa_load_state_t spa_load_state(spa_t *spa);
extern uint64_t spa_freeze_txg(spa_t *spa);
extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
extern uint64_t spa_get_dspace(spa_t *spa);
+extern uint64_t spa_get_checkpoint_space(spa_t *spa);
extern uint64_t spa_get_slop_space(spa_t *spa);
extern void spa_update_dspace(spa_t *spa);
extern uint64_t spa_version(spa_t *spa);
@@ -1016,6 +1019,10 @@ extern boolean_t spa_writeable(spa_t *spa);
extern boolean_t spa_has_pending_synctask(spa_t *spa);
extern int spa_maxblocksize(spa_t *spa);
extern int spa_maxdnodesize(spa_t *spa);
+extern boolean_t spa_has_checkpoint(spa_t *spa);
+extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa);
+extern boolean_t spa_suspend_async_destroy(spa_t *spa);
+extern uint64_t spa_min_claim_txg(spa_t *spa);
extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva,
const blkptr_t *bp);
@@ -1027,6 +1034,7 @@ extern uint64_t spa_get_last_removal_txg(spa_t *spa);
extern boolean_t spa_trust_config(spa_t *spa);
extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
+extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
extern boolean_t spa_multihost(spa_t *spa);
extern unsigned long spa_get_hostid(void);
diff --git a/include/sys/spa_checkpoint.h b/include/sys/spa_checkpoint.h
new file mode 100644
index 000000000..a5c856014
--- /dev/null
+++ b/include/sys/spa_checkpoint.h
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_SPA_CHECKPOINT_H
+#define _SYS_SPA_CHECKPOINT_H
+
+#include <sys/zthr.h>
+
+typedef struct spa_checkpoint_info {
+ uint64_t sci_timestamp; /* when checkpointed uberblock was synced */
+ uint64_t sci_dspace; /* disk space used by checkpoint in bytes */
+} spa_checkpoint_info_t;
+
+int spa_checkpoint(const char *);
+int spa_checkpoint_discard(const char *);
+
+boolean_t spa_checkpoint_discard_thread_check(void *, zthr_t *);
+int spa_checkpoint_discard_thread(void *, zthr_t *);
+
+int spa_checkpoint_get_stats(spa_t *, pool_checkpoint_stat_t *);
+
+#endif /* _SYS_SPA_CHECKPOINT_H */
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 6abb63157..8d2a31961 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -32,6 +32,7 @@
#define _SYS_SPA_IMPL_H
#include <sys/spa.h>
+#include <sys/spa_checkpoint.h>
#include <sys/vdev.h>
#include <sys/vdev_removal.h>
#include <sys/metaslab.h>
@@ -284,6 +285,10 @@ struct spa {
spa_condensing_indirect_t *spa_condensing_indirect;
zthr_t *spa_condense_zthr; /* zthr doing condense. */
+ uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */
+ spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */
+ zthr_t *spa_checkpoint_discard_zthr;
+
char *spa_root; /* alternate root directory */
uint64_t spa_ena; /* spa-wide ereport ENA */
int spa_last_open_failed; /* error if last open failed */
diff --git a/include/sys/space_map.h b/include/sys/space_map.h
index 457300d05..98b87269c 100644
--- a/include/sys/space_map.h
+++ b/include/sys/space_map.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPACE_MAP_H
@@ -57,7 +57,7 @@ extern "C" {
typedef struct space_map_phys {
uint64_t smp_object; /* on-disk space map object */
uint64_t smp_objsize; /* size of the object */
- uint64_t smp_alloc; /* space allocated from the map */
+ int64_t smp_alloc; /* space allocated from the map */
uint64_t smp_pad[5]; /* reserved */
/*
@@ -82,7 +82,7 @@ typedef struct space_map {
uint64_t sm_size; /* size of map */
uint8_t sm_shift; /* unit shift */
uint64_t sm_length; /* synced length */
- uint64_t sm_alloc; /* synced space allocated */
+ int64_t sm_alloc; /* synced space allocated */
objset_t *sm_os; /* objset for this map */
uint64_t sm_object; /* object id for this map */
uint32_t sm_blksz; /* block size for space map */
@@ -140,6 +140,8 @@ typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size,
int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg);
+int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
+ dmu_tx_t *tx);
void space_map_histogram_clear(space_map_t *sm);
void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
@@ -153,8 +155,8 @@ uint64_t space_map_length(space_map_t *sm);
void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
dmu_tx_t *tx);
-void space_map_truncate(space_map_t *sm, dmu_tx_t *tx);
-uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx);
+void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx);
+uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx);
void space_map_free(space_map_t *sm, dmu_tx_t *tx);
void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx);
diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h
index 08eeabdda..113df7c61 100644
--- a/include/sys/uberblock_impl.h
+++ b/include/sys/uberblock_impl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_UBERBLOCK_IMPL_H
@@ -61,6 +61,28 @@ struct uberblock {
uint64_t ub_mmp_magic; /* MMP_MAGIC */
uint64_t ub_mmp_delay; /* nanosec since last MMP write */
uint64_t ub_mmp_seq; /* reserved for sequence number */
+
+ /*
+ * ub_checkpoint_txg indicates two things about the current uberblock:
+ *
+ * 1] If it is not zero then this uberblock is a checkpoint. If it is
+ * zero, then this uberblock is not a checkpoint.
+ *
+ * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
+ * the ub_txg that the uberblock had at the time we moved it to
+ * the MOS config.
+ *
+ * The field is set when we checkpoint the uberblock and continues to
+ * hold that value even after we've rewound (unlike the ub_txg that
+ * is reset to a higher value).
+ *
+ * Besides checks used to determine whether we are reopening the
+ * pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
+ * the value of the field is used to determine which ZIL blocks have
+ * been allocated according to the ms_sm when we are rewinding to a
+ * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
+ * the ZIL block is not allocated [see uses of spa_min_claim_txg()].
+ */
uint64_t ub_checkpoint_txg;
};
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 161e30ae7..6d31d61b5 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_H
@@ -81,7 +81,7 @@ extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx);
extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset,
- uint64_t size, uint64_t txg);
+ uint64_t size);
extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev,
uint64_t offset, uint64_t size, dmu_tx_t *tx);
@@ -122,6 +122,7 @@ extern boolean_t vdev_readable(vdev_t *vd);
extern boolean_t vdev_writeable(vdev_t *vd);
extern boolean_t vdev_allocatable(vdev_t *vd);
extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
+extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd);
extern void vdev_cache_init(vdev_t *vd);
extern void vdev_cache_fini(vdev_t *vd);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index e28994613..c22087307 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_IMPL_H
@@ -236,6 +236,9 @@ struct vdev {
kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
+ /* pool checkpoint related */
+ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
+
/*
* Values stored in the config for an indirect or removing vdev.
*/
@@ -469,6 +472,7 @@ extern void vdev_set_min_asize(vdev_t *vd);
/*
* Global variables
*/
+extern int vdev_standard_sm_blksz;
/* zdb uses this tunable, so it must be declared here to make lint happy. */
extern int zfs_vdev_cache_size;
@@ -481,6 +485,11 @@ extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx);
extern int vdev_obsolete_sm_object(vdev_t *vd);
extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd);
+/*
+ * Other miscellaneous functions
+ */
+int vdev_checkpoint_sm_object(vdev_t *vd);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h
index bec2cea33..3962237af 100644
--- a/include/sys/vdev_removal.h
+++ b/include/sys/vdev_removal.h
@@ -14,7 +14,7 @@
*/
/*
- * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_REMOVAL_H
@@ -79,7 +79,7 @@ extern void spa_condense_fini(spa_t *);
extern void spa_start_indirect_condensing_thread(spa_t *);
extern void spa_vdev_condense_suspend(spa_t *);
extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
-extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t);
+extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t);
extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
extern void svr_sync(spa_t *spa, dmu_tx_t *tx);
extern void spa_vdev_remove_suspend(spa_t *);
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 25c12fbcc..6c0c682a8 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -566,7 +566,6 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
blkptr_t *new_bp, uint64_t size, boolean_t *slog);
-extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern void zio_shrink(zio_t *zio, uint64_t size);
diff --git a/include/sys/zthr.h b/include/sys/zthr.h
index 6bfb6b6c0..62da2eea8 100644
--- a/include/sys/zthr.h
+++ b/include/sys/zthr.h
@@ -13,7 +13,6 @@
* CDDL HEADER END
*/
-
/*
* Copyright (c) 2017 by Delphix. All rights reserved.
*/
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index 13670c8e5..c59b800d3 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
@@ -61,6 +61,7 @@ typedef enum spa_feature {
SPA_FEATURE_PROJECT_QUOTA,
SPA_FEATURE_DEVICE_REMOVAL,
SPA_FEATURE_OBSOLETE_COUNTS,
+ SPA_FEATURE_POOL_CHECKPOINT,
SPA_FEATURES
} spa_feature_t;