diff options
Diffstat (limited to 'include')
-rw-r--r-- | include/libzfs.h | 7 | ||||
-rw-r--r-- | include/libzfs_core.h | 3 | ||||
-rw-r--r-- | include/sys/Makefile.am | 1 | ||||
-rw-r--r-- | include/sys/dmu.h | 1 | ||||
-rw-r--r-- | include/sys/dsl_dir.h | 3 | ||||
-rw-r--r-- | include/sys/dsl_pool.h | 8 | ||||
-rw-r--r-- | include/sys/dsl_synctask.h | 41 | ||||
-rw-r--r-- | include/sys/fs/zfs.h | 37 | ||||
-rw-r--r-- | include/sys/metaslab.h | 6 | ||||
-rw-r--r-- | include/sys/metaslab_impl.h | 53 | ||||
-rw-r--r-- | include/sys/range_tree.h | 2 | ||||
-rw-r--r-- | include/sys/spa.h | 8 | ||||
-rw-r--r-- | include/sys/spa_checkpoint.h | 44 | ||||
-rw-r--r-- | include/sys/spa_impl.h | 5 | ||||
-rw-r--r-- | include/sys/space_map.h | 12 | ||||
-rw-r--r-- | include/sys/uberblock_impl.h | 24 | ||||
-rw-r--r-- | include/sys/vdev.h | 5 | ||||
-rw-r--r-- | include/sys/vdev_impl.h | 11 | ||||
-rw-r--r-- | include/sys/vdev_removal.h | 4 | ||||
-rw-r--r-- | include/sys/zio.h | 1 | ||||
-rw-r--r-- | include/sys/zthr.h | 1 | ||||
-rw-r--r-- | include/zfeature_common.h | 3 |
22 files changed, 229 insertions, 51 deletions
diff --git a/include/libzfs.h b/include/libzfs.h index b98963158..c0c0f3c3c 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -152,6 +152,11 @@ typedef enum zfs_error { EZFS_ACTIVE_POOL, /* pool is imported on a different system */ EZFS_CRYPTOFAILED, /* failed to setup encryption */ EZFS_NO_PENDING, /* cannot cancel, no operation is pending */ + EZFS_CHECKPOINT_EXISTS, /* checkpoint exists */ + EZFS_DISCARDING_CHECKPOINT, /* currently discarding a checkpoint */ + EZFS_NO_CHECKPOINT, /* pool has no checkpoint */ + EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */ + EZFS_VDEV_TOO_BIG, /* a device is too big to be used */ EZFS_UNKNOWN } zfs_error_t; @@ -457,6 +462,8 @@ extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); extern int zpool_get_physpath(zpool_handle_t *, char *, size_t); extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, nvlist_t *); +extern int zpool_checkpoint(zpool_handle_t *); +extern int zpool_discard_checkpoint(zpool_handle_t *); /* * Basic handle manipulations. 
These functions do not create or destroy the diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 5af0e1e75..4ca9b254c 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -110,6 +110,9 @@ int lzc_channel_program_nosync(const char *, const char *, uint64_t, int lzc_sync(const char *, nvlist_t *, nvlist_t **); int lzc_reopen(const char *, boolean_t); +int lzc_pool_checkpoint(const char *); +int lzc_pool_checkpoint_discard(const char *); + #ifdef __cplusplus } #endif diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 50c21831d..d64133ceb 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -13,6 +13,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/bptree.h \ $(top_srcdir)/include/sys/bqueue.h \ $(top_srcdir)/include/sys/cityhash.h \ + $(top_srcdir)/include/sys/spa_checkpoint.h \ $(top_srcdir)/include/sys/dbuf.h \ $(top_srcdir)/include/sys/ddt.h \ $(top_srcdir)/include/sys/dmu.h \ diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 45259a7fc..d95c09bb9 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -366,6 +366,7 @@ typedef struct dmu_buf { #define DMU_POOL_REMOVING "com.delphix:removing" #define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" +#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" /* * Allocate an object from this objset. The range of object numbers diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index 8a346e902..86bc2dd87 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
*/ @@ -138,6 +138,7 @@ uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx); uint64_t dsl_dir_get_used(dsl_dir_t *dd); +uint64_t dsl_dir_get_compressed(dsl_dir_t *dd); uint64_t dsl_dir_get_quota(dsl_dir_t *dd); uint64_t dsl_dir_get_reservation(dsl_dir_t *dd); uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd); diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index c60e4bf9d..01870e867 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -38,6 +38,7 @@ #include <sys/bpobj.h> #include <sys/bptree.h> #include <sys/rrwlock.h> +#include <sys/dsl_synctask.h> #include <sys/mmp.h> #ifdef __cplusplus @@ -128,6 +129,7 @@ typedef struct dsl_pool { txg_list_t dp_dirty_zilogs; txg_list_t dp_dirty_dirs; txg_list_t dp_sync_tasks; + txg_list_t dp_early_sync_tasks; taskq_t *dp_sync_taskq; taskq_t *dp_zil_clean_taskq; @@ -151,7 +153,9 @@ dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg); int dsl_pool_sync_context(dsl_pool_t *dp); -uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); +uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy); +uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, + zfs_space_check_t slop_policy); void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); @@ -162,6 +166,8 @@ void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_mos_diduse_space(dsl_pool_t *dp, int64_t used, int64_t comp, int64_t uncomp); +void dsl_pool_ckpoint_diduse_space(dsl_pool_t *dp, + int64_t used, int64_t comp, int64_t uncomp); boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp); void dsl_pool_config_enter(dsl_pool_t *dp, void *tag); void 
dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag); diff --git a/include/sys/dsl_synctask.h b/include/sys/dsl_synctask.h index 6139303c1..da6c7a40d 100644 --- a/include/sys/dsl_synctask.h +++ b/include/sys/dsl_synctask.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_DSL_SYNCTASK_H @@ -57,14 +57,41 @@ typedef enum zfs_space_check { ZFS_SPACE_CHECK_RESERVED, /* - * No space check is performed. Only operations which we expect to - * result in a net reduction in space should use this - * (e.g. "zfs destroy". Setting quotas & reservations also uses - * this because it needs to circumvent the quota/reservation checks). + * Space check allows use of three quarters of the slop space. + * If there is less than 0.8% free space, the operation will + * fail. + */ + ZFS_SPACE_CHECK_EXTRA_RESERVED, + + /* + * In all cases "zfs destroy" is expected to result in a net + * reduction of space, except one. When the pool has a + * checkpoint, space freed by "zfs destroy" will not actually + * free anything internally. Thus, it starts failing after + * three quarters of the slop space is exceeded. + */ + ZFS_SPACE_CHECK_DESTROY = ZFS_SPACE_CHECK_EXTRA_RESERVED, + + /* + * A channel program can run a "zfs destroy" as part of its + * script and therefore has the same space_check policy when + * being evaluated. + */ + ZFS_SPACE_CHECK_ZCP_EVAL = ZFS_SPACE_CHECK_DESTROY, + + /* + * No space check is performed. This level of space check should + * be used cautiously as operations that use it can even run when + * 0.8% capacity is left for use. In this scenario, if there is a + * checkpoint, async destroys are suspended and any kind of freeing + * can potentially add space instead of freeing it. * * See also the comments above spa_slop_shift. 
*/ ZFS_SPACE_CHECK_NONE, + + ZFS_SPACE_CHECK_DISCARD_CHECKPOINT = ZFS_SPACE_CHECK_NONE, + } zfs_space_check_t; typedef struct dsl_sync_task { @@ -85,6 +112,10 @@ int dsl_sync_task(const char *, dsl_checkfunc_t *, dsl_syncfunc_t *, void *, int, zfs_space_check_t); void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *, void *, int, zfs_space_check_t, dmu_tx_t *); +int dsl_early_sync_task(const char *, dsl_checkfunc_t *, + dsl_syncfunc_t *, void *, int, zfs_space_check_t); +void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *, + void *, int, zfs_space_check_t, dmu_tx_t *); #ifdef __cplusplus } diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 870618ecb..0ee9b00bd 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -239,6 +239,7 @@ typedef enum { ZPOOL_PROP_TNAME, ZPOOL_PROP_MAXDNODESIZE, ZPOOL_PROP_MULTIHOST, + ZPOOL_PROP_CHECKPOINT, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -616,6 +617,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ @@ -752,6 +754,8 @@ typedef struct zpool_load_policy { "com.delphix:indirect_obsolete_sm" #define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \ "com.delphix:obsolete_counts_are_precise" +#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ + "com.delphix:pool_checkpoint_sm" /* * This is needed in userland to report the minimum necessary device size. 
@@ -861,6 +865,18 @@ typedef enum pool_scrub_cmd { POOL_SCRUB_FLAGS_END } pool_scrub_cmd_t; +typedef enum { + CS_NONE, + CS_CHECKPOINT_EXISTS, + CS_CHECKPOINT_DISCARDING, + CS_NUM_STATES +} checkpoint_state_t; + +typedef struct pool_checkpoint_stat { + uint64_t pcs_state; /* checkpoint_state_t */ + uint64_t pcs_start_time; /* time checkpoint/discard started */ + uint64_t pcs_space; /* checkpointed space */ +} pool_checkpoint_stat_t; /* * ZIO types. Needed to interpret vdev statistics below. @@ -958,7 +974,7 @@ typedef struct vdev_stat { uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_fragmentation; /* device fragmentation */ - + uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ } vdev_stat_t; /* @@ -1144,6 +1160,8 @@ typedef enum zfs_ioc { ZFS_IOC_UNLOAD_KEY, ZFS_IOC_CHANGE_KEY, ZFS_IOC_REMAP, + ZFS_IOC_POOL_CHECKPOINT, + ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* * Linux - 3/64 numbers reserved. @@ -1167,6 +1185,22 @@ typedef enum zfs_ioc { #define BLKZNAME _IOR(0x12, 125, char[ZFS_MAX_DATASET_NAME_LEN]) /* + * ZFS-specific error codes used for returning descriptive errors + * to the userland through zfs ioctls. + * + * The enum implicitly includes all the error codes from errno.h. + * New code should use and extend this enum for errors that are + * not described precisely by generic errno codes. + */ +typedef enum { + ZFS_ERR_CHECKPOINT_EXISTS = 1024, + ZFS_ERR_DISCARDING_CHECKPOINT, + ZFS_ERR_NO_CHECKPOINT, + ZFS_ERR_DEVRM_IN_PROGRESS, + ZFS_ERR_VDEV_TOO_BIG +} zfs_errno_t; + +/* * Internal SPA load state. Used by FMA diagnosis engine. */ typedef enum { @@ -1235,6 +1269,7 @@ typedef enum { #define ZFS_IMPORT_TEMP_NAME 0x10 #define ZFS_IMPORT_SKIP_MMP 0x20 #define ZFS_IMPORT_LOAD_KEYS 0x40 +#define ZFS_IMPORT_CHECKPOINT 0x80 /* * Channel program argument/return nvlist keys and defaults. 
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index fdcf6c71b..282ec231c 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_H @@ -70,8 +70,8 @@ int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); -void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, uint64_t); -void metaslab_free_dva(spa_t *, const dva_t *, uint64_t); +void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t); +void metaslab_free_dva(spa_t *, const dva_t *, boolean_t); void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *); void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 76f670a4d..dafd2b231 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H @@ -255,16 +255,16 @@ struct metaslab_group { /* * Each metaslab maintains a set of in-core trees to track metaslab - * operations. The in-core free tree (ms_tree) contains the list of + * operations. The in-core free tree (ms_allocatable) contains the list of * free segments which are eligible for allocation. As blocks are - * allocated, the allocated segment are removed from the ms_tree and - * added to a per txg allocation tree (ms_alloctree). 
As blocks are - * freed, they are added to the free tree (ms_freeingtree). These trees + * allocated, the allocated segment are removed from the ms_allocatable and + * added to a per txg allocation tree (ms_allocating). As blocks are + * freed, they are added to the free tree (ms_freeing). These trees * allow us to process all allocations and frees in syncing context * where it is safe to update the on-disk space maps. An additional set * of in-core trees is maintained to track deferred frees - * (ms_defertree). Once a block is freed it will move from the - * ms_freedtree to the ms_defertree. A deferred free means that a block + * (ms_defer). Once a block is freed it will move from the + * ms_freed to the ms_defer tree. A deferred free means that a block * has been freed but cannot be used by the pool until TXG_DEFER_SIZE * transactions groups later. For example, a block that is freed in txg * 50 will not be available for reallocation until txg 52 (50 + @@ -278,14 +278,14 @@ struct metaslab_group { * ALLOCATE * | * V - * free segment (ms_tree) -----> ms_alloctree[4] ----> (write to space map) + * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map) * ^ - * | ms_freeingtree <--- FREE - * | | - * | v - * | ms_freedtree - * | | - * +-------- ms_defertree[2] <-------+---------> (write to space map) + * | ms_freeing <--- FREE + * | | + * | v + * | ms_freed + * | | + * +-------- ms_defer[2] <-------+-------> (write to space map) * * * Each metaslab's space is tracked in a single space map in the MOS, @@ -296,8 +296,8 @@ struct metaslab_group { * To load the in-core free tree we read the space map from disk. This * object contains a series of alloc and free records that are combined * to make up the list of all free segments in this metaslab. These - * segments are represented in-core by the ms_tree and are stored in an - * AVL tree. + * segments are represented in-core by the ms_allocatable and are stored + * in an AVL tree. 
* * As the space map grows (as a result of the appends) it will * eventually become space-inefficient. When the metaslab's in-core @@ -317,20 +317,22 @@ struct metaslab { uint64_t ms_size; uint64_t ms_fragmentation; - range_tree_t *ms_alloctree[TXG_SIZE]; - range_tree_t *ms_tree; + range_tree_t *ms_allocating[TXG_SIZE]; + range_tree_t *ms_allocatable; /* * The following range trees are accessed only from syncing context. * ms_free*tree only have entries while syncing, and are empty * between syncs. */ - range_tree_t *ms_freeingtree; /* to free this syncing txg */ - range_tree_t *ms_freedtree; /* already freed this syncing txg */ - range_tree_t *ms_defertree[TXG_DEFER_SIZE]; + range_tree_t *ms_freeing; /* to free this syncing txg */ + range_tree_t *ms_freed; /* already freed this syncing txg */ + range_tree_t *ms_defer[TXG_DEFER_SIZE]; + range_tree_t *ms_checkpointing; /* to add to the checkpoint */ boolean_t ms_condensing; /* condensing? */ boolean_t ms_condense_wanted; + uint64_t ms_condense_checked_txg; /* * We must hold both ms_lock and ms_group->mg_lock in order to @@ -356,11 +358,12 @@ struct metaslab { /* * The metaslab block allocators can optionally use a size-ordered * range tree and/or an array of LBAs. Not all allocators use - * this functionality. The ms_size_tree should always contain the - * same number of segments as the ms_tree. The only difference - * is that the ms_size_tree is ordered by segment sizes. + * this functionality. The ms_allocatable_by_size should always + * contain the same number of segments as the ms_allocatable. The + * only difference is that the ms_allocatable_by_size is ordered by + * segment sizes. 
*/ - avl_tree_t ms_size_tree; + avl_tree_t ms_allocatable_by_size; uint64_t ms_lbas[MAX_LBAS]; metaslab_group_t *ms_group; /* metaslab group */ diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index 9eef762de..7f79786f5 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_RANGE_TREE_H diff --git a/include/sys/spa.h b/include/sys/spa.h index e8578be9a..b6483e11b 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -747,6 +747,8 @@ extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); +extern int spa_checkpoint(const char *pool); +extern int spa_checkpoint_discard(const char *pool); extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce); extern int spa_reset(char *pool); @@ -965,6 +967,7 @@ extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_get_dspace(spa_t *spa); +extern uint64_t spa_get_checkpoint_space(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); extern void spa_update_dspace(spa_t *spa); extern uint64_t spa_version(spa_t *spa); @@ -1016,6 +1019,10 @@ extern boolean_t spa_writeable(spa_t *spa); extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern int spa_maxdnodesize(spa_t *spa); +extern boolean_t spa_has_checkpoint(spa_t *spa); +extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); +extern boolean_t spa_suspend_async_destroy(spa_t *spa); +extern uint64_t spa_min_claim_txg(spa_t *spa); extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp); extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t 
*dva, const blkptr_t *bp); @@ -1027,6 +1034,7 @@ extern uint64_t spa_get_last_removal_txg(spa_t *spa); extern boolean_t spa_trust_config(spa_t *spa); extern uint64_t spa_missing_tvds_allowed(spa_t *spa); extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); +extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa); extern unsigned long spa_get_hostid(void); diff --git a/include/sys/spa_checkpoint.h b/include/sys/spa_checkpoint.h new file mode 100644 index 000000000..a5c856014 --- /dev/null +++ b/include/sys/spa_checkpoint.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_SPA_CHECKPOINT_H +#define _SYS_SPA_CHECKPOINT_H + +#include <sys/zthr.h> + +typedef struct spa_checkpoint_info { + uint64_t sci_timestamp; /* when checkpointed uberblock was synced */ + uint64_t sci_dspace; /* disk space used by checkpoint in bytes */ +} spa_checkpoint_info_t; + +int spa_checkpoint(const char *); +int spa_checkpoint_discard(const char *); + +boolean_t spa_checkpoint_discard_thread_check(void *, zthr_t *); +int spa_checkpoint_discard_thread(void *, zthr_t *); + +int spa_checkpoint_get_stats(spa_t *, pool_checkpoint_stat_t *); + +#endif /* _SYS_SPA_CHECKPOINT_H */ diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 6abb63157..8d2a31961 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -32,6 +32,7 @@ #define _SYS_SPA_IMPL_H #include <sys/spa.h> +#include <sys/spa_checkpoint.h> #include <sys/vdev.h> #include <sys/vdev_removal.h> #include <sys/metaslab.h> @@ -284,6 +285,10 @@ struct spa { spa_condensing_indirect_t *spa_condensing_indirect; zthr_t *spa_condense_zthr; /* zthr doing condense. */ + uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ + spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ + zthr_t *spa_checkpoint_discard_zthr; + char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ int spa_last_open_failed; /* error if last open failed */ diff --git a/include/sys/space_map.h b/include/sys/space_map.h index 457300d05..98b87269c 100644 --- a/include/sys/space_map.h +++ b/include/sys/space_map.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 
*/ #ifndef _SYS_SPACE_MAP_H @@ -57,7 +57,7 @@ extern "C" { typedef struct space_map_phys { uint64_t smp_object; /* on-disk space map object */ uint64_t smp_objsize; /* size of the object */ - uint64_t smp_alloc; /* space allocated from the map */ + int64_t smp_alloc; /* space allocated from the map */ uint64_t smp_pad[5]; /* reserved */ /* @@ -82,7 +82,7 @@ typedef struct space_map { uint64_t sm_size; /* size of map */ uint8_t sm_shift; /* unit shift */ uint64_t sm_length; /* synced length */ - uint64_t sm_alloc; /* synced space allocated */ + int64_t sm_alloc; /* synced space allocated */ objset_t *sm_os; /* objset for this map */ uint64_t sm_object; /* object id for this map */ uint32_t sm_blksz; /* block size for space map */ @@ -140,6 +140,8 @@ typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size, int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg); +int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, + dmu_tx_t *tx); void space_map_histogram_clear(space_map_t *sm); void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, @@ -153,8 +155,8 @@ uint64_t space_map_length(space_map_t *sm); void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, dmu_tx_t *tx); -void space_map_truncate(space_map_t *sm, dmu_tx_t *tx); -uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx); +void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx); +uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx); void space_map_free(space_map_t *sm, dmu_tx_t *tx); void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx); diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index 08eeabdda..113df7c61 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
- * Copyright (c) 2017 by Delphix. All rights reserved. + * Copyright (c) 2016, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_UBERBLOCK_IMPL_H @@ -61,6 +61,28 @@ struct uberblock { uint64_t ub_mmp_magic; /* MMP_MAGIC */ uint64_t ub_mmp_delay; /* nanosec since last MMP write */ uint64_t ub_mmp_seq; /* reserved for sequence number */ + + /* + * ub_checkpoint_txg indicates two things about the current uberblock: + * + * 1] If it is not zero then this uberblock is a checkpoint. If it is + * zero, then this uberblock is not a checkpoint. + * + * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is + * the ub_txg that the uberblock had at the time we moved it to + * the MOS config. + * + * The field is set when we checkpoint the uberblock and continues to + * hold that value even after we've rewound (unlike the ub_txg that + * is reset to a higher value). + * + * Besides checks used to determine whether we are reopening the + * pool from a checkpointed uberblock [see spa_ld_select_uberblock()], + * the value of the field is used to determine which ZIL blocks have + * been allocated according to the ms_sm when we are rewinding to a + * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then + * the ZIL block is not allocated [see uses of spa_min_claim_txg()]. + */ uint64_t ub_checkpoint_txg; }; diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 161e30ae7..6d31d61b5 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 
*/ #ifndef _SYS_VDEV_H @@ -81,7 +81,7 @@ extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx); extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx); extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx); extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, - uint64_t size, uint64_t txg); + uint64_t size); extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev, uint64_t offset, uint64_t size, dmu_tx_t *tx); @@ -122,6 +122,7 @@ extern boolean_t vdev_readable(vdev_t *vd); extern boolean_t vdev_writeable(vdev_t *vd); extern boolean_t vdev_allocatable(vdev_t *vd); extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); +extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd); extern void vdev_cache_init(vdev_t *vd); extern void vdev_cache_fini(vdev_t *vd); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index e28994613..c22087307 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_VDEV_IMPL_H @@ -236,6 +236,9 @@ struct vdev { kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */ uint64_t vdev_top_zap; + /* pool checkpoint related */ + space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ + /* * Values stored in the config for an indirect or removing vdev. */ @@ -469,6 +472,7 @@ extern void vdev_set_min_asize(vdev_t *vd); /* * Global variables */ +extern int vdev_standard_sm_blksz; /* zdb uses this tunable, so it must be declared here to make lint happy. 
*/ extern int zfs_vdev_cache_size; @@ -481,6 +485,11 @@ extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx); extern int vdev_obsolete_sm_object(vdev_t *vd); extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd); +/* + * Other miscellaneous functions + */ +int vdev_checkpoint_sm_object(vdev_t *vd); + #ifdef __cplusplus } #endif diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h index bec2cea33..3962237af 100644 --- a/include/sys/vdev_removal.h +++ b/include/sys/vdev_removal.h @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2014, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_VDEV_REMOVAL_H @@ -79,7 +79,7 @@ extern void spa_condense_fini(spa_t *); extern void spa_start_indirect_condensing_thread(spa_t *); extern void spa_vdev_condense_suspend(spa_t *); extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t); -extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t); +extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t); extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *); extern void svr_sync(spa_t *spa, dmu_tx_t *tx); extern void spa_vdev_remove_suspend(spa_t *); diff --git a/include/sys/zio.h b/include/sys/zio.h index 25c12fbcc..6c0c682a8 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -566,7 +566,6 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog); -extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp); extern void zio_flush(zio_t *zio, vdev_t *vd); extern void zio_shrink(zio_t *zio, uint64_t size); diff --git a/include/sys/zthr.h b/include/sys/zthr.h index 6bfb6b6c0..62da2eea8 100644 --- a/include/sys/zthr.h +++ b/include/sys/zthr.h @@ -13,7 +13,6 @@ * CDDL HEADER END */ - /* * Copyright (c) 2017 by Delphix. All rights reserved. 
*/ diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 13670c8e5..c59b800d3 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ @@ -61,6 +61,7 @@ typedef enum spa_feature { SPA_FEATURE_PROJECT_QUOTA, SPA_FEATURE_DEVICE_REMOVAL, SPA_FEATURE_OBSOLETE_COUNTS, + SPA_FEATURE_POOL_CHECKPOINT, SPA_FEATURES } spa_feature_t; |