diff options
author | Sara Hartse <[email protected]> | 2019-07-26 10:54:14 -0700 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2019-07-26 10:54:14 -0700 |
commit | 37f03da8ba6e1ab074b503e1dd63bfa7199d0537 (patch) | |
tree | 987b03643c33cd43b246a20aea28b8750f7b4ee6 /include/sys | |
parent | d274ac54609894d00a49c0a0da89abd3a7f3998d (diff) |
Fast Clone Deletion
Deleting a clone requires finding the blocks that are clone-only, not shared
with the snapshot. This was previously done by traversing the entire block
tree, which results in a large performance penalty for sparsely
written clones.
This new method keeps track of clone blocks when they are
modified in a "Livelist" so that, when it’s time to delete,
the clone-specific blocks are already at hand.
We see performance improvements because now deletion work is
proportional to the number of clone-modified blocks, not the size
of the original dataset.
Reviewed-by: Sean Eric Fagan <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Serapheim Dimitropoulos <[email protected]>
Signed-off-by: Sara Hartse <[email protected]>
Closes #8416
Diffstat (limited to 'include/sys')
-rw-r--r-- | include/sys/bplist.h | 2 | ||||
-rw-r--r-- | include/sys/bpobj.h | 19 | ||||
-rw-r--r-- | include/sys/dmu.h | 2 | ||||
-rw-r--r-- | include/sys/dmu_objset.h | 2 | ||||
-rw-r--r-- | include/sys/dsl_deadlist.h | 31 | ||||
-rw-r--r-- | include/sys/dsl_destroy.h | 1 | ||||
-rw-r--r-- | include/sys/dsl_dir.h | 14 | ||||
-rw-r--r-- | include/sys/dsl_pool.h | 3 | ||||
-rw-r--r-- | include/sys/spa.h | 9 | ||||
-rw-r--r-- | include/sys/spa_impl.h | 6 | ||||
-rw-r--r-- | include/sys/zthr.h | 2 |
11 files changed, 81 insertions, 10 deletions
diff --git a/include/sys/bplist.h b/include/sys/bplist.h index 471be9047..f8deaf843 100644 --- a/include/sys/bplist.h +++ b/include/sys/bplist.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018 by Delphix. All rights reserved. */ #ifndef _SYS_BPLIST_H @@ -49,6 +50,7 @@ void bplist_destroy(bplist_t *bpl); void bplist_append(bplist_t *bpl, const blkptr_t *bp); void bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx); +void bplist_clear(bplist_t *bpl); #ifdef __cplusplus } diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h index d425e239f..16e403526 100644 --- a/include/sys/bpobj.h +++ b/include/sys/bpobj.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2015, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_BPOBJ_H @@ -31,6 +31,7 @@ #include <sys/txg.h> #include <sys/zio.h> #include <sys/zfs_context.h> +#include <sys/bplist.h> #ifdef __cplusplus extern "C" { @@ -48,10 +49,12 @@ typedef struct bpobj_phys { uint64_t bpo_uncomp; uint64_t bpo_subobjs; uint64_t bpo_num_subobjs; + uint64_t bpo_num_freed; } bpobj_phys_t; #define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t)) #define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t)) +#define BPOBJ_SIZE_V2 (6 * sizeof (uint64_t)) typedef struct bpobj { kmutex_t bpo_lock; @@ -60,12 +63,14 @@ typedef struct bpobj { int bpo_epb; uint8_t bpo_havecomp; uint8_t bpo_havesubobj; + uint8_t bpo_havefreed; bpobj_phys_t *bpo_phys; dmu_buf_t *bpo_dbuf; dmu_buf_t *bpo_cached_dbuf; } bpobj_t; -typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); +typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx); uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx); @@ -77,10 +82,13 @@ void 
bpobj_close(bpobj_t *bpo); boolean_t bpobj_is_open(const bpobj_t *bpo); int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); -int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *); +int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, uint64_t *); +int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, + void *arg, int64_t start); void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); -void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx); +void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx); int bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); @@ -88,6 +96,9 @@ int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); boolean_t bpobj_is_empty(bpobj_t *bpo); +int bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx); + #ifdef __cplusplus } #endif diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 65da78eb5..62de1eaf5 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -383,6 +383,7 @@ typedef struct dmu_buf { #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" #define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap" +#define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones" /* * Allocate an object from this objset. 
The range of object numbers @@ -1003,6 +1004,7 @@ extern uint64_t dmu_objset_id(objset_t *os); extern uint64_t dmu_objset_dnodesize(objset_t *os); extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); +extern int dmu_objset_blksize(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val); diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index c0650bcde..9b6614e98 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -126,7 +126,7 @@ struct objset { zfs_cache_type_t os_secondary_cache; zfs_sync_type_t os_sync; zfs_redundant_metadata_type_t os_redundant_metadata; - int os_recordsize; + uint64_t os_recordsize; /* * The next four values are used as a cache of whatever's on disk, and * are initialized the first time these properties are queried. Before diff --git a/include/sys/dsl_deadlist.h b/include/sys/dsl_deadlist.h index 08f38233d..bb8248a66 100644 --- a/include/sys/dsl_deadlist.h +++ b/include/sys/dsl_deadlist.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright (c) 2018, 2019 by Delphix. All rights reserved. 
*/ #ifndef _SYS_DSL_DEADLIST_H @@ -28,12 +28,14 @@ #include <sys/bpobj.h> #include <sys/zfs_context.h> +#include <sys/zthr.h> #ifdef __cplusplus extern "C" { #endif struct dmu_buf; +struct dsl_pool; struct dsl_dataset; typedef struct dsl_deadlist_phys { @@ -63,13 +65,34 @@ typedef struct dsl_deadlist_entry { bpobj_t dle_bpobj; } dsl_deadlist_entry_t; +typedef struct livelist_condense_entry { + struct dsl_dataset *ds; + dsl_deadlist_entry_t *first; + dsl_deadlist_entry_t *next; + boolean_t syncing; + boolean_t cancelled; +} livelist_condense_entry_t; + +extern unsigned long zfs_livelist_max_entries; +extern int zfs_livelist_min_percent_shared; + +typedef int deadlist_iter_t(void *args, dsl_deadlist_entry_t *dle); + void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object); void dsl_deadlist_close(dsl_deadlist_t *dl); +void dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *arg); uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx); void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx); -void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx); +void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, + boolean_t free, dmu_tx_t *tx); +int dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); +int dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); +void dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, +dmu_tx_t *tx); +dsl_deadlist_entry_t *dsl_deadlist_first(dsl_deadlist_t *dl); +dsl_deadlist_entry_t *dsl_deadlist_last(dsl_deadlist_t *dl); uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, uint64_t mrs_obj, dmu_tx_t *tx); void dsl_deadlist_space(dsl_deadlist_t *dl, @@ -81,6 +104,10 @@ void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx); void 
dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dmu_tx_t *tx); boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl); +int dsl_process_sub_livelist(bpobj_t *bpobj, struct bplist *to_free, + zthr_t *t, uint64_t *size); +void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, + dmu_tx_t *tx); #ifdef __cplusplus } diff --git a/include/sys/dsl_destroy.h b/include/sys/dsl_destroy.h index c4dbea26b..208d75bac 100644 --- a/include/sys/dsl_destroy.h +++ b/include/sys/dsl_destroy.h @@ -33,6 +33,7 @@ extern "C" { struct nvlist; struct dsl_dataset; +struct dsl_pool; struct dmu_tx; int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t, diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index 08d1fcb37..bb6921027 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -29,18 +29,20 @@ #define _SYS_DSL_DIR_H #include <sys/dmu.h> +#include <sys/dsl_deadlist.h> #include <sys/dsl_pool.h> #include <sys/dsl_synctask.h> #include <sys/refcount.h> #include <sys/zfs_context.h> #include <sys/dsl_crypt.h> +#include <sys/bplist.h> #ifdef __cplusplus extern "C" { #endif struct dsl_dataset; - +struct zthr; /* * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object. * They should be of the format <reverse-dns>:<field>. 
@@ -49,6 +51,7 @@ struct dsl_dataset; #define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count" #define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" #define DD_FIELD_CRYPTO_KEY_OBJ "com.datto:crypto_key_obj" +#define DD_FIELD_LIVELIST "com.delphix:livelist" typedef enum dd_used { DD_USED_HEAD, @@ -114,6 +117,10 @@ struct dsl_dir { /* amount of space we expect to write; == amount of dirty data */ int64_t dd_space_towrite[TXG_SIZE]; + dsl_deadlist_t dd_livelist; + bplist_t dd_pending_frees; + bplist_t dd_pending_allocs; + /* protected by dd_lock; keep at end of struct for better locality */ char dd_myname[ZFS_MAX_DATASET_NAME_LEN]; }; @@ -182,6 +189,9 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx); void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx); boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); +void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj); +void dsl_dir_livelist_close(dsl_dir_t *dd); +void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 63ba3509a..172ecdc46 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
*/ @@ -54,6 +54,7 @@ struct dsl_pool; struct dmu_tx; struct dsl_scan; struct dsl_crypto_params; +struct dsl_deadlist; extern unsigned long zfs_dirty_data_max; extern unsigned long zfs_dirty_data_max_max; diff --git a/include/sys/spa.h b/include/sys/spa.h index 50ca15be5..e64313783 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -63,6 +63,8 @@ typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; typedef struct zbookmark_phys zbookmark_phys_t; +struct bpobj; +struct bplist; struct dsl_pool; struct dsl_dataset; struct dsl_crypto_params; @@ -532,6 +534,9 @@ _NOTE(CONSTCOND) } while (0) #define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) #define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) +#define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1) +#define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x) + #define BP_PHYSICAL_BIRTH(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) @@ -654,6 +659,7 @@ _NOTE(CONSTCOND) } while (0) * 'func' is either snprintf() or mdb_snprintf(). * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. 
*/ + #define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ { \ static const char *copyname[] = \ @@ -804,6 +810,8 @@ extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); extern void spa_scan_stat_init(spa_t *spa); extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); +extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); +extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 @@ -1131,6 +1139,7 @@ extern uint64_t spa_total_metaslabs(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa); extern unsigned long spa_get_hostid(void); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); +extern boolean_t spa_livelist_delete_check(spa_t *spa); extern int spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 929144017..ebe14dae4 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -49,6 +49,7 @@ #include <sys/dsl_crypt.h> #include <sys/zfeature.h> #include <sys/zthr.h> +#include <sys/dsl_deadlist.h> #include <zfeature_common.h> #ifdef __cplusplus @@ -317,6 +318,11 @@ struct spa { list_t spa_log_summary; uint64_t spa_log_flushall_txg; + zthr_t *spa_livelist_delete_zthr; /* deleting livelists */ + zthr_t *spa_livelist_condense_zthr; /* condensing livelists */ + uint64_t spa_livelists_to_delete; /* set of livelists to free */ + livelist_condense_entry_t spa_to_condense; /* next to condense */ + char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ int spa_last_open_failed; /* error if last open failed */ diff --git a/include/sys/zthr.h b/include/sys/zthr.h index 33c218ec4..0a05f5225 100644 --- a/include/sys/zthr.h +++ b/include/sys/zthr.h @@ -33,7 +33,9 @@ extern void zthr_destroy(zthr_t *t); extern void zthr_wakeup(zthr_t *t); 
extern void zthr_cancel(zthr_t *t); extern void zthr_resume(zthr_t *t); +extern void zthr_wait_cycle_done(zthr_t *t); extern boolean_t zthr_iscancelled(zthr_t *t); +extern boolean_t zthr_has_waiters(zthr_t *t); #endif /* _SYS_ZTHR_H */ |