aboutsummaryrefslogtreecommitdiffstats
path: root/include/sys
diff options
context:
space:
mode:
author Sara Hartse <[email protected]> 2019-07-26 10:54:14 -0700
committer Brian Behlendorf <[email protected]> 2019-07-26 10:54:14 -0700
commit 37f03da8ba6e1ab074b503e1dd63bfa7199d0537 (patch)
tree 987b03643c33cd43b246a20aea28b8750f7b4ee6 /include/sys
parent d274ac54609894d00a49c0a0da89abd3a7f3998d (diff)
Fast Clone Deletion
Deleting a clone requires finding blocks that are clone-only, not shared with the snapshot. This was done by traversing the entire block tree, which results in a large performance penalty for sparsely written clones. This new method keeps track of clone blocks when they are modified in a "Livelist" so that, when it’s time to delete, the clone-specific blocks are already at hand. We see performance improvements because now deletion work is proportional to the number of clone-modified blocks, not the size of the original dataset. Reviewed-by: Sean Eric Fagan <[email protected]> Reviewed-by: Matt Ahrens <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Serapheim Dimitropoulos <[email protected]> Signed-off-by: Sara Hartse <[email protected]> Closes #8416
Diffstat (limited to 'include/sys')
-rw-r--r-- include/sys/bplist.h 2
-rw-r--r-- include/sys/bpobj.h 19
-rw-r--r-- include/sys/dmu.h 2
-rw-r--r-- include/sys/dmu_objset.h 2
-rw-r--r-- include/sys/dsl_deadlist.h 31
-rw-r--r-- include/sys/dsl_destroy.h 1
-rw-r--r-- include/sys/dsl_dir.h 14
-rw-r--r-- include/sys/dsl_pool.h 3
-rw-r--r-- include/sys/spa.h 9
-rw-r--r-- include/sys/spa_impl.h 6
-rw-r--r-- include/sys/zthr.h 2
11 files changed, 81 insertions, 10 deletions
diff --git a/include/sys/bplist.h b/include/sys/bplist.h
index 471be9047..f8deaf843 100644
--- a/include/sys/bplist.h
+++ b/include/sys/bplist.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018 by Delphix. All rights reserved.
*/
#ifndef _SYS_BPLIST_H
@@ -49,6 +50,7 @@ void bplist_destroy(bplist_t *bpl);
void bplist_append(bplist_t *bpl, const blkptr_t *bp);
void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
void *arg, dmu_tx_t *tx);
+void bplist_clear(bplist_t *bpl);
#ifdef __cplusplus
}
diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h
index d425e239f..16e403526 100644
--- a/include/sys/bpobj.h
+++ b/include/sys/bpobj.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2015, 2019 by Delphix. All rights reserved.
*/
#ifndef _SYS_BPOBJ_H
@@ -31,6 +31,7 @@
#include <sys/txg.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
+#include <sys/bplist.h>
#ifdef __cplusplus
extern "C" {
@@ -48,10 +49,12 @@ typedef struct bpobj_phys {
uint64_t bpo_uncomp;
uint64_t bpo_subobjs;
uint64_t bpo_num_subobjs;
+ uint64_t bpo_num_freed;
} bpobj_phys_t;
#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t))
#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t))
+#define BPOBJ_SIZE_V2 (6 * sizeof (uint64_t))
typedef struct bpobj {
kmutex_t bpo_lock;
@@ -60,12 +63,14 @@ typedef struct bpobj {
int bpo_epb;
uint8_t bpo_havecomp;
uint8_t bpo_havesubobj;
+ uint8_t bpo_havefreed;
bpobj_phys_t *bpo_phys;
dmu_buf_t *bpo_dbuf;
dmu_buf_t *bpo_cached_dbuf;
} bpobj_t;
-typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx);
uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
@@ -77,10 +82,13 @@ void bpobj_close(bpobj_t *bpo);
boolean_t bpobj_is_open(const bpobj_t *bpo);
int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
-int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
+int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, uint64_t *);
+int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func,
+ void *arg, int64_t start);
void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
-void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
+void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx);
int bpobj_space(bpobj_t *bpo,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
@@ -88,6 +96,9 @@ int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
boolean_t bpobj_is_empty(bpobj_t *bpo);
+int bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 65da78eb5..62de1eaf5 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -383,6 +383,7 @@ typedef struct dmu_buf {
#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
#define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap"
+#define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones"
/*
* Allocate an object from this objset. The range of object numbers
@@ -1003,6 +1004,7 @@ extern uint64_t dmu_objset_id(objset_t *os);
extern uint64_t dmu_objset_dnodesize(objset_t *os);
extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
+extern int dmu_objset_blksize(objset_t *os);
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val);
diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h
index c0650bcde..9b6614e98 100644
--- a/include/sys/dmu_objset.h
+++ b/include/sys/dmu_objset.h
@@ -126,7 +126,7 @@ struct objset {
zfs_cache_type_t os_secondary_cache;
zfs_sync_type_t os_sync;
zfs_redundant_metadata_type_t os_redundant_metadata;
- int os_recordsize;
+ uint64_t os_recordsize;
/*
* The next four values are used as a cache of whatever's on disk, and
* are initialized the first time these properties are queried. Before
diff --git a/include/sys/dsl_deadlist.h b/include/sys/dsl_deadlist.h
index 08f38233d..bb8248a66 100644
--- a/include/sys/dsl_deadlist.h
+++ b/include/sys/dsl_deadlist.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
*/
#ifndef _SYS_DSL_DEADLIST_H
@@ -28,12 +28,14 @@
#include <sys/bpobj.h>
#include <sys/zfs_context.h>
+#include <sys/zthr.h>
#ifdef __cplusplus
extern "C" {
#endif
struct dmu_buf;
+struct dsl_pool;
struct dsl_dataset;
typedef struct dsl_deadlist_phys {
@@ -63,13 +65,34 @@ typedef struct dsl_deadlist_entry {
bpobj_t dle_bpobj;
} dsl_deadlist_entry_t;
+typedef struct livelist_condense_entry {
+ struct dsl_dataset *ds;
+ dsl_deadlist_entry_t *first;
+ dsl_deadlist_entry_t *next;
+ boolean_t syncing;
+ boolean_t cancelled;
+} livelist_condense_entry_t;
+
+extern unsigned long zfs_livelist_max_entries;
+extern int zfs_livelist_min_percent_shared;
+
+typedef int deadlist_iter_t(void *args, dsl_deadlist_entry_t *dle);
+
void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
void dsl_deadlist_close(dsl_deadlist_t *dl);
+void dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *arg);
uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
-void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
+void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp,
+ boolean_t free, dmu_tx_t *tx);
+int dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+int dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+void dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg,
+dmu_tx_t *tx);
+dsl_deadlist_entry_t *dsl_deadlist_first(dsl_deadlist_t *dl);
+dsl_deadlist_entry_t *dsl_deadlist_last(dsl_deadlist_t *dl);
uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
uint64_t mrs_obj, dmu_tx_t *tx);
void dsl_deadlist_space(dsl_deadlist_t *dl,
@@ -81,6 +104,10 @@ void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
dmu_tx_t *tx);
boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl);
+int dsl_process_sub_livelist(bpobj_t *bpobj, struct bplist *to_free,
+ zthr_t *t, uint64_t *size);
+void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
+ dmu_tx_t *tx);
#ifdef __cplusplus
}
diff --git a/include/sys/dsl_destroy.h b/include/sys/dsl_destroy.h
index c4dbea26b..208d75bac 100644
--- a/include/sys/dsl_destroy.h
+++ b/include/sys/dsl_destroy.h
@@ -33,6 +33,7 @@ extern "C" {
struct nvlist;
struct dsl_dataset;
+struct dsl_pool;
struct dmu_tx;
int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t,
diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h
index 08d1fcb37..bb6921027 100644
--- a/include/sys/dsl_dir.h
+++ b/include/sys/dsl_dir.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@@ -29,18 +29,20 @@
#define _SYS_DSL_DIR_H
#include <sys/dmu.h>
+#include <sys/dsl_deadlist.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/refcount.h>
#include <sys/zfs_context.h>
#include <sys/dsl_crypt.h>
+#include <sys/bplist.h>
#ifdef __cplusplus
extern "C" {
#endif
struct dsl_dataset;
-
+struct zthr;
/*
* DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
* They should be of the format <reverse-dns>:<field>.
@@ -49,6 +51,7 @@ struct dsl_dataset;
#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count"
#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count"
#define DD_FIELD_CRYPTO_KEY_OBJ "com.datto:crypto_key_obj"
+#define DD_FIELD_LIVELIST "com.delphix:livelist"
typedef enum dd_used {
DD_USED_HEAD,
@@ -114,6 +117,10 @@ struct dsl_dir {
/* amount of space we expect to write; == amount of dirty data */
int64_t dd_space_towrite[TXG_SIZE];
+ dsl_deadlist_t dd_livelist;
+ bplist_t dd_pending_frees;
+ bplist_t dd_pending_allocs;
+
/* protected by dd_lock; keep at end of struct for better locality */
char dd_myname[ZFS_MAX_DATASET_NAME_LEN];
};
@@ -182,6 +189,9 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
dmu_tx_t *tx);
void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
+void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj);
+void dsl_dir_livelist_close(dsl_dir_t *dd);
+void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total);
/* internal reserved dir name */
#define MOS_DIR_NAME "$MOS"
diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h
index 63ba3509a..172ecdc46 100644
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/
@@ -54,6 +54,7 @@ struct dsl_pool;
struct dmu_tx;
struct dsl_scan;
struct dsl_crypto_params;
+struct dsl_deadlist;
extern unsigned long zfs_dirty_data_max;
extern unsigned long zfs_dirty_data_max_max;
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 50ca15be5..e64313783 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -63,6 +63,8 @@ typedef struct ddt ddt_t;
typedef struct ddt_entry ddt_entry_t;
typedef struct zbookmark_phys zbookmark_phys_t;
+struct bpobj;
+struct bplist;
struct dsl_pool;
struct dsl_dataset;
struct dsl_crypto_params;
@@ -532,6 +534,9 @@ _NOTE(CONSTCOND) } while (0)
#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1)
#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+#define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1)
+#define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x)
+
#define BP_PHYSICAL_BIRTH(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
@@ -654,6 +659,7 @@ _NOTE(CONSTCOND) } while (0)
* 'func' is either snprintf() or mdb_snprintf().
* 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
*/
+
#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
{ \
static const char *copyname[] = \
@@ -804,6 +810,8 @@ extern spa_t *spa_inject_addref(char *pool);
extern void spa_inject_delref(spa_t *spa);
extern void spa_scan_stat_init(spa_t *spa);
extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
+extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
@@ -1131,6 +1139,7 @@ extern uint64_t spa_total_metaslabs(spa_t *spa);
extern boolean_t spa_multihost(spa_t *spa);
extern unsigned long spa_get_hostid(void);
extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
+extern boolean_t spa_livelist_delete_check(spa_t *spa);
extern int spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 929144017..ebe14dae4 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -49,6 +49,7 @@
#include <sys/dsl_crypt.h>
#include <sys/zfeature.h>
#include <sys/zthr.h>
+#include <sys/dsl_deadlist.h>
#include <zfeature_common.h>
#ifdef __cplusplus
@@ -317,6 +318,11 @@ struct spa {
list_t spa_log_summary;
uint64_t spa_log_flushall_txg;
+ zthr_t *spa_livelist_delete_zthr; /* deleting livelists */
+ zthr_t *spa_livelist_condense_zthr; /* condensing livelists */
+ uint64_t spa_livelists_to_delete; /* set of livelists to free */
+ livelist_condense_entry_t spa_to_condense; /* next to condense */
+
char *spa_root; /* alternate root directory */
uint64_t spa_ena; /* spa-wide ereport ENA */
int spa_last_open_failed; /* error if last open failed */
diff --git a/include/sys/zthr.h b/include/sys/zthr.h
index 33c218ec4..0a05f5225 100644
--- a/include/sys/zthr.h
+++ b/include/sys/zthr.h
@@ -33,7 +33,9 @@ extern void zthr_destroy(zthr_t *t);
extern void zthr_wakeup(zthr_t *t);
extern void zthr_cancel(zthr_t *t);
extern void zthr_resume(zthr_t *t);
+extern void zthr_wait_cycle_done(zthr_t *t);
extern boolean_t zthr_iscancelled(zthr_t *t);
+extern boolean_t zthr_has_waiters(zthr_t *t);
#endif /* _SYS_ZTHR_H */