author    Tom Caputi <[email protected]>        2017-11-15 20:27:01 -0500
committer Brian Behlendorf <[email protected]>  2017-11-15 17:27:01 -0800
commit    d4a72f23863382bdf6d0ae33196f5b5decbc48fd (patch)
tree      1084ea930b9a1ef46e58d1757943ab3ad66c22c4 /include/sys
parent    e301113c17673a290098850830cf2e6d1a1fcbe3 (diff)
Sequential scrub and resilvers
Currently, scrubs and resilvers can take an extremely long time to
complete. This is largely due to the fact that zfs scans process pools
in logical order, as determined by each block's bookmark. This makes
sense from a simplicity perspective, but blocks in zfs are often
scattered randomly across disks, particularly due to zfs's
copy-on-write mechanisms.

This patch improves performance by splitting scrubs and resilvers into
a metadata scanning phase and an IO issuing phase. The metadata scan
reads through the structure of the pool and gathers an in-memory queue
of I/Os, sorted by size and offset on disk. The issuing phase will then
issue the scrub I/Os as sequentially as possible, greatly improving
performance.

This patch also updates and cleans up some of the scan code which has
not been updated in several years.

Reviewed-by: Brian Behlendorf <[email protected]>
Authored-by: Saso Kiselkov <[email protected]>
Authored-by: Alek Pinchuk <[email protected]>
Authored-by: Tom Caputi <[email protected]>
Signed-off-by: Tom Caputi <[email protected]>
Closes #3625
Closes #6256
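The effect of the two-phase design can be illustrated with a minimal
userspace sketch in C. Everything below is a hypothetical illustration,
not code from the patch; the actual implementation keeps per-vdev
AVL-sorted queues in dsl_scan.c rather than a flat array:

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

/* One extent discovered by the metadata scan (hypothetical type). */
typedef struct scan_extent {
	uint64_t se_offset;	/* offset on disk */
	uint64_t se_size;	/* length in bytes */
} scan_extent_t;

/* Sort by on-disk offset so reads are issued sequentially. */
static int
extent_cmp(const void *a, const void *b)
{
	const scan_extent_t *ea = a, *eb = b;

	if (ea->se_offset < eb->se_offset)
		return (-1);
	return (ea->se_offset > eb->se_offset);
}

int
main(void)
{
	/* Phase 1 (metadata scan) would gather these in bookmark order. */
	scan_extent_t q[] = {
		{ 1 << 30, 4096 }, { 4096, 131072 }, { 1 << 20, 8192 }
	};
	size_t n = sizeof (q) / sizeof (q[0]);

	/* Phase 2 sorts by disk offset and issues sequentially. */
	qsort(q, n, sizeof (q[0]), extent_cmp);
	for (size_t i = 0; i < n; i++)
		printf("issue read: off=%llu len=%llu\n",
		    (unsigned long long)q[i].se_offset,
		    (unsigned long long)q[i].se_size);
	return (0);
}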
Diffstat (limited to 'include/sys')
-rw-r--r--  include/sys/arc.h        | 39
-rw-r--r--  include/sys/dsl_pool.h   |  1
-rw-r--r--  include/sys/dsl_scan.h   | 47
-rw-r--r--  include/sys/fs/zfs.h     |  6
-rw-r--r--  include/sys/range_tree.h | 23
-rw-r--r--  include/sys/spa_impl.h   |  5
-rw-r--r--  include/sys/vdev_impl.h  |  7
7 files changed, 100 insertions, 28 deletions
diff --git a/include/sys/arc.h b/include/sys/arc.h
index 7428a1629..0e7a85188 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -66,11 +66,11 @@ typedef struct arc_prune arc_prune_t;
* while transforming data into its desired format - specifically, when
* decrypting, the key may not be present, or the HMAC may not be correct
* which signifies deliberate tampering with the on-disk state
- * (assuming that the checksum was correct). The "error" parameter will be
- * nonzero in this case, even if there is no associated zio.
+ * (assuming that the checksum was correct). If any error occurs, the "buf"
+ * parameter will be NULL.
*/
-typedef void arc_read_done_func_t(zio_t *zio, int error, arc_buf_t *buf,
- void *private);
+typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb,
+ const blkptr_t *bp, arc_buf_t *buf, void *private);
typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
typedef void arc_prune_func_t(int64_t bytes, void *private);
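Callbacks written against the old signature keyed off the error
argument; under the new one they must treat a NULL buf as failure, and
can recover the bookmark and block pointer from the new parameters even
when no zio exists. A sketch of a conforming callback (the body and the
two helpers are hypothetical):

static void
example_read_done(zio_t *zio, const zbookmark_phys_t *zb,
    const blkptr_t *bp, arc_buf_t *buf, void *private)
{
	if (buf == NULL) {
		/* Read, checksum, or auth failure; zio may be NULL. */
		example_handle_error(zb, bp, private);	/* hypothetical */
		return;
	}
	example_consume(buf, private);			/* hypothetical */
}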
@@ -106,44 +106,45 @@ typedef enum arc_flags
ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */
ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */
ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */
+ ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */
/*
* Private ARC flags. These flags are private ARC only flags that
* will show up in b_flags in the arc_hdr_buf_t. These flags should
* only be set by ARC code.
*/
- ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */
- ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */
- ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */
- ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */
+ ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */
+ ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */
+ ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */
+ ARC_FLAG_INDIRECT = 1 << 10, /* indirect block */
/* Indicates that block was read with ASYNC priority. */
- ARC_FLAG_PRIO_ASYNC_READ = 1 << 10,
- ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */
- ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */
- ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */
+ ARC_FLAG_PRIO_ASYNC_READ = 1 << 11,
+ ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */
+ ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */
+ ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */
/*
* Encrypted or authenticated on disk (may be plaintext in memory).
* This header has b_crypt_hdr allocated. Does not include indirect
* blocks with checksums of MACs which will also have their X
* (encrypted) bit set in the bp.
*/
- ARC_FLAG_PROTECTED = 1 << 14,
+ ARC_FLAG_PROTECTED = 1 << 15,
/* data has not been authenticated yet */
- ARC_FLAG_NOAUTH = 1 << 15,
+ ARC_FLAG_NOAUTH = 1 << 16,
/* indicates that the buffer contains metadata (otherwise, data) */
- ARC_FLAG_BUFC_METADATA = 1 << 16,
+ ARC_FLAG_BUFC_METADATA = 1 << 17,
/* Flags specifying whether optional hdr struct fields are defined */
- ARC_FLAG_HAS_L1HDR = 1 << 17,
- ARC_FLAG_HAS_L2HDR = 1 << 18,
+ ARC_FLAG_HAS_L1HDR = 1 << 18,
+ ARC_FLAG_HAS_L2HDR = 1 << 19,
/*
* Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
* This allows the l2arc to use the blkptr's checksum to verify
* the data without having to store the checksum in the hdr.
*/
- ARC_FLAG_COMPRESSED_ARC = 1 << 19,
- ARC_FLAG_SHARED_DATA = 1 << 20,
+ ARC_FLAG_COMPRESSED_ARC = 1 << 20,
+ ARC_FLAG_SHARED_DATA = 1 << 21,
/*
* The arc buffer's compression mode is stored in the top 7 bits of the
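Because every flag is a distinct bit, shifting the private flags up by
one to make room for ARC_FLAG_PRESCIENT_PREFETCH is safe for any code
that uses the symbolic names rather than raw values. Typical usage, as
a sketch against the b_flags field named in the comment above:

/* Keep a prescient prefetch buffer for its full minimum lifespan. */
hdr->b_flags |= ARC_FLAG_PRESCIENT_PREFETCH;

/* Branch on buffer content type when charging the metadata limit. */
if (hdr->b_flags & ARC_FLAG_BUFC_METADATA)
	arc_space_consume(size, ARC_SPACE_META);	/* hypothetical call site */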
diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h
index 044ef9544..9ceb59d9b 100644
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -80,6 +80,7 @@ typedef struct zfs_blkstat {
typedef struct zfs_all_blkstats {
zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
+ kmutex_t zab_lock;
} zfs_all_blkstats_t;
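The new zab_lock presumably exists because the sorted scan updates
these per-type statistics from multiple contexts. A minimal sketch of
the pattern it enables (zb_count and zb_asize are zfs_blkstat_t fields;
the surrounding logic is illustrative):

zfs_all_blkstats_t *zab = dp->dp_blkstats;

mutex_enter(&zab->zab_lock);
zab->zab_type[level][type].zb_count++;
zab->zab_type[level][type].zb_asize += BP_GET_ASIZE(bp);
mutex_exit(&zab->zab_lock);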
diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h
index 5303d9a69..7a29d9788 100644
--- a/include/sys/dsl_scan.h
+++ b/include/sys/dsl_scan.h
@@ -108,22 +108,56 @@ typedef enum dsl_scan_flags {
*/
typedef struct dsl_scan {
struct dsl_pool *scn_dp;
-
- boolean_t scn_suspending;
uint64_t scn_restart_txg;
uint64_t scn_done_txg;
uint64_t scn_sync_start_time;
- zio_t *scn_zio_root;
+ uint64_t scn_issued_before_pass;
/* for freeing blocks */
boolean_t scn_is_bptree;
boolean_t scn_async_destroying;
boolean_t scn_async_stalled;
- uint64_t scn_visited_this_txg;
- dsl_scan_phys_t scn_phys;
+ /* flags and stats for controlling scan state */
+ boolean_t scn_is_sorted; /* doing sequential scan */
+ boolean_t scn_clearing; /* scan is issuing sequential extents */
+ boolean_t scn_checkpointing; /* scan is issuing all queued extents */
+ boolean_t scn_suspending; /* scan is suspending until next txg */
+ uint64_t scn_last_checkpoint; /* time of last checkpoint */
+
+ /* members for thread synchronization */
+ zio_t *scn_zio_root; /* root zio for waiting on IO */
+ taskq_t *scn_taskq; /* task queue for issuing extents */
+
+ /* for controlling scan prefetch, protected by spa_scrub_lock */
+ boolean_t scn_prefetch_stop; /* prefetch should stop */
+ zbookmark_phys_t scn_prefetch_bookmark; /* prefetch start bookmark */
+ avl_tree_t scn_prefetch_queue; /* priority queue of prefetch IOs */
+ uint64_t scn_maxinflight_bytes; /* max bytes in flight for pool */
+
+ /* per txg statistics */
+ uint64_t scn_visited_this_txg; /* total bps visited this txg */
+ uint64_t scn_holes_this_txg;
+ uint64_t scn_lt_min_this_txg;
+ uint64_t scn_gt_max_this_txg;
+ uint64_t scn_ddt_contained_this_txg;
+ uint64_t scn_objsets_visited_this_txg;
+ uint64_t scn_avg_seg_size_this_txg;
+ uint64_t scn_segs_this_txg;
+ uint64_t scn_avg_zio_size_this_txg;
+ uint64_t scn_zios_this_txg;
+
+ /* members needed for syncing scan status to disk */
+ dsl_scan_phys_t scn_phys; /* on disk representation of scan */
+ dsl_scan_phys_t scn_phys_cached;
+ avl_tree_t scn_queue; /* queue of datasets to scan */
+ uint64_t scn_bytes_pending; /* outstanding data to issue */
} dsl_scan_t;
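Roughly how the new control flags are expected to interact: the
metadata scan fills the sorted queues until either memory pressure
forces an early drain (scn_clearing) or the traversal completes and
everything left is flushed (scn_checkpointing). A rough sketch; the
helper and condition names here are hypothetical, not the patch's
exact logic:

if (scan_queues_over_mem_limit(scn))	/* hypothetical helper */
	scn->scn_clearing = B_TRUE;	/* drain queues, then resume scan */
if (metadata_traversal_complete(scn))	/* hypothetical helper */
	scn->scn_checkpointing = B_TRUE; /* issue all queued extents */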
+typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;
+
+void scan_init(void);
+void scan_fini(void);
int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
void dsl_scan_fini(struct dsl_pool *dp);
void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
@@ -142,6 +176,9 @@ void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
struct dmu_tx *tx);
boolean_t dsl_scan_active(dsl_scan_t *scn);
boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);
+void dsl_scan_freed(spa_t *spa, const blkptr_t *bp);
+void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue);
+void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd);
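Note that scan_init()/scan_fini() take no pool argument, unlike
dsl_scan_init()/dsl_scan_fini(); they evidently manage module-global
state. The expected call order would pair them around the per-pool
calls, as a sketch:

/* module load */
scan_init();

/* pool import */
VERIFY0(dsl_scan_init(dp, txg));

/* pool export */
dsl_scan_fini(dp);

/* module unload */
scan_fini();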
#ifdef __cplusplus
}
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 1aa3b21b5..88e8671db 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -859,17 +859,19 @@ typedef struct pool_scan_stat {
uint64_t pss_start_time; /* scan start time */
uint64_t pss_end_time; /* scan end time */
uint64_t pss_to_examine; /* total bytes to scan */
- uint64_t pss_examined; /* total examined bytes */
+ uint64_t pss_examined; /* total bytes located by scanner */
uint64_t pss_to_process; /* total bytes to process */
uint64_t pss_processed; /* total processed bytes */
uint64_t pss_errors; /* scan errors */
/* values not stored on disk */
- uint64_t pss_pass_exam; /* examined bytes per scan pass */
+ uint64_t pss_pass_exam; /* examined bytes per scan pass */
+ uint64_t pss_pass_issued; /* issued bytes per scan pass */
uint64_t pss_pass_start; /* start time of a scan pass */
uint64_t pss_pass_scrub_pause; /* pause time of a scrub pass */
/* cumulative time scrub spent paused, needed for rate calculation */
uint64_t pss_pass_scrub_spent_paused;
+ uint64_t pss_issued; /* total bytes checked by scanner */
} pool_scan_stat_t;
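Separating issued bytes from examined bytes lets a consumer report an
honest issue rate for the current pass, net of paused time. A sketch
(ps points at a pool_scan_stat_t and 'now' is in the same units as
pss_pass_start):

uint64_t elapsed = now - ps->pss_pass_start -
    ps->pss_pass_scrub_spent_paused;
uint64_t scan_rate = elapsed ? ps->pss_pass_exam / elapsed : 0;
uint64_t issue_rate = elapsed ? ps->pss_pass_issued / elapsed : 0;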
typedef enum dsl_scan_state {
diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h
index 9f3ead537..1d3bdf9e5 100644
--- a/include/sys/range_tree.h
+++ b/include/sys/range_tree.h
@@ -44,8 +44,13 @@ typedef struct range_tree_ops range_tree_ops_t;
typedef struct range_tree {
avl_tree_t rt_root; /* offset-ordered segment AVL tree */
uint64_t rt_space; /* sum of all segments in the map */
+ uint64_t rt_gap; /* allowable inter-segment gap */
range_tree_ops_t *rt_ops;
+
+ /* rt_avl_compare should only be set if rt_arg is an AVL tree */
void *rt_arg;
+ int (*rt_avl_compare)(const void *, const void *);
+
/*
* The rt_histogram maintains a histogram of ranges. Each bucket,
@@ -61,6 +66,7 @@ typedef struct range_seg {
avl_node_t rs_pp_node; /* AVL picker-private node */
uint64_t rs_start; /* starting offset of this segment */
uint64_t rs_end; /* ending offset (non-inclusive) */
+ uint64_t rs_fill; /* actual fill if gap mode is on */
} range_seg_t;
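With a nonzero rt_gap, a segment can cover more than it contains: two
extents closer together than the gap merge into one range_seg_t, and
rs_fill records how many covered bytes are real. A worked example with
illustrative numbers:

/*
 * rt_gap = 64K; add [0, 16K) and then [48K, 64K):
 *
 *   rs_start = 0, rs_end = 64K    covered span is 64K
 *   rs_fill  = 32K                only 32K of it is real data
 *
 * Issuing [0, 64K) as one read wastes 32K of bandwidth but costs one
 * seek instead of two, which is the sequential-scrub tradeoff.
 */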
struct range_tree_ops {
@@ -75,20 +81,37 @@ typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);
void range_tree_init(void);
void range_tree_fini(void);
+range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
+ int (*avl_compare) (const void *, const void *), kmutex_t *lp,
+ uint64_t gap);
range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
+range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
+ uint64_t newstart, uint64_t newsize);
uint64_t range_tree_space(range_tree_t *rt);
void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
void range_tree_stat_verify(range_tree_t *rt);
+void range_tree_set_lock(range_tree_t *rt, kmutex_t *lp);
void range_tree_add(void *arg, uint64_t start, uint64_t size);
void range_tree_remove(void *arg, uint64_t start, uint64_t size);
+void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta);
void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg);
+range_seg_t *range_tree_first(range_tree_t *rt);
+
+void rt_avl_create(range_tree_t *rt, void *arg);
+void rt_avl_destroy(range_tree_t *rt, void *arg);
+void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_avl_vacate(range_tree_t *rt, void *arg);
+extern struct range_tree_ops rt_avl_ops;
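Taken together, these additions let a range tree maintain a second,
differently ordered view of its segments: rt_arg points at an AVL tree
and rt_avl_ops keeps it in sync. A sketch of creating a gap-merging
tree ordered secondarily by segment size (the comparator and lock here
are assumptions, not taken from the patch):

/* Order segments by size, then start offset, for largest-first issue. */
static int
ext_size_compare(const void *x, const void *y)
{
	const range_seg_t *a = x, *b = y;
	uint64_t asz = a->rs_end - a->rs_start;
	uint64_t bsz = b->rs_end - b->rs_start;

	if (asz != bsz)
		return (asz < bsz ? -1 : 1);
	return (a->rs_start < b->rs_start ? -1 : a->rs_start > b->rs_start);
}

avl_tree_t size_tree;
kmutex_t queue_lock;		/* assumed initialized elsewhere */
range_tree_t *rt = range_tree_create_impl(&rt_avl_ops, &size_tree,
    ext_size_compare, &queue_lock, 1ULL << 20 /* 1M gap */);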
#ifdef __cplusplus
}
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 926a0bc24..2fc598016 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -185,9 +185,9 @@ struct spa {
uberblock_t spa_ubsync; /* last synced uberblock */
uberblock_t spa_uberblock; /* current uberblock */
boolean_t spa_extreme_rewind; /* rewind past deferred frees */
- uint64_t spa_last_io; /* lbolt of last non-scan I/O */
kmutex_t spa_scrub_lock; /* resilver/scrub lock */
- uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
+ uint64_t spa_scrub_inflight; /* in-flight scrub bytes */
+ uint64_t spa_load_verify_ios; /* in-flight verification IOs */
kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
uint8_t spa_scrub_active; /* active or suspended? */
uint8_t spa_scrub_type; /* type of scrub we're doing */
@@ -198,6 +198,7 @@ struct spa {
uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */
uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */
uint64_t spa_scan_pass_exam; /* examined bytes per pass */
+ uint64_t spa_scan_pass_issued; /* issued bytes per pass */
kmutex_t spa_async_lock; /* protect async state */
kthread_t *spa_async_thread; /* thread doing async task */
int spa_async_suspended; /* async tasks suspended */
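Tracking bytes rather than I/O counts means the scrub can throttle on
data volume against scn_maxinflight_bytes from dsl_scan_t. The expected
pattern, as a sketch:

mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
	cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
spa->spa_scrub_inflight += size;
mutex_exit(&spa->spa_scrub_lock);

/* In the zio done callback: subtract size and cv_broadcast() waiters. */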
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 4c2e3cd2e..5f953a8db 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -198,6 +198,13 @@ struct vdev {
uint64_t vdev_max_async_write_queue_depth;
/*
+ * Protects the vdev_scan_io_queue field itself as well as the
+ * structure's contents (when present).
+ */
+ kmutex_t vdev_scan_io_queue_lock;
+ struct dsl_scan_io_queue *vdev_scan_io_queue;
+
+ /*
* Leaf vdev state.
*/
range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */
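Access to a vdev's scan queue then follows the pointer-under-lock
pattern the new comment describes; a sketch (the insert helper is
hypothetical):

mutex_enter(&vd->vdev_scan_io_queue_lock);
if (vd->vdev_scan_io_queue != NULL)
	scan_io_queue_insert(vd->vdev_scan_io_queue, sio);	/* hypothetical */
mutex_exit(&vd->vdev_scan_io_queue_lock);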