summaryrefslogtreecommitdiffstats
path: root/module/zfs/arc.c
diff options
context:
space:
mode:
authorTom Caputi <[email protected]>2017-11-15 20:27:01 -0500
committerBrian Behlendorf <[email protected]>2017-11-15 17:27:01 -0800
commitd4a72f23863382bdf6d0ae33196f5b5decbc48fd (patch)
tree1084ea930b9a1ef46e58d1757943ab3ad66c22c4 /module/zfs/arc.c
parente301113c17673a290098850830cf2e6d1a1fcbe3 (diff)
Sequential scrub and resilvers
Currently, scrubs and resilvers can take an extremely long time to complete. This is largely due to the fact that zfs scans process pools in logical order, as determined by each block's bookmark. This makes sense from a simplicity perspective, but blocks in zfs are often scattered randomly across disks, particularly due to zfs's copy-on-write mechanisms. This patch improves performance by splitting scrubs and resilvers into a metadata scanning phase and an IO issuing phase. The metadata scan reads through the structure of the pool and gathers an in-memory queue of I/Os, sorted by size and offset on disk. The issuing phase will then issue the scrub I/Os as sequentially as possible, greatly improving performance. This patch also updates and cleans up some of the scan code which has not been updated in several years. Reviewed-by: Brian Behlendorf <[email protected]> Authored-by: Saso Kiselkov <[email protected]> Authored-by: Alek Pinchuk <[email protected]> Authored-by: Tom Caputi <[email protected]> Signed-off-by: Tom Caputi <[email protected]> Closes #3625 Closes #6256
Diffstat (limited to 'module/zfs/arc.c')
-rw-r--r--module/zfs/arc.c123
1 files changed, 84 insertions, 39 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index cd343b04e..698357632 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -357,7 +357,8 @@ int arc_no_grow_shift = 5;
* minimum lifespan of a prefetch block in clock ticks
* (initialized in arc_init())
*/
-static int arc_min_prefetch_lifespan;
+static int arc_min_prefetch_ms;
+static int arc_min_prescient_prefetch_ms;
/*
* If this percent of memory is free, don't throttle.
@@ -407,7 +408,8 @@ unsigned long zfs_arc_dnode_limit_percent = 10;
* These tunables are Linux specific
*/
unsigned long zfs_arc_sys_free = 0;
-int zfs_arc_min_prefetch_lifespan = 0;
+int zfs_arc_min_prefetch_ms = 0;
+int zfs_arc_min_prescient_prefetch_ms = 0;
int zfs_arc_p_aggressive_disable = 1;
int zfs_arc_p_dampener_disable = 1;
int zfs_arc_meta_prune = 10000;
@@ -663,6 +665,7 @@ typedef struct arc_stats {
kstat_named_t arcstat_meta_min;
kstat_named_t arcstat_sync_wait_for_async;
kstat_named_t arcstat_demand_hit_predictive_prefetch;
+ kstat_named_t arcstat_demand_hit_prescient_prefetch;
kstat_named_t arcstat_need_free;
kstat_named_t arcstat_sys_free;
kstat_named_t arcstat_raw_size;
@@ -762,6 +765,7 @@ static arc_stats_t arc_stats = {
{ "arc_meta_min", KSTAT_DATA_UINT64 },
{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
+ { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
{ "arc_need_free", KSTAT_DATA_UINT64 },
{ "arc_sys_free", KSTAT_DATA_UINT64 },
{ "arc_raw_size", KSTAT_DATA_UINT64 }
@@ -861,6 +865,8 @@ static taskq_t *arc_prune_taskq;
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define HDR_PRESCIENT_PREFETCH(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
#define HDR_COMPRESSION_ENABLED(hdr) \
((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
@@ -3778,6 +3784,8 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
arc_state_t *evicted_state, *state;
int64_t bytes_evicted = 0;
+ int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
+ arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
ASSERT(MUTEX_HELD(hash_lock));
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -3831,8 +3839,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
/* prefetch buffers have a minimum lifespan */
if (HDR_IO_IN_PROGRESS(hdr) ||
((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
- ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
- arc_min_prefetch_lifespan)) {
+ ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
ARCSTAT_BUMP(arcstat_evict_skip);
return (bytes_evicted);
}
@@ -5492,13 +5499,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* - move the buffer to the head of the list if this is
* another prefetch (to make it less likely to be evicted).
*/
- if (HDR_PREFETCH(hdr)) {
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
/* link protected by hash lock */
ASSERT(multilist_link_active(
&hdr->b_l1hdr.b_arc_node));
} else {
- arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH);
atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
ARCSTAT_BUMP(arcstat_mru_hits);
}
@@ -5532,10 +5541,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* MFU state.
*/
- if (HDR_PREFETCH(hdr)) {
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
new_state = arc_mru;
- if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
- arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+ if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ }
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
} else {
new_state = arc_mfu;
@@ -5557,11 +5569,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* If it was a prefetch, we will explicitly move it to
* the head of the list now.
*/
- if ((HDR_PREFETCH(hdr)) != 0) {
- ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- /* link protected by hash_lock */
- ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
- }
+
atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
ARCSTAT_BUMP(arcstat_mfu_hits);
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
@@ -5573,12 +5581,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* MFU state.
*/
- if (HDR_PREFETCH(hdr)) {
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
/*
* This is a prefetch access...
* move this block back to the MRU state.
*/
- ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
new_state = arc_mru;
}
@@ -5605,20 +5612,25 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
/* a generic arc_read_done_func_t which you can use */
/* ARGSUSED */
void
-arc_bcopy_func(zio_t *zio, int error, arc_buf_t *buf, void *arg)
+arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *arg)
{
- if (error == 0)
- bcopy(buf->b_data, arg, arc_buf_size(buf));
+ if (buf == NULL)
+ return;
+
+ bcopy(buf->b_data, arg, arc_buf_size(buf));
arc_buf_destroy(buf, arg);
}
/* a generic arc_read_done_func_t */
+/* ARGSUSED */
void
-arc_getbuf_func(zio_t *zio, int error, arc_buf_t *buf, void *arg)
+arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *arg)
{
arc_buf_t **bufp = arg;
- if (error != 0) {
- arc_buf_destroy(buf, arg);
+
+ if (buf == NULL) {
*bufp = NULL;
} else {
*bufp = buf;
@@ -5652,7 +5664,6 @@ arc_read_done(zio_t *zio)
arc_callback_t *callback_list;
arc_callback_t *acb;
boolean_t freeable = B_FALSE;
- boolean_t no_zio_error = (zio->io_error == 0);
/*
* The hdr was inserted into hash-table and removed from lists
@@ -5699,7 +5710,7 @@ arc_read_done(zio_t *zio)
}
}
- if (no_zio_error) {
+ if (zio->io_error == 0) {
/* byteswap if necessary */
if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
if (BP_GET_LEVEL(zio->io_bp) > 0) {
@@ -5720,7 +5731,8 @@ arc_read_done(zio_t *zio)
callback_list = hdr->b_l1hdr.b_acb;
ASSERT3P(callback_list, !=, NULL);
- if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
+ if (hash_lock && zio->io_error == 0 &&
+ hdr->b_l1hdr.b_state == arc_anon) {
/*
* Only call arc_access on anonymous buffers. This is because
* if we've issued an I/O for an evicted buffer, we've already
@@ -5741,13 +5753,19 @@ arc_read_done(zio_t *zio)
if (!acb->acb_done)
continue;
- /* This is a demand read since prefetches don't use callbacks */
callback_cnt++;
+ if (zio->io_error != 0)
+ continue;
+
int error = arc_buf_alloc_impl(hdr, zio->io_spa,
acb->acb_dsobj, acb->acb_private, acb->acb_encrypted,
- acb->acb_compressed, acb->acb_noauth, no_zio_error,
+ acb->acb_compressed, acb->acb_noauth, B_TRUE,
&acb->acb_buf);
+ if (error != 0) {
+ arc_buf_destroy(acb->acb_buf, acb->acb_private);
+ acb->acb_buf = NULL;
+ }
/*
* Assert non-speculative zios didn't fail because an
@@ -5770,9 +5788,8 @@ arc_read_done(zio_t *zio)
}
}
- if (no_zio_error) {
+ if (zio->io_error == 0)
zio->io_error = error;
- }
}
hdr->b_l1hdr.b_acb = NULL;
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
@@ -5782,7 +5799,7 @@ arc_read_done(zio_t *zio)
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
callback_list != NULL);
- if (no_zio_error) {
+ if (zio->io_error == 0) {
arc_hdr_verify(hdr, zio->io_bp);
} else {
arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
@@ -5816,8 +5833,8 @@ arc_read_done(zio_t *zio)
/* execute each callback and free its structure */
while ((acb = callback_list) != NULL) {
if (acb->acb_done) {
- acb->acb_done(zio, zio->io_error, acb->acb_buf,
- acb->acb_private);
+ acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
+ acb->acb_buf, acb->acb_private);
}
if (acb->acb_zio_dummy != NULL) {
@@ -5974,12 +5991,25 @@ top:
arc_hdr_clear_flags(hdr,
ARC_FLAG_PREDICTIVE_PREFETCH);
}
+
+ if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
+ ARCSTAT_BUMP(
+ arcstat_demand_hit_prescient_prefetch);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ }
+
ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
/* Get a buf with the desired data in it. */
rc = arc_buf_alloc_impl(hdr, spa, zb->zb_objset,
private, encrypted_read, compressed_read,
noauth_read, B_TRUE, &buf);
+ if (rc != 0) {
+ arc_buf_destroy(buf, private);
+ buf = NULL;
+ }
+
ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc == 0);
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
@@ -5987,6 +6017,8 @@ top:
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
+ if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
if (*arc_flags & ARC_FLAG_L2CACHE)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
mutex_exit(hash_lock);
@@ -5996,7 +6028,7 @@ top:
data, metadata, hits);
if (done)
- done(NULL, rc, buf, private);
+ done(NULL, zb, bp, buf, private);
} else {
uint64_t lsize = BP_GET_LSIZE(bp);
uint64_t psize = BP_GET_PSIZE(bp);
@@ -6112,6 +6144,8 @@ top:
if (*arc_flags & ARC_FLAG_PREFETCH &&
refcount_is_zero(&hdr->b_l1hdr.b_refcnt))
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
if (*arc_flags & ARC_FLAG_L2CACHE)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
if (BP_IS_AUTHENTICATED(bp))
@@ -7223,9 +7257,15 @@ arc_tuning_update(void)
if (zfs_arc_p_min_shift)
arc_p_min_shift = zfs_arc_p_min_shift;
- /* Valid range: 1 - N ticks */
- if (zfs_arc_min_prefetch_lifespan)
- arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan;
+ /* Valid range: 1 - N ms */
+ if (zfs_arc_min_prefetch_ms)
+ arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
+
+ /* Valid range: 1 - N ms */
+ if (zfs_arc_min_prescient_prefetch_ms) {
+ arc_min_prescient_prefetch_ms =
+ zfs_arc_min_prescient_prefetch_ms;
+ }
/* Valid range: 0 - 100 */
if ((zfs_arc_lotsfree_percent >= 0) &&
@@ -7368,7 +7408,8 @@ arc_init(void)
cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
/* Convert seconds to clock ticks */
- arc_min_prefetch_lifespan = 1 * hz;
+ arc_min_prefetch_ms = 1;
+ arc_min_prescient_prefetch_ms = 6;
#ifdef _KERNEL
/*
@@ -9006,8 +9047,12 @@ MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");
module_param(zfs_compressed_arc_enabled, int, 0644);
MODULE_PARM_DESC(zfs_compressed_arc_enabled, "Disable compressed arc buffers");
-module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
-MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
+module_param(zfs_arc_min_prefetch_ms, int, 0644);
+MODULE_PARM_DESC(zfs_arc_min_prefetch_ms, "Min life of prefetch block in ms");
+
+module_param(zfs_arc_min_prescient_prefetch_ms, int, 0644);
+MODULE_PARM_DESC(zfs_arc_min_prescient_prefetch_ms,
+ "Min life of prescient prefetched block in ms");
module_param(l2arc_write_max, ulong, 0644);
MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");