author	Matthew Macy <[email protected]>	2020-09-27 17:08:38 -0700
committer	GitHub <[email protected]>	2020-09-27 17:08:38 -0700
commit	af20b97078af7bf4ba7552dff04cc40b643ab72c (patch)
tree	f2588e170caba562ff55808458276ca5796b2211
parent	cf2667759f4583bddd4b6ce8167e48b041ae1ea3 (diff)
zfetch: Don't issue new streams when old have not completed
The current dmu_zfetch code implicitly assumes that I/Os complete
within min_sec_reap seconds. With the async DMU and a read-only
workload such as L2ARC rebuild (and thus no exponential backoff in
operations from the "write throttle"), it is possible to saturate the
drives with I/O requests. These are then effectively compounded with
prefetch requests.
This change reference-counts streams and prevents them from being
recycled after their min_sec_reap timeout while they still have
outstanding I/Os.
Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Matt Macy <[email protected]>
Closes #10900
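To make the new reaping rule concrete, here is a minimal stand-alone C
sketch of the gate this change adds in dmu_zfetch_stream_create(): a
stream that has aged past min_sec_reap is recycled only once its
pending-block reference count has drained to zero. The stream_t type,
stream_reapable() helper, and constants below are simplified stand-ins
for illustration, not the kernel's zstream_t/zfs_refcount_t machinery.

/*
 * Minimal sketch (not the kernel code): reap a prefetch stream only
 * when it is old enough AND has no outstanding prefetch I/Os.
 */
#include <stdio.h>
#include <stdint.h>

#define	NANOSEC		1000000000LL
#define	MIN_SEC_REAP	2	/* stands in for zfetch_min_sec_reap */

typedef struct stream {
	int64_t	atime;	/* last prefetch issue time, in ns */
	int64_t	blocks;	/* outstanding prefetch I/Os (refcount) */
} stream_t;

/* Return 1 if the stream may be recycled at time 'now' (ns). */
static int
stream_reapable(const stream_t *zs, int64_t now)
{
	/* Outstanding I/Os pin the stream, regardless of its age. */
	if (zs->blocks != 0)
		return (0);
	return ((now - zs->atime) / NANOSEC > MIN_SEC_REAP);
}

int
main(void)
{
	int64_t now = 10 * NANOSEC;
	stream_t idle = { .atime = 1 * NANOSEC, .blocks = 0 };
	stream_t busy = { .atime = 1 * NANOSEC, .blocks = 3 };

	printf("idle: %d\n", stream_reapable(&idle, now));	/* 1 */
	printf("busy: %d\n", stream_reapable(&busy, now));	/* 0 */
	return (0);
}

Under the old logic, a saturated drive could leave a stream's I/Os
pending past min_sec_reap and the stream would be torn down and
reissued anyway; the refcount check above is what prevents that.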
-rw-r--r--	include/sys/dbuf.h	7
-rw-r--r--	include/sys/dmu_zfetch.h	16
-rw-r--r--	module/zfs/dbuf.c	71
-rw-r--r--	module/zfs/dmu_zfetch.c	116
4 files changed, 166 insertions, 44 deletions
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 04338b2c4..ca2154e12 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -309,6 +309,8 @@ typedef struct dbuf_hash_table {
 	kmutex_t hash_mutexes[DBUF_MUTEXES];
 } dbuf_hash_table_t;
 
+typedef void (*dbuf_prefetch_fn)(void *, boolean_t);
+
 uint64_t dbuf_whichblock(const struct dnode *di, const int64_t level,
     const uint64_t offset);
 
@@ -324,7 +326,10 @@ int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
     boolean_t fail_sparse, boolean_t fail_uncached,
     void *tag, dmu_buf_impl_t **dbp);
 
-void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
+int dbuf_prefetch_impl(struct dnode *dn, int64_t level, uint64_t blkid,
+    zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
+    void *arg);
+int dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
     zio_priority_t prio, arc_flags_t aflags);
 
 void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h
index 4303ab314..34b711fc0 100644
--- a/include/sys/dmu_zfetch.h
+++ b/include/sys/dmu_zfetch.h
@@ -40,6 +40,13 @@ extern unsigned long	zfetch_array_rd_sz;
 
 struct dnode;				/* so we can reference dnode */
 
+typedef struct zfetch {
+	kmutex_t	zf_lock;	/* protects zfetch structure */
+	list_t		zf_stream;	/* list of zstream_t's */
+	struct dnode	*zf_dnode;	/* dnode that owns this zfetch */
+	int		zf_numstreams;	/* number of zstream_t's */
+} zfetch_t;
+
 typedef struct zstream {
 	uint64_t	zs_blkid;	/* expect next access at this blkid */
 	uint64_t	zs_pf_blkid;	/* next block to prefetch */
@@ -52,15 +59,12 @@ typedef struct zstream {
 	kmutex_t	zs_lock;	/* protects stream */
 
 	hrtime_t	zs_atime;	/* time last prefetch issued */
+	hrtime_t	zs_start_time;	/* start of last prefetch */
 	list_node_t	zs_node;	/* link for zf_stream */
+	zfetch_t	*zs_fetch;	/* parent fetch */
+	zfs_refcount_t	zs_blocks;	/* number of pending blocks in the stream */
 } zstream_t;
 
-typedef struct zfetch {
-	kmutex_t	zf_lock;	/* protects zfetch structure */
-	list_t		zf_stream;	/* list of zstream_t's */
-	struct dnode	*zf_dnode;	/* dnode that owns this zfetch */
-} zfetch_t;
-
 void		zfetch_init(void);
 void		zfetch_fini(void);
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 7d817320a..9b6481d22 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -3003,8 +3003,29 @@ typedef struct dbuf_prefetch_arg {
 	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
 	zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
 	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+	dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
+	void *dpa_arg; /* prefetch completion arg */
 } dbuf_prefetch_arg_t;
 
+static void
+dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
+{
+	if (dpa->dpa_cb != NULL)
+		dpa->dpa_cb(dpa->dpa_arg, io_done);
+	kmem_free(dpa, sizeof (*dpa));
+}
+
+static void
+dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
+    const blkptr_t *iobp, arc_buf_t *abuf, void *private)
+{
+	dbuf_prefetch_arg_t *dpa = private;
+
+	dbuf_prefetch_fini(dpa, B_TRUE);
+	if (abuf != NULL)
+		arc_buf_destroy(abuf, private);
+}
+
 /*
  * Actually issue the prefetch read for the block given.
  */
@@ -3017,7 +3038,7 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
 	    SPA_FEATURE_REDACTED_DATASETS));
 
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
-		return;
+		return (dbuf_prefetch_fini(dpa, B_FALSE));
 
 	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
 	arc_flags_t aflags =
@@ -3031,7 +3052,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
 	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
 	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
 	ASSERT(dpa->dpa_zio != NULL);
-	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
+	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
+	    dbuf_issue_final_prefetch_done, dpa,
 	    dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
 }
 
@@ -3051,8 +3073,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
 
 	if (abuf == NULL) {
 		ASSERT(zio == NULL || zio->io_error != 0);
-		kmem_free(dpa, sizeof (*dpa));
-		return;
+		return (dbuf_prefetch_fini(dpa, B_TRUE));
 	}
 	ASSERT(zio == NULL || zio->io_error == 0);
 
@@ -3084,11 +3105,9 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
 		dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
 		    dpa->dpa_curlevel, curblkid, FTAG);
 		if (db == NULL) {
-			kmem_free(dpa, sizeof (*dpa));
 			arc_buf_destroy(abuf, private);
-			return;
+			return (dbuf_prefetch_fini(dpa, B_TRUE));
 		}
-
 		(void) dbuf_read(db, NULL,
 		    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
 		dbuf_rele(db, FTAG);
@@ -3105,11 +3124,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
 	    dpa->dpa_dnode->dn_objset->os_dsl_dataset,
 	    SPA_FEATURE_REDACTED_DATASETS));
 	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
-		kmem_free(dpa, sizeof (*dpa));
+		dbuf_prefetch_fini(dpa, B_TRUE);
 	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
 		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
 		dbuf_issue_final_prefetch(dpa, bp);
-		kmem_free(dpa, sizeof (*dpa));
 	} else {
 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
 		zbookmark_phys_t zb;
@@ -3139,9 +3157,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
 * complete. Note that the prefetch might fail if the dataset is encrypted and
 * the encryption key is unmapped before the IO completes.
 */
-void
-dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
-    arc_flags_t aflags)
+int
+dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
+    zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
+    void *arg)
 {
 	blkptr_t bp;
 	int epbs, nlevels, curlevel;
@@ -3151,10 +3170,10 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
 	if (blkid > dn->dn_maxblkid)
-		return;
+		goto no_issue;
 
 	if (level == 0 && dnode_block_freed(dn, blkid))
-		return;
+		goto no_issue;
 
 	/*
 	 * This dnode hasn't been written to disk yet, so there's nothing to
@@ -3162,11 +3181,11 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
 	 */
 	nlevels = dn->dn_phys->dn_nlevels;
 	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
-		return;
+		goto no_issue;
 
 	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
-		return;
+		goto no_issue;
 
 	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
 	    level, blkid);
@@ -3176,7 +3195,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
 		/*
 		 * This dbuf already exists.  It is either CACHED, or
 		 * (we assume) about to be read or filled.
 		 */
-		return;
+		goto no_issue;
 	}
 
 	/*
@@ -3212,7 +3231,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
 	    dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
 	    SPA_FEATURE_REDACTED_DATASETS));
 	if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
-		return;
+		goto no_issue;
 
 	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
 
@@ -3230,6 +3249,8 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
 	dpa->dpa_dnode = dn;
 	dpa->dpa_epbs = epbs;
 	dpa->dpa_zio = pio;
+	dpa->dpa_cb = cb;
+	dpa->dpa_arg = arg;
 
 	/* flag if L2ARC eligible, l2arc_noprefetch then decides */
 	if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
@@ -3245,7 +3266,6 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
 	if (curlevel == level) {
 		ASSERT3U(curblkid, ==, blkid);
 		dbuf_issue_final_prefetch(dpa, &bp);
-		kmem_free(dpa, sizeof (*dpa));
 	} else {
 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
 		zbookmark_phys_t zb;
@@ -3266,6 +3286,19 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
 	 * dpa may have already been freed.
 	 */
 	zio_nowait(pio);
+	return (1);
+no_issue:
+	if (cb != NULL)
+		cb(arg, B_FALSE);
+	return (0);
+}
+
+int
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+    arc_flags_t aflags)
+{
+
+	return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));
 }
 
 /*
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
index 5935b5f99..39113cc48 100644
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -59,16 +59,29 @@ typedef struct zfetch_stats {
 	kstat_named_t zfetchstat_hits;
 	kstat_named_t zfetchstat_misses;
 	kstat_named_t zfetchstat_max_streams;
+	kstat_named_t zfetchstat_max_completion_us;
+	kstat_named_t zfetchstat_last_completion_us;
+	kstat_named_t zfetchstat_io_issued;
 } zfetch_stats_t;
 
 static zfetch_stats_t zfetch_stats = {
 	{ "hits",			KSTAT_DATA_UINT64 },
 	{ "misses",			KSTAT_DATA_UINT64 },
 	{ "max_streams",		KSTAT_DATA_UINT64 },
+	{ "max_completion_us",		KSTAT_DATA_UINT64 },
+	{ "last_completion_us",		KSTAT_DATA_UINT64 },
+	{ "io_issued",			KSTAT_DATA_UINT64 },
 };
 
 #define	ZFETCHSTAT_BUMP(stat) \
-	atomic_inc_64(&zfetch_stats.stat.value.ui64);
+	atomic_inc_64(&zfetch_stats.stat.value.ui64)
+#define	ZFETCHSTAT_ADD(stat, val) \
+	atomic_add_64(&zfetch_stats.stat.value.ui64, val)
+#define	ZFETCHSTAT_SET(stat, val) \
+	zfetch_stats.stat.value.ui64 = val
+#define	ZFETCHSTAT_GET(stat) \
+	zfetch_stats.stat.value.ui64
+
 
 kstat_t *zfetch_ksp;
 
@@ -104,8 +117,8 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
 {
 	if (zf == NULL)
 		return;
-
 	zf->zf_dnode = dno;
+	zf->zf_numstreams = 0;
 
 	list_create(&zf->zf_stream, sizeof (zstream_t),
 	    offsetof(zstream_t, zs_node));
@@ -114,12 +127,28 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
 }
 
 static void
+dmu_zfetch_stream_fini(zstream_t *zs)
+{
+	mutex_destroy(&zs->zs_lock);
+	kmem_free(zs, sizeof (*zs));
+}
+
+static void
 dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
 {
 	ASSERT(MUTEX_HELD(&zf->zf_lock));
 	list_remove(&zf->zf_stream, zs);
-	mutex_destroy(&zs->zs_lock);
-	kmem_free(zs, sizeof (*zs));
+	dmu_zfetch_stream_fini(zs);
+	zf->zf_numstreams--;
+}
+
+static void
+dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs)
+{
+	ASSERT(MUTEX_HELD(&zf->zf_lock));
+	list_remove(&zf->zf_stream, zs);
+	zs->zs_fetch = NULL;
+	zf->zf_numstreams--;
 }
 
 /*
@@ -133,7 +162,7 @@ dmu_zfetch_fini(zfetch_t *zf)
 
 	mutex_enter(&zf->zf_lock);
 	while ((zs = list_head(&zf->zf_stream)) != NULL)
-		dmu_zfetch_stream_remove(zf, zs);
+		dmu_zfetch_stream_orphan(zf, zs);
 	mutex_exit(&zf->zf_lock);
 	list_destroy(&zf->zf_stream);
 	mutex_destroy(&zf->zf_lock);
@@ -151,7 +180,7 @@ static void
 dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 {
 	zstream_t *zs_next;
-	int numstreams = 0;
+	hrtime_t now = gethrtime();
 
 	ASSERT(MUTEX_HELD(&zf->zf_lock));
 
@@ -161,11 +190,14 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	for (zstream_t *zs = list_head(&zf->zf_stream); zs != NULL;
 	    zs = zs_next) {
 		zs_next = list_next(&zf->zf_stream, zs);
-		if (((gethrtime() - zs->zs_atime) / NANOSEC) >
+		/*
+		 * Skip gethrtime() call if there are still references
+		 */
+		if (zfs_refcount_count(&zs->zs_blocks) != 0)
+			continue;
+		if (((now - zs->zs_atime) / NANOSEC) >
 		    zfetch_min_sec_reap)
 			dmu_zfetch_stream_remove(zf, zs);
-		else
-			numstreams++;
 	}
 
 	/*
@@ -179,7 +211,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
 	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
 	    zfetch_max_distance));
-	if (numstreams >= max_streams) {
+	if (zf->zf_numstreams >= max_streams) {
 		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
 		return;
 	}
@@ -188,12 +220,39 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	zs->zs_blkid = blkid;
 	zs->zs_pf_blkid = blkid;
 	zs->zs_ipf_blkid = blkid;
-	zs->zs_atime = gethrtime();
+	zs->zs_atime = now;
+	zs->zs_fetch = zf;
+	zfs_refcount_create(&zs->zs_blocks);
 	mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
-
+	zf->zf_numstreams++;
 	list_insert_head(&zf->zf_stream, zs);
 }
 
+static void
+dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
+{
+	zstream_t *zs = arg;
+
+	if (zs->zs_start_time && io_issued) {
+		hrtime_t now = gethrtime();
+		hrtime_t delta = NSEC2USEC(now - zs->zs_start_time);
+
+		zs->zs_start_time = 0;
+		ZFETCHSTAT_SET(zfetchstat_last_completion_us, delta);
+		if (delta > ZFETCHSTAT_GET(zfetchstat_max_completion_us))
+			ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta);
+	}
+
+	if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0)
+		return;
+
+	/*
+	 * The parent fetch structure has gone away
+	 */
+	if (zs->zs_fetch == NULL)
+		dmu_zfetch_stream_fini(zs);
+}
+
 /*
  * This is the predictive prefetch entry point.  It associates dnode access
  * specified with blkid and nblks arguments with prefetch stream, predicts
@@ -209,7 +268,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	zstream_t *zs;
 	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
 	int64_t pf_ahead_blks, max_blks;
-	int epbs, max_dist_blks, pf_nblks, ipf_nblks;
+	int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued;
 	uint64_t end_of_access_blkid;
 	end_of_access_blkid = blkid + nblks;
 	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
@@ -230,11 +289,21 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	 * As a fast path for small (single-block) files, ignore access
 	 * to the first block.
 	 */
-	if (blkid == 0)
+	if (!have_lock && blkid == 0)
 		return;
 
 	if (!have_lock)
 		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
+
+	/*
+	 * A fast path for small files for which no prefetch will
+	 * happen.
+	 */
+	if (zf->zf_dnode->dn_maxblkid < 2) {
+		if (!have_lock)
+			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+		return;
+	}
 	mutex_enter(&zf->zf_lock);
 
 	/*
@@ -343,9 +412,15 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	    ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
 
 	zs->zs_atime = gethrtime();
+	/* no prior reads in progress */
+	if (zfs_refcount_count(&zs->zs_blocks) == 0)
+		zs->zs_start_time = zs->zs_atime;
 	zs->zs_blkid = end_of_access_blkid;
+	zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart,
+	    NULL);
 	mutex_exit(&zs->zs_lock);
 	mutex_exit(&zf->zf_lock);
+	issued = 0;
 
 	/*
 	 * dbuf_prefetch() is asynchronous (even when it needs to read
@@ -354,16 +429,21 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	 */
 	for (int i = 0; i < pf_nblks; i++) {
-		dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
-		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i,
+		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
+		    dmu_zfetch_stream_done, zs);
 	}
 
 	for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
-		dbuf_prefetch(zf->zf_dnode, 1, iblk,
-		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+		issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
+		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
+		    dmu_zfetch_stream_done, zs);
 	}
 	if (!have_lock)
 		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
 	ZFETCHSTAT_BUMP(zfetchstat_hits);
+
+	if (issued)
+		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
 }
 
 /* BEGIN CSTYLED */
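The dmu_zfetch.c hunks also add completion-latency kstats
(zfetchstat_last_completion_us, zfetchstat_max_completion_us,
zfetchstat_io_issued), which on Linux should appear alongside the
existing counters in /proc/spl/kstat/zfs/zfetchstats. The user-space C
sketch below mirrors the bookkeeping dmu_zfetch_stream_done() performs
when the first I/O of a prefetch batch completes; the globals and the
record_completion() helper are illustrative stand-ins for the
ZFETCHSTAT_SET/ZFETCHSTAT_GET macros in the diff.

/*
 * Illustrative user-space model of the completion-time kstats; the
 * real code updates kstat counters under the names in the comments.
 */
#include <stdio.h>
#include <stdint.h>

#define	NSEC2USEC(ns)	((ns) / 1000LL)

static uint64_t last_completion_us;	/* zfetchstat_last_completion_us */
static uint64_t max_completion_us;	/* zfetchstat_max_completion_us */

/* Called when the first I/O of a batch issued at start_ns completes. */
static void
record_completion(int64_t start_ns, int64_t now_ns)
{
	uint64_t delta = (uint64_t)NSEC2USEC(now_ns - start_ns);

	last_completion_us = delta;
	if (delta > max_completion_us)
		max_completion_us = delta;
}

int
main(void)
{
	record_completion(0, 1500000);	/* batch took 1500 us */
	record_completion(0, 700000);	/* 700 us; max stays 1500 */
	printf("last=%llu us, max=%llu us\n",
	    (unsigned long long)last_completion_us,
	    (unsigned long long)max_completion_us);
	return (0);
}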