From 5f42d1dbf2854d0224f2f5853aab8153f78bdcc3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 1 Mar 2023 18:27:40 -0500 Subject: System-wide speculative prefetch limit. With some pathological access patterns it is possible to make ZFS accumulate almost unlimited amount of speculative prefetch ZIOs. Combined with linear ABD allocations in RAIDZ code, it appears to be possible to exhaust system KVA, triggering kernel panic. Address this by introducing a system-wide counter of active prefetch requests and blocking prefetch distance doubling per stream hits if the number of active requests is higher that ~6% of ARC size. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14516 --- module/zfs/dmu_zfetch.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'module') diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index 76b8b5608..ffc012e6c 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -28,6 +28,7 @@ */ #include +#include #include #include #include @@ -65,13 +66,15 @@ typedef struct zfetch_stats { kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_max_streams; kstat_named_t zfetchstat_io_issued; + kstat_named_t zfetchstat_io_active; } zfetch_stats_t; static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 }, - { "io_issued", KSTAT_DATA_UINT64 }, + { "io_issued", KSTAT_DATA_UINT64 }, + { "io_active", KSTAT_DATA_UINT64 }, }; struct { @@ -79,6 +82,7 @@ struct { wmsum_t zfetchstat_misses; wmsum_t zfetchstat_max_streams; wmsum_t zfetchstat_io_issued; + aggsum_t zfetchstat_io_active; } zfetch_sums; #define ZFETCHSTAT_BUMP(stat) \ @@ -104,6 +108,8 @@ zfetch_kstats_update(kstat_t *ksp, int rw) wmsum_value(&zfetch_sums.zfetchstat_max_streams); zs->zfetchstat_io_issued.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_io_issued); + zs->zfetchstat_io_active.value.ui64 = + aggsum_value(&zfetch_sums.zfetchstat_io_active); return (0); } @@ -114,6 +120,7 @@ zfetch_init(void) wmsum_init(&zfetch_sums.zfetchstat_misses, 0); wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); + aggsum_init(&zfetch_sums.zfetchstat_io_active, 0); zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), @@ -138,6 +145,8 @@ zfetch_fini(void) wmsum_fini(&zfetch_sums.zfetchstat_misses); wmsum_fini(&zfetch_sums.zfetchstat_max_streams); wmsum_fini(&zfetch_sums.zfetchstat_io_issued); + ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active)); + aggsum_fini(&zfetch_sums.zfetchstat_io_active); } /* @@ -294,6 +303,7 @@ dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued) zs->zs_more = B_TRUE; if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) dmu_zfetch_stream_fini(zs); + aggsum_add(&zfetch_sums.zfetchstat_io_active, -1); } /* @@ -407,20 +417,28 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, * Start prefetch from the demand access size (nblks). Double the * distance every access up to zfetch_min_distance. After that only * if needed increase the distance by 1/8 up to zfetch_max_distance. + * + * Don't double the distance beyond single block if we have more + * than ~6% of ARC held by active prefetches. It should help with + * getting out of RAM on some badly mispredicted read patterns. */ - unsigned int nbytes = nblks << zf->zf_dnode->dn_datablkshift; + unsigned int dbs = zf->zf_dnode->dn_datablkshift; + unsigned int nbytes = nblks << dbs; unsigned int pf_nblks; if (fetch_data) { if (unlikely(zs->zs_pf_dist < nbytes)) zs->zs_pf_dist = nbytes; - else if (zs->zs_pf_dist < zfetch_min_distance) + else if (zs->zs_pf_dist < zfetch_min_distance && + (zs->zs_pf_dist < (1 << dbs) || + aggsum_compare(&zfetch_sums.zfetchstat_io_active, + arc_c_max >> (4 + dbs)) < 0)) zs->zs_pf_dist *= 2; else if (zs->zs_more) zs->zs_pf_dist += zs->zs_pf_dist / 8; zs->zs_more = B_FALSE; if (zs->zs_pf_dist > zfetch_max_distance) zs->zs_pf_dist = zfetch_max_distance; - pf_nblks = zs->zs_pf_dist >> zf->zf_dnode->dn_datablkshift; + pf_nblks = zs->zs_pf_dist >> dbs; } else { pf_nblks = 0; } @@ -439,7 +457,7 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, zs->zs_ipf_dist *= 2; if (zs->zs_ipf_dist > zfetch_max_idistance) zs->zs_ipf_dist = zfetch_max_idistance; - pf_nblks = zs->zs_ipf_dist >> zf->zf_dnode->dn_datablkshift; + pf_nblks = zs->zs_ipf_dist >> dbs; if (zs->zs_ipf_start < zs->zs_pf_end) zs->zs_ipf_start = zs->zs_pf_end; if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks) @@ -510,6 +528,7 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) dmu_zfetch_stream_fini(zs); return; } + aggsum_add(&zfetch_sums.zfetchstat_io_active, issued); if (!have_lock) rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); -- cgit v1.2.3