aboutsummaryrefslogtreecommitdiffstats
path: root/module
diff options
context:
space:
mode:
authorAlexander Motin <[email protected]>2023-03-01 18:27:40 -0500
committerGitHub <[email protected]>2023-03-01 15:27:40 -0800
commit5f42d1dbf2854d0224f2f5853aab8153f78bdcc3 (patch)
tree74fb390a305f2b3729776ac63cadd094d0c9c312 /module
parentcd560c447407cdd73bf94170125351b204567d48 (diff)
System-wide speculative prefetch limit.
With some pathological access patterns it is possible to make ZFS accumulate an almost unlimited number of speculative prefetch ZIOs. Combined with linear ABD allocations in RAIDZ code, it appears to be possible to exhaust system KVA, triggering a kernel panic. Address this by introducing a system-wide counter of active prefetch requests and blocking prefetch distance doubling on per-stream hits if the number of active requests is higher than ~6% of ARC size. Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc. Closes #14516
Diffstat (limited to 'module')
-rw-r--r--module/zfs/dmu_zfetch.c29
1 files changed, 24 insertions, 5 deletions
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
index 76b8b5608..ffc012e6c 100644
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -28,6 +28,7 @@
*/
#include <sys/zfs_context.h>
+#include <sys/arc_impl.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
@@ -65,13 +66,15 @@ typedef struct zfetch_stats {
kstat_named_t zfetchstat_misses;
kstat_named_t zfetchstat_max_streams;
kstat_named_t zfetchstat_io_issued;
+ kstat_named_t zfetchstat_io_active;
} zfetch_stats_t;
static zfetch_stats_t zfetch_stats = {
{ "hits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 },
{ "max_streams", KSTAT_DATA_UINT64 },
- { "io_issued", KSTAT_DATA_UINT64 },
+ { "io_issued", KSTAT_DATA_UINT64 },
+ { "io_active", KSTAT_DATA_UINT64 },
};
struct {
@@ -79,6 +82,7 @@ struct {
wmsum_t zfetchstat_misses;
wmsum_t zfetchstat_max_streams;
wmsum_t zfetchstat_io_issued;
+ aggsum_t zfetchstat_io_active;
} zfetch_sums;
#define ZFETCHSTAT_BUMP(stat) \
@@ -104,6 +108,8 @@ zfetch_kstats_update(kstat_t *ksp, int rw)
wmsum_value(&zfetch_sums.zfetchstat_max_streams);
zs->zfetchstat_io_issued.value.ui64 =
wmsum_value(&zfetch_sums.zfetchstat_io_issued);
+ zs->zfetchstat_io_active.value.ui64 =
+ aggsum_value(&zfetch_sums.zfetchstat_io_active);
return (0);
}
@@ -114,6 +120,7 @@ zfetch_init(void)
wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
+ aggsum_init(&zfetch_sums.zfetchstat_io_active, 0);
zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
@@ -138,6 +145,8 @@ zfetch_fini(void)
wmsum_fini(&zfetch_sums.zfetchstat_misses);
wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
+ ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active));
+ aggsum_fini(&zfetch_sums.zfetchstat_io_active);
}
/*
@@ -294,6 +303,7 @@ dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
zs->zs_more = B_TRUE;
if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
dmu_zfetch_stream_fini(zs);
+ aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
}
/*
@@ -407,20 +417,28 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
* Start prefetch from the demand access size (nblks). Double the
* distance every access up to zfetch_min_distance. After that only
* if needed increase the distance by 1/8 up to zfetch_max_distance.
+ *
+ * Don't double the distance beyond single block if we have more
+ * than ~6% of ARC held by active prefetches. It should help with
+ * getting out of RAM on some badly mispredicted read patterns.
*/
- unsigned int nbytes = nblks << zf->zf_dnode->dn_datablkshift;
+ unsigned int dbs = zf->zf_dnode->dn_datablkshift;
+ unsigned int nbytes = nblks << dbs;
unsigned int pf_nblks;
if (fetch_data) {
if (unlikely(zs->zs_pf_dist < nbytes))
zs->zs_pf_dist = nbytes;
- else if (zs->zs_pf_dist < zfetch_min_distance)
+ else if (zs->zs_pf_dist < zfetch_min_distance &&
+ (zs->zs_pf_dist < (1 << dbs) ||
+ aggsum_compare(&zfetch_sums.zfetchstat_io_active,
+ arc_c_max >> (4 + dbs)) < 0))
zs->zs_pf_dist *= 2;
else if (zs->zs_more)
zs->zs_pf_dist += zs->zs_pf_dist / 8;
zs->zs_more = B_FALSE;
if (zs->zs_pf_dist > zfetch_max_distance)
zs->zs_pf_dist = zfetch_max_distance;
- pf_nblks = zs->zs_pf_dist >> zf->zf_dnode->dn_datablkshift;
+ pf_nblks = zs->zs_pf_dist >> dbs;
} else {
pf_nblks = 0;
}
@@ -439,7 +457,7 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
zs->zs_ipf_dist *= 2;
if (zs->zs_ipf_dist > zfetch_max_idistance)
zs->zs_ipf_dist = zfetch_max_idistance;
- pf_nblks = zs->zs_ipf_dist >> zf->zf_dnode->dn_datablkshift;
+ pf_nblks = zs->zs_ipf_dist >> dbs;
if (zs->zs_ipf_start < zs->zs_pf_end)
zs->zs_ipf_start = zs->zs_pf_end;
if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
@@ -510,6 +528,7 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
dmu_zfetch_stream_fini(zs);
return;
}
+ aggsum_add(&zfetch_sums.zfetchstat_io_active, issued);
if (!have_lock)
rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);