-rw-r--r--  man/man5/zfs-module-parameters.5 | 14
-rw-r--r--  module/zfs/arc.c                 | 89
2 files changed, 80 insertions(+), 23 deletions(-)
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 4b3dc3666..fe31e292a 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -414,6 +414,20 @@ Default value: \fB10,000\fR.
.sp
.ne 2
.na
+\fBzfs_arc_meta_adjust_restarts\fR (ulong)
+.ad
+.RS 12n
+The number of restart passes to make while scanning the ARC attempting
+to free buffers in order to stay below the \fBzfs_arc_meta_limit\fR.
+This value should not need to be tuned but is available to facilitate
+performance analysis.
+.sp
+Default value: \fB4096\fR.
+.RE
+
+.sp
+.ne 2
+.na
\fBzfs_arc_min\fR (ulong)
.ad
.RS 12n
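Reading aid (not part of the patch): because the parameter is registered with mode 0644 in the arc.c hunk below, it is exposed through the standard module-parameter sysfs interface and can be inspected or changed at runtime for the performance analysis the man page mentions. The following minimal user-space C sketch assumes the conventional /sys/module/zfs/parameters path; the program itself is illustrative only.

/* Sketch: read the current zfs_arc_meta_adjust_restarts value from sysfs. */
#include <stdio.h>

int main(void)
{
	const char *path =
	    "/sys/module/zfs/parameters/zfs_arc_meta_adjust_restarts";
	FILE *fp = fopen(path, "r");
	unsigned long restarts;

	if (fp == NULL) {
		perror(path);
		return 1;
	}
	if (fscanf(fp, "%lu", &restarts) != 1) {
		fprintf(stderr, "unexpected contents in %s\n", path);
		fclose(fp);
		return 1;
	}
	fclose(fp);

	printf("zfs_arc_meta_adjust_restarts = %lu\n", restarts);
	return 0;
}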
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index f9f0008c0..bdf116c35 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -221,6 +221,11 @@ unsigned long zfs_arc_max = 0;
unsigned long zfs_arc_min = 0;
unsigned long zfs_arc_meta_limit = 0;
+/*
+ * Limit the number of restarts in arc_adjust_meta()
+ */
+unsigned long zfs_arc_meta_adjust_restarts = 4096;
+
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
@@ -2195,15 +2200,30 @@ arc_do_user_evicts(void)
}
/*
- * Evict only meta data objects from the cache leaving the data objects.
- * This is only used to enforce the tunable arc_meta_limit, if we are
- * unable to evict enough buffers notify the user via the prune callback.
+ * The goal of this function is to evict enough meta data buffers from the
+ * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
+ * more complicated than it appears because it is common for data buffers
+ * to have holds on meta data buffers. In addition, dnode meta data buffers
+ * will be held by the dnodes in the block, preventing them from being freed.
+ * This means we can't simply traverse the ARC and expect to always find
+ * enough unheld meta data buffers to release.
+ *
+ * Therefore, this function has been updated to make alternating passes
+ * over the ARC releasing data buffers and then newly unheld meta data
+ * buffers. This ensures forward progress is maintained and arc_meta_used
+ * will decrease. Normally this is sufficient, but if required the ARC
+ * will call the registered prune callbacks, causing dentries and inodes to
+ * be dropped from the VFS cache. This will make dnode meta data buffers
+ * available for reclaim.
*/
static void
arc_adjust_meta(void)
{
- int64_t adjustmnt, delta;
+ int64_t adjustmnt, delta, prune = 0;
+ arc_buf_contents_t type = ARC_BUFC_DATA;
+ unsigned long restarts = zfs_arc_meta_adjust_restarts;
+restart:
/*
* This slightly differs from the way we evict from the mru in
* arc_adjust because we don't have a "target" value (i.e. no
@@ -2214,9 +2234,9 @@ arc_adjust_meta(void)
*/
adjustmnt = arc_meta_used - arc_meta_limit;
- if (adjustmnt > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustmnt);
- arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
+ if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) {
+ delta = MIN(arc_mru->arcs_lsize[type], adjustmnt);
+ arc_evict(arc_mru, 0, delta, FALSE, type);
adjustmnt -= delta;
}
@@ -2230,31 +2250,50 @@ arc_adjust_meta(void)
* simply decrement the amount of data evicted from the MRU.
*/
- if (adjustmnt > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustmnt);
- arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
+ if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) {
+ delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt);
+ arc_evict(arc_mfu, 0, delta, FALSE, type);
}
- adjustmnt = arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
- arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
+ adjustmnt = arc_meta_used - arc_meta_limit;
- if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
delta = MIN(adjustmnt,
- arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]);
- arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_METADATA);
+ arc_mru_ghost->arcs_lsize[type]);
+ arc_evict_ghost(arc_mru_ghost, 0, delta, type);
+ adjustmnt -= delta;
}
- adjustmnt = arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
- arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
-
- if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) {
delta = MIN(adjustmnt,
- arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]);
- arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_METADATA);
+ arc_mfu_ghost->arcs_lsize[type]);
+ arc_evict_ghost(arc_mfu_ghost, 0, delta, type);
}
- if (arc_meta_used > arc_meta_limit)
- arc_do_user_prune(zfs_arc_meta_prune);
+ /*
+ * If, after attempting to make the requested adjustment to the ARC,
+ * the meta limit is still being exceeded, then request that the
+ * higher layers drop some cached objects which have holds on ARC
+ * meta buffers. Requests to the upper layers will be made with
+ * increasingly large scan sizes until the ARC is below the limit.
+ */
+ if (arc_meta_used > arc_meta_limit) {
+ if (type == ARC_BUFC_DATA) {
+ type = ARC_BUFC_METADATA;
+ } else {
+ type = ARC_BUFC_DATA;
+
+ if (zfs_arc_meta_prune) {
+ prune += zfs_arc_meta_prune;
+ arc_do_user_prune(prune);
+ }
+ }
+
+ if (restarts > 0) {
+ restarts--;
+ goto restart;
+ }
+ }
}
/*
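For readers following the new control flow above, here is a minimal stand-alone C sketch (not the arc.c code itself) of the pattern this hunk introduces: alternate eviction passes between data and metadata, escalate the prune request after each full data+metadata cycle, and bound the number of restarts with the new tunable. All sizes and helper names below are hypothetical stand-ins for the real ARC state.

#include <stdio.h>

enum buf_type { BUFC_DATA, BUFC_METADATA };

static unsigned long meta_used  = 12000;	/* hypothetical bytes of cached metadata */
static unsigned long meta_limit = 4000;		/* hypothetical arc_meta_limit */
static unsigned long meta_prune = 1000;		/* hypothetical zfs_arc_meta_prune */
static unsigned long adjust_restarts = 4096;	/* default of the new tunable */

/* Stand-in for one eviction pass over the given buffer type. */
static void evict_pass(enum buf_type type)
{
	/* Pretend metadata passes free more once data holds are released. */
	unsigned long freed = (type == BUFC_METADATA) ? 600 : 200;

	meta_used = (meta_used > freed) ? meta_used - freed : 0;
}

/* Stand-in for arc_do_user_prune(): ask the VFS to drop cached objects. */
static void user_prune(unsigned long amount)
{
	printf("prune request: scan %lu objects\n", amount);
}

int main(void)
{
	enum buf_type type = BUFC_DATA;
	unsigned long restarts = adjust_restarts;
	unsigned long prune = 0;

	while (meta_used > meta_limit && restarts > 0) {
		restarts--;
		evict_pass(type);

		if (meta_used <= meta_limit)
			break;

		if (type == BUFC_DATA) {
			/* Next pass targets newly unheld metadata. */
			type = BUFC_METADATA;
		} else {
			/* A full data+metadata cycle is done: escalate prune. */
			type = BUFC_DATA;
			prune += meta_prune;
			user_prune(prune);
		}
	}

	printf("done: meta_used=%lu (limit %lu)\n", meta_used, meta_limit);
	return 0;
}

The escalation mirrors the patch: each time both pass types fail to get below the limit, the cumulative prune amount grows by zfs_arc_meta_prune, so the VFS is asked to scan progressively more objects while zfs_arc_meta_adjust_restarts caps the total number of passes.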
@@ -5609,6 +5648,10 @@ MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
module_param(zfs_arc_meta_prune, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");
+module_param(zfs_arc_meta_adjust_restarts, ulong, 0644);
+MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
+ "Limit number of restarts in arc_adjust_meta");
+
module_param(zfs_arc_grow_retry, int, 0644);
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
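As a general illustration (not part of the patch) of how the module_param()/MODULE_PARM_DESC() calls above expose a tunable, the following minimal kernel-module sketch declares a writable ulong parameter; with mode 0644 it appears under /sys/module/<module>/parameters/ and can be changed at runtime. The names here are hypothetical.

#include <linux/module.h>
#include <linux/moduleparam.h>

/* Hypothetical tunable; mode 0644 makes it readable and writable via sysfs. */
static unsigned long example_restarts = 4096;
module_param(example_restarts, ulong, 0644);
MODULE_PARM_DESC(example_restarts, "Example restart limit tunable");

static int __init example_init(void)
{
	pr_info("example_restarts=%lu\n", example_restarts);
	return 0;
}

static void __exit example_exit(void)
{
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");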