aboutsummaryrefslogtreecommitdiffstats
path: root/module/zfs/arc.c
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2015-03-17 15:08:22 -0700
committerBrian Behlendorf <[email protected]>2015-03-20 10:35:20 -0700
commitbc88866657979c5658441e201e19df365c67ddfe (patch)
tree2725fb8c50d648642b2d3a9f10b0114aeb8aff0b /module/zfs/arc.c
parent2cbb06b561f500732de2214eb590149d0c4f3cf5 (diff)
Fix arc_adjust_meta() behavior
The goal of this function is to evict enough meta data buffers from the ARC in order to enforce the arc_meta_limit. Achieving this is slightly more complicated than it appears because it is common for data buffers to have holds on meta data buffers. In addition, dnode meta data buffers will be held by the dnodes in the block preventing them from being freed. This means we can't simply traverse the ARC and expect to always find enough unheld meta data buffers to release. Therefore, this function has been updated to make alternating passes over the ARC releasing data buffers and then newly unheld meta data buffers. This ensures forward progress is maintained and arc_meta_used will decrease. Normally this is sufficient, but if required the ARC will call the registered prune callbacks causing dentries and inodes to be dropped from the VFS cache. This will make dnode meta data buffers available for reclaim. The number of total restarts is limited by zfs_arc_meta_adjust_restarts to prevent spinning in the rare case where all meta data is pinned. Signed-off-by: Brian Behlendorf <[email protected]> Signed-off-by: Pavel Snajdr <[email protected]> Issue #3160
Diffstat (limited to 'module/zfs/arc.c')
-rw-r--r--module/zfs/arc.c89
1 files changed, 66 insertions, 23 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index f9f0008c0..bdf116c35 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -221,6 +221,11 @@ unsigned long zfs_arc_max = 0;
unsigned long zfs_arc_min = 0;
unsigned long zfs_arc_meta_limit = 0;
+/*
+ * Limit the number of restarts in arc_adjust_meta()
+ */
+unsigned long zfs_arc_meta_adjust_restarts = 4096;
+
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
@@ -2195,15 +2200,30 @@ arc_do_user_evicts(void)
}
/*
- * Evict only meta data objects from the cache leaving the data objects.
- * This is only used to enforce the tunable arc_meta_limit, if we are
- * unable to evict enough buffers notify the user via the prune callback.
+ * The goal of this function is to evict enough meta data buffers from the
+ * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
+ * more complicated than it appears because it is common for data buffers
+ * to have holds on meta data buffers. In addition, dnode meta data buffers
+ * will be held by the dnodes in the block preventing them from being freed.
+ * This means we can't simply traverse the ARC and expect to always find
+ * enough unheld meta data buffer to release.
+ *
+ * Therefore, this function has been updated to make alternating passes
+ * over the ARC releasing data buffers and then newly unheld meta data
+ * buffers. This ensures forward progress is maintained and arc_meta_used
+ * will decrease. Normally this is sufficient, but if required the ARC
+ * will call the registered prune callbacks causing dentry and inodes to
+ * be dropped from the VFS cache. This will make dnode meta data buffers
+ * available for reclaim.
*/
static void
arc_adjust_meta(void)
{
- int64_t adjustmnt, delta;
+ int64_t adjustmnt, delta, prune = 0;
+ arc_buf_contents_t type = ARC_BUFC_DATA;
+ unsigned long restarts = zfs_arc_meta_adjust_restarts;
+restart:
/*
* This slightly differs than the way we evict from the mru in
* arc_adjust because we don't have a "target" value (i.e. no
@@ -2214,9 +2234,9 @@ arc_adjust_meta(void)
*/
adjustmnt = arc_meta_used - arc_meta_limit;
- if (adjustmnt > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustmnt);
- arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
+ if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) {
+ delta = MIN(arc_mru->arcs_lsize[type], adjustmnt);
+ arc_evict(arc_mru, 0, delta, FALSE, type);
adjustmnt -= delta;
}
@@ -2230,31 +2250,50 @@ arc_adjust_meta(void)
* simply decrement the amount of data evicted from the MRU.
*/
- if (adjustmnt > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustmnt);
- arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
+ if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) {
+ delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt);
+ arc_evict(arc_mfu, 0, delta, FALSE, type);
}
- adjustmnt = arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
- arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
+ adjustmnt = arc_meta_used - arc_meta_limit;
- if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
delta = MIN(adjustmnt,
- arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]);
- arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_METADATA);
+ arc_mru_ghost->arcs_lsize[type]);
+ arc_evict_ghost(arc_mru_ghost, 0, delta, type);
+ adjustmnt -= delta;
}
- adjustmnt = arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
- arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
-
- if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) {
delta = MIN(adjustmnt,
- arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]);
- arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_METADATA);
+ arc_mfu_ghost->arcs_lsize[type]);
+ arc_evict_ghost(arc_mfu_ghost, 0, delta, type);
}
- if (arc_meta_used > arc_meta_limit)
- arc_do_user_prune(zfs_arc_meta_prune);
+ /*
+ * If after attempting to make the requested adjustment to the ARC
+ * the meta limit is still being exceeded then request that the
+ * higher layers drop some cached objects which have holds on ARC
+ * meta buffers. Requests to the upper layers will be made with
+ * increasingly large scan sizes until the ARC is below the limit.
+ */
+ if (arc_meta_used > arc_meta_limit) {
+ if (type == ARC_BUFC_DATA) {
+ type = ARC_BUFC_METADATA;
+ } else {
+ type = ARC_BUFC_DATA;
+
+ if (zfs_arc_meta_prune) {
+ prune += zfs_arc_meta_prune;
+ arc_do_user_prune(prune);
+ }
+ }
+
+ if (restarts > 0) {
+ restarts--;
+ goto restart;
+ }
+ }
}
/*
@@ -5609,6 +5648,10 @@ MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
module_param(zfs_arc_meta_prune, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");
+module_param(zfs_arc_meta_adjust_restarts, ulong, 0644);
+MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
+ "Limit number of restarts in arc_adjust_meta");
+
module_param(zfs_arc_grow_retry, int, 0644);
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");