Reduce number of atomic_add() calls in aggsum

Previous code used 4 atomics to do aggsum_flush_bucket() and 2 more to re-borrow after the flush. But since asc_borrowed and asc_delta are accessed only while holding asc_lock, it makes no any sense to modify as_lower_bound and as_upper_bound in multiple steps. Instead of that the new code uses only 2 atomics in all the cases, one per as_*_bound variable. I think even that is overkill, simple atomic store and load could be used here, since all modifications are done under the as_lock, but there are no such primitives in ZFS code now. While there, make borrow code consider previous borrow value, so that on mixed request patterns reduce chance of needing to borrow again if much larger request follows tiny one that needed borrow. Also reduce as_numbuckets from uint64_t to u_int. It makes no sense to use so large division operation on every aggsum_add(). Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Paul Dagnelie <[email protected]> Signed-off-by: Alexander Motin <[email protected]> Sponsored-By: iXsystems, Inc. Closes #9930
author: Alexander Motin <[email protected]> 2020-02-06 16:21:06 -0500
committer: GitHub <[email protected]> 2020-02-06 13:21:06 -0800
commit: e0ce98d57c44b324ffa99f0620ef8721814fc43e (patch)
tree: d4c2759dd507d4d5bce8b0da685edb38121fa111
parent: 77122f9d68a3a6e3fbd266652467acbd7b5d3225 (diff)
2 files changed, 34 insertions, 33 deletions
diff --git a/include/sys/aggsum.h b/include/sys/aggsum.h
index caa08d773..cb43727f1 100644
--- a/include/sys/aggsum.h
+++ b/include/sys/aggsum.h
@@ -40,7 +40,7 @@ typedef struct aggsum {
 	kmutex_t as_lock;
 	int64_t as_lower_bound;
 	int64_t as_upper_bound;
-	uint64_t as_numbuckets;
+	uint_t as_numbuckets;
 	aggsum_bucket_t *as_buckets;
 } aggsum_t;
 
diff --git a/module/zfs/aggsum.c b/module/zfs/aggsum.c
index ace3a83a5..a2fec2774 100644
--- a/module/zfs/aggsum.c
+++ b/module/zfs/aggsum.c
@@ -125,13 +125,11 @@ aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
 	 * We use atomic instructions for this because we read the upper and
 	 * lower bounds without the lock, so we need stores to be atomic.
 	 */
-	atomic_add_64((volatile uint64_t *)&as->as_lower_bound, asb->asc_delta);
-	atomic_add_64((volatile uint64_t *)&as->as_upper_bound, asb->asc_delta);
-	asb->asc_delta = 0;
-	atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
-	    -asb->asc_borrowed);
 	atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
-	    asb->asc_borrowed);
+	    asb->asc_delta + asb->asc_borrowed);
+	atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
+	    asb->asc_delta - asb->asc_borrowed);
+	asb->asc_delta = 0;
 	asb->asc_borrowed = 0;
 }
 
@@ -163,43 +161,46 @@ aggsum_value(aggsum_t *as)
 	return (rv);
 }
 
-static void
-aggsum_borrow(aggsum_t *as, int64_t delta, struct aggsum_bucket *asb)
-{
-	int64_t abs_delta = (delta < 0 ? -delta : delta);
-	mutex_enter(&as->as_lock);
-	mutex_enter(&asb->asc_lock);
-
-	aggsum_flush_bucket(as, asb);
-
-	atomic_add_64((volatile uint64_t *)&as->as_upper_bound, abs_delta);
-	atomic_add_64((volatile uint64_t *)&as->as_lower_bound, -abs_delta);
-	asb->asc_borrowed = abs_delta;
-
-	mutex_exit(&asb->asc_lock);
-	mutex_exit(&as->as_lock);
-}
-
 void
 aggsum_add(aggsum_t *as, int64_t delta)
 {
 	struct aggsum_bucket *asb;
+	int64_t borrow;
 
 	kpreempt_disable();
 	asb = &as->as_buckets[CPU_SEQID % as->as_numbuckets];
 	kpreempt_enable();
 
-	for (;;) {
-		mutex_enter(&asb->asc_lock);
-		if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed &&
-		    asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) {
-			asb->asc_delta += delta;
-			mutex_exit(&asb->asc_lock);
-			return;
-		}
+	/* Try fast path if we already borrowed enough before. */
+	mutex_enter(&asb->asc_lock);
+	if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed &&
+	    asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) {
+		asb->asc_delta += delta;
 		mutex_exit(&asb->asc_lock);
-		aggsum_borrow(as, delta * aggsum_borrow_multiplier, asb);
+		return;
 	}
+	mutex_exit(&asb->asc_lock);
+
+	/*
+	 * We haven't borrowed enough.  Take the global lock and borrow
+	 * considering what is requested now and what we borrowed before.
+	 */
+	borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier;
+	mutex_enter(&as->as_lock);
+	mutex_enter(&asb->asc_lock);
+	delta += asb->asc_delta;
+	asb->asc_delta = 0;
+	if (borrow >= asb->asc_borrowed)
+		borrow -= asb->asc_borrowed;
+	else
+		borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
+	asb->asc_borrowed += borrow;
+	atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
+	    delta - borrow);
+	atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
+	    delta + borrow);
+	mutex_exit(&asb->asc_lock);
+	mutex_exit(&as->as_lock);
 }
 
 /*
author	Alexander Motin <[email protected]>	2020-02-06 16:21:06 -0500
committer	GitHub <[email protected]>	2020-02-06 13:21:06 -0800
commit	e0ce98d57c44b324ffa99f0620ef8721814fc43e (patch)
tree	d4c2759dd507d4d5bce8b0da685edb38121fa111
parent	77122f9d68a3a6e3fbd266652467acbd7b5d3225 (diff)