6 files changed, 40 insertions, 20 deletions
diff --git a/module/zfs/aggsum.c b/module/zfs/aggsum.c
index e38f4a66c..e46da95f6 100644
--- a/module/zfs/aggsum.c
+++ b/module/zfs/aggsum.c
@@ -70,6 +70,11 @@
  * zeroing out the borrowed value (forcing that thread to borrow on its next
  * request, which will also be expensive).  This is what makes aggsums well
  * suited for write-many read-rarely operations.
+ *
+ * Note that aggsums do not grow when CPUs are hot-added; in that case there
+ * is less fanout than boot_ncpus. This is a deliberate trade-off: we don't
+ * want to reserve RAM up front for slots that extra CPUs might need, and
+ * adding them dynamically is a complex task.
  */
 
 /*
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index efc6bb138..1bc27391c 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -7593,6 +7593,15 @@ arc_target_bytes(void)
 }
 
 void
+arc_set_limits(uint64_t allmem)
+{
+	/* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */
+	arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT);
+
+	/* How to set default max varies by platform. */
+	arc_c_max = arc_default_max(arc_c_min, allmem);
+}
+void
 arc_init(void)
 {
 	uint64_t percent, allmem = arc_all_memory();
@@ -7607,11 +7616,7 @@ arc_init(void)
 	arc_lowmem_init();
 #endif
 
-	/* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */
-	arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT);
-
-	/* How to set default max varies by platform. */
-	arc_c_max = arc_default_max(arc_c_min, allmem);
+	arc_set_limits(allmem);
 
 #ifndef _KERNEL
 	/*
@@ -7648,6 +7653,8 @@ arc_init(void)
 	if (arc_c < arc_c_min)
 		arc_c = arc_c_min;
 
+	arc_register_hotplug();
+
 	arc_state_init();
 
 	buf_init();
@@ -7656,8 +7663,9 @@ arc_init(void)
 	    offsetof(arc_prune_t, p_node));
 	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
 
-	arc_prune_taskq = taskq_create("arc_prune", boot_ncpus, defclsyspri,
-	    boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+	arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri,
+	    boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+	    TASKQ_THREADS_CPU_PCT);
 
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
@@ -7754,6 +7762,8 @@ arc_fini(void)
 	buf_fini();
 	arc_state_fini();
 
+	arc_unregister_hotplug();
+
 	/*
 	 * We destroy the zthrs after all the ARC state has been
 	 * torn down to avoid the case of them receiving any
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 3a2028625..8f6675820 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -220,11 +220,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
-	dp->dp_zrele_taskq = taskq_create("z_zrele", boot_ncpus, defclsyspri,
-	    boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+	dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri,
+	    boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+	    TASKQ_THREADS_CPU_PCT);
 	dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
-	    boot_ncpus, defclsyspri, boot_ncpus, INT_MAX,
-	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+	    100, defclsyspri, boot_ncpus, INT_MAX,
+	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 
 	return (dp);
 }
diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c
index a3adfd317..36c0d33bf 100644
--- a/module/zfs/multilist.c
+++ b/module/zfs/multilist.c
@@ -96,9 +96,12 @@ multilist_create_impl(size_t size, size_t offset,
 }
 
 /*
- * Allocate a new multilist, using the default number of sublists
- * (the number of CPUs, or at least 4, or the tunable
- * zfs_multilist_num_sublists).
+ * Allocate a new multilist, using the default number of sublists (the number
+ * of CPUs, or at least 4, or the tunable zfs_multilist_num_sublists). Note
+ * that multilists do not grow when CPUs are hot-added; in that case there is
+ * less fanout than boot_ncpus. This is a deliberate trade-off: we don't want
+ * to reserve RAM up front for slots that extra CPUs might need, and adding
+ * them dynamically is a complex task.
  */
 multilist_t *
 multilist_create(size_t size, size_t offset,
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 65c907d9d..50822cfae 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1281,15 +1281,15 @@ spa_activate(spa_t *spa, spa_mode_t mode)
 	 * pool traverse code from monopolizing the global (and limited)
 	 * system_taskq by inappropriately scheduling long running tasks on it.
 	 */
-	spa->spa_prefetch_taskq = taskq_create("z_prefetch", boot_ncpus,
-	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC);
+	spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
+	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 
 	/*
 	 * The taskq to upgrade datasets in this pool. Currently used by
 	 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
 	 */
-	spa->spa_upgrade_taskq = taskq_create("z_upgrade", boot_ncpus,
-	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC);
+	spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
+	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 }
 
 /*
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index 420244abb..3efd26155 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -446,8 +446,9 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
 			 * Commit callback taskq hasn't been created yet.
 			 */
 			tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
-			    boot_ncpus, defclsyspri, boot_ncpus, boot_ncpus * 2,
-			    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+			    100, defclsyspri, boot_ncpus, boot_ncpus * 2,
+			    TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+			    TASKQ_THREADS_CPU_PCT);
 		}
 
 		cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);