14 files changed, 290 insertions, 36 deletions
diff --git a/config/kernel-hotplug.m4 b/config/kernel-hotplug.m4
new file mode 100644
index 000000000..e796a6d2e
--- /dev/null
+++ b/config/kernel-hotplug.m4
@@ -0,0 +1,26 @@
+dnl #
+dnl # 4.6 API change
+dnl # Added CPU hotplug APIs; probes <linux/cpuhotplug.h> cpuhp_* calls
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_CPU_HOTPLUG], [
+	ZFS_LINUX_TEST_SRC([cpu_hotplug], [
+		#include <linux/cpuhotplug.h>
+	],[
+		enum cpuhp_state state = CPUHP_ONLINE;
+		int (*fp)(unsigned int, struct hlist_node *) = NULL;
+		cpuhp_state_add_instance_nocalls(0, (struct hlist_node *)NULL);
+		cpuhp_state_remove_instance_nocalls(0, (struct hlist_node *)NULL);
+		cpuhp_setup_state_multi(state, "", fp, fp);
+		cpuhp_remove_multi_state(0);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_CPU_HOTPLUG], [
+	AC_MSG_CHECKING([whether CPU hotplug APIs exist])
+	ZFS_LINUX_TEST_RESULT([cpu_hotplug], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_CPU_HOTPLUG, 1, [yes])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel.m4 b/config/kernel.m4
index 3e01daa5e..14a8d4c58 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -124,6 +124,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_TOTALHIGH_PAGES
 	ZFS_AC_KERNEL_SRC_KSTRTOUL
 	ZFS_AC_KERNEL_SRC_PERCPU
+	ZFS_AC_KERNEL_SRC_CPU_HOTPLUG
 
 	AC_MSG_CHECKING([for available kernel interfaces])
 	ZFS_LINUX_TEST_COMPILE_ALL([kabi])
@@ -221,6 +222,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_TOTALHIGH_PAGES
 	ZFS_AC_KERNEL_KSTRTOUL
 	ZFS_AC_KERNEL_PERCPU
+	ZFS_AC_KERNEL_CPU_HOTPLUG
 ])
 
 dnl #
diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h
index 16f4349e7..b50175a10 100644
--- a/include/os/linux/spl/sys/taskq.h
+++ b/include/os/linux/spl/sys/taskq.h
@@ -84,6 +84,8 @@ typedef struct taskq {
 	int			tq_nthreads;	/* # of existing threads */
 	int			tq_nspawn;	/* # of threads being spawned */
 	int			tq_maxthreads;	/* # of threads maximum */
+	/* With TASKQ_THREADS_CPU_PCT: percent of NCPUs to have as threads */
+	int			tq_cpu_pct;
 	int			tq_pri;		/* priority */
 	int			tq_minalloc;	/* min taskq_ent_t pool size */
 	int			tq_maxalloc;	/* max taskq_ent_t pool size */
@@ -99,6 +101,9 @@ typedef struct taskq {
 	spl_wait_queue_head_t	tq_work_waitq;	/* new work waitq */
 	spl_wait_queue_head_t	tq_wait_waitq;	/* wait waitq */
 	tq_lock_role_t		tq_lock_class;	/* class when taking tq_lock */
+	/* list node for the cpu hotplug callback */
+	struct hlist_node	tq_hp_cb_node;
+	boolean_t		tq_hp_support;
 } taskq_t;
 
 typedef struct taskq_ent {
diff --git a/include/sys/arc.h b/include/sys/arc.h
index 9ade1a432..f58fa53b6 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -310,6 +310,7 @@ int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);
 uint64_t arc_all_memory(void);
 uint64_t arc_default_max(uint64_t min, uint64_t allmem);
 uint64_t arc_target_bytes(void);
+void arc_set_limits(uint64_t);
 void arc_init(void);
 void arc_fini(void);
 
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index b08a42c75..6adcd4b54 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -926,6 +926,8 @@ extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg);
 extern uint64_t arc_free_memory(void);
 extern int64_t arc_available_memory(void);
 extern void arc_tuning_update(boolean_t);
+extern void arc_register_hotplug(void);
+extern void arc_unregister_hotplug(void);
 
 extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS);
 extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c
index 94df75003..4fc7468bf 100644
--- a/module/os/freebsd/zfs/arc_os.c
+++ b/module/os/freebsd/zfs/arc_os.c
@@ -243,3 +243,13 @@ arc_lowmem_fini(void)
 	if (arc_event_lowmem != NULL)
 		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
 }
+
+void
+arc_register_hotplug(void)
+{	/* No-op: the FreeBSD port registers no hotplug callback here. */
+}
+
+void
+arc_unregister_hotplug(void)
+{	/* No-op: nothing was registered by arc_register_hotplug(). */
+}
diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c
index fafadffc7..e8d89bfea 100644
--- a/module/os/linux/spl/spl-taskq.c
+++ b/module/os/linux/spl/spl-taskq.c
@@ -28,6 +28,9 @@
 #include <sys/kmem.h>
 #include <sys/tsd.h>
 #include <sys/trace_spl.h>
+#ifdef HAVE_CPU_HOTPLUG
+#include <linux/cpuhotplug.h>
+#endif
 
 int spl_taskq_thread_bind = 0;
 module_param(spl_taskq_thread_bind, int, 0644);
@@ -35,7 +38,7 @@ MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
 
 
 int spl_taskq_thread_dynamic = 1;
-module_param(spl_taskq_thread_dynamic, int, 0644);
+module_param(spl_taskq_thread_dynamic, int, 0444);
 MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads");
 
 int spl_taskq_thread_priority = 1;
@@ -59,6 +62,11 @@ EXPORT_SYMBOL(system_delay_taskq);
 static taskq_t *dynamic_taskq;
 static taskq_thread_t *taskq_thread_create(taskq_t *);
 
+#ifdef HAVE_CPU_HOTPLUG
+/* Multi-callback id for cpu hotplugging. */
+static int spl_taskq_cpuhp_state;
+#endif
+
 /* List of all taskqs */
 LIST_HEAD(tq_list);
 struct rw_semaphore tq_list_sem;
@@ -1024,13 +1032,14 @@ taskq_thread_create(taskq_t *tq)
 }
 
 taskq_t *
-taskq_create(const char *name, int nthreads, pri_t pri,
+taskq_create(const char *name, int threads_arg, pri_t pri,
     int minalloc, int maxalloc, uint_t flags)
 {
 	taskq_t *tq;
 	taskq_thread_t *tqt;
 	int count = 0, rc = 0, i;
 	unsigned long irqflags;
+	int nthreads = threads_arg;
 
 	ASSERT(name != NULL);
 	ASSERT(minalloc >= 0);
@@ -1041,15 +1050,27 @@ taskq_create(const char *name, int nthreads, pri_t pri,
 	if (flags & TASKQ_THREADS_CPU_PCT) {
 		ASSERT(nthreads <= 100);
 		ASSERT(nthreads >= 0);
-		nthreads = MIN(nthreads, 100);
+		nthreads = MIN(threads_arg, 100);
 		nthreads = MAX(nthreads, 0);
-		nthreads = MAX((num_online_cpus() * nthreads) / 100, 1);
+		nthreads = MAX((num_online_cpus() * nthreads) /100, 1);
 	}
 
 	tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE);
 	if (tq == NULL)
 		return (NULL);
 
+	tq->tq_hp_support = B_FALSE;
+#ifdef HAVE_CPU_HOTPLUG
+	if (flags & TASKQ_THREADS_CPU_PCT) {
+		tq->tq_hp_support = B_TRUE;
+		if (cpuhp_state_add_instance_nocalls(spl_taskq_cpuhp_state,
+		    &tq->tq_hp_cb_node) != 0) {
+			kmem_free(tq, sizeof (*tq));
+			return (NULL);
+		}
+	}
+#endif
+
 	spin_lock_init(&tq->tq_lock);
 	INIT_LIST_HEAD(&tq->tq_thread_list);
 	INIT_LIST_HEAD(&tq->tq_active_list);
@@ -1058,6 +1079,7 @@ taskq_create(const char *name, int nthreads, pri_t pri,
 	tq->tq_nthreads = 0;
 	tq->tq_nspawn = 0;
 	tq->tq_maxthreads = nthreads;
+	tq->tq_cpu_pct = threads_arg;
 	tq->tq_pri = pri;
 	tq->tq_minalloc = minalloc;
 	tq->tq_maxalloc = maxalloc;
@@ -1131,6 +1153,12 @@ taskq_destroy(taskq_t *tq)
 	tq->tq_flags &= ~TASKQ_ACTIVE;
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
+#ifdef HAVE_CPU_HOTPLUG
+	if (tq->tq_hp_support) {
+		VERIFY0(cpuhp_state_remove_instance_nocalls(
+		    spl_taskq_cpuhp_state, &tq->tq_hp_cb_node));
+	}
+#endif
 	/*
 	 * When TASKQ_ACTIVE is clear new tasks may not be added nor may
 	 * new worker threads be spawned for dynamic taskq.
@@ -1198,7 +1226,6 @@ taskq_destroy(taskq_t *tq)
 }
 EXPORT_SYMBOL(taskq_destroy);
 
-
 static unsigned int spl_taskq_kick = 0;
 
 /*
@@ -1255,12 +1282,96 @@ module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint,
 MODULE_PARM_DESC(spl_taskq_kick,
 	"Write nonzero to kick stuck taskqs to spawn more threads");
 
+#ifdef HAVE_CPU_HOTPLUG
+/*
+ * CPU hotplug "online" callback: invoked once per CPU that comes online, for
+ * each taskq registered with TASKQ_THREADS_CPU_PCT. Recomputes tq_maxthreads
+ * from tq_cpu_pct and, for non-dynamic taskqs, creates the additional worker
+ * thread. Returns 0 on success or -ENOMEM if the thread cannot be created.
+ */
+static int
+spl_taskq_expand(unsigned int cpu, struct hlist_node *node)
+{
+	taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node);
+	boolean_t spawn = B_FALSE;
+	unsigned long flags;
+
+	ASSERT(tq);
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+	if (tq->tq_flags & TASKQ_ACTIVE) {
+		ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
+		int nthreads = MIN(tq->tq_cpu_pct, 100);
+		nthreads = MAX(((num_online_cpus() + 1) * nthreads) / 100, 1);
+		tq->tq_maxthreads = nthreads;
+
+		/* Dynamic taskqs spawn on demand; others get a thread now. */
+		spawn = !((tq->tq_flags & TASKQ_DYNAMIC) &&
+		    spl_taskq_thread_dynamic) &&
+		    tq->tq_maxthreads > tq->tq_nthreads;
+		if (spawn)
+			ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads + 1);
+	}
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	/* Creating a thread may sleep, so do it after dropping tq_lock. */
+	if (spawn && taskq_thread_create(tq) == NULL)
+		return (-ENOMEM);
+
+	return (0);
+}
+
+/*
+ * Teardown counterpart of spl_taskq_expand(). CPU offlining is not supported,
+ * but a CPU can fail to come online, so the thread limit is recomputed here
+ * and a surplus worker of a non-dynamic taskq is stopped.
+ */
+static int
+spl_taskq_prepare_down(unsigned int cpu, struct hlist_node *node)
+{
+	taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node);
+	struct task_struct *victim = NULL;
+	unsigned long flags;
+
+	ASSERT(tq);
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+
+	if (tq->tq_flags & TASKQ_ACTIVE) {
+		ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
+		/* Re-derive the thread cap from the current online CPUs. */
+		int pct = MIN(tq->tq_cpu_pct, 100);
+		tq->tq_maxthreads = MAX(((num_online_cpus()) * pct) / 100, 1);
+
+		/* A non-dynamic taskq over its cap gives up its first thread. */
+		if (!((tq->tq_flags & TASKQ_DYNAMIC) &&
+		    spl_taskq_thread_dynamic) &&
+		    tq->tq_maxthreads < tq->tq_nthreads) {
+			ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads - 1);
+			victim = list_entry(tq->tq_thread_list.next,
+			    taskq_thread_t, tqt_thread_list)->tqt_thread;
+		}
+	}
+
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	/* kthread_stop() is called only after tq_lock has been dropped. */
+	if (victim != NULL)
+		kthread_stop(victim);
+
+	return (0);
+}
+#endif
+
 int
 spl_taskq_init(void)
 {
 	init_rwsem(&tq_list_sem);
 	tsd_create(&taskq_tsd, NULL);
 
+#ifdef HAVE_CPU_HOTPLUG
+	spl_taskq_cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+	    "fs/spl_taskq:online", spl_taskq_expand, spl_taskq_prepare_down);
+#endif
+
 	system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
 	    maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
 	if (system_taskq == NULL)
@@ -1269,6 +1380,9 @@ spl_taskq_init(void)
 	system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4),
 	    maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
 	if (system_delay_taskq == NULL) {
+#ifdef HAVE_CPU_HOTPLUG
+		cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
+#endif
 		taskq_destroy(system_taskq);
 		return (1);
 	}
@@ -1276,6 +1390,9 @@ spl_taskq_init(void)
 	dynamic_taskq = taskq_create("spl_dynamic_taskq", 1,
 	    maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
 	if (dynamic_taskq == NULL) {
+#ifdef HAVE_CPU_HOTPLUG
+		cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
+#endif
 		taskq_destroy(system_taskq);
 		taskq_destroy(system_delay_taskq);
 		return (1);
@@ -1304,4 +1421,9 @@ spl_taskq_fini(void)
 	system_taskq = NULL;
 
 	tsd_destroy(&taskq_tsd);
+
+#ifdef HAVE_CPU_HOTPLUG
+	cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
+	spl_taskq_cpuhp_state = 0;
+#endif
 }
diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c
index 792c75d46..83d4a3d84 100644
--- a/module/os/linux/zfs/arc_os.c
+++ b/module/os/linux/zfs/arc_os.c
@@ -48,6 +48,8 @@
 #include <sys/vmsystm.h>
 #include <sys/zpl.h>
 #include <linux/page_compat.h>
+#include <linux/notifier.h>
+#include <linux/memory.h>
 #endif
 #include <sys/callb.h>
 #include <sys/kstat.h>
@@ -73,6 +75,9 @@
  */
 int zfs_arc_shrinker_limit = 10000;
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+static struct notifier_block arc_hotplug_callback_mem_nb;
+#endif
 
 /*
  * Return a default max arc size based on the amount of physical memory.
@@ -278,18 +283,9 @@ arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
 	return (0);
 }
 
-void
-arc_lowmem_init(void)
+static void
+arc_set_sys_free(uint64_t allmem)
 {
-	uint64_t allmem = arc_all_memory();
-
-	/*
-	 * Register a shrinker to support synchronous (direct) memory
-	 * reclaim from the arc.  This is done to prevent kswapd from
-	 * swapping out pages when it is preferable to shrink the arc.
-	 */
-	spl_register_shrinker(&arc_shrinker);
-
 	/*
 	 * The ARC tries to keep at least this much memory available for the
 	 * system.  This gives the ARC time to shrink in response to memory
@@ -343,6 +339,20 @@ arc_lowmem_init(void)
 }
 
 void
+arc_lowmem_init(void)
+{
+	uint64_t allmem = arc_all_memory();
+
+	/*
+	 * Register a shrinker to support synchronous (direct) memory
+	 * reclaim from the arc.  This is done to prevent kswapd from
+	 * swapping out pages when it is preferable to shrink the arc.
+	 */
+	spl_register_shrinker(&arc_shrinker);
+	arc_set_sys_free(allmem);	/* also re-run on memory hotplug */
+}
+
+void
 arc_lowmem_fini(void)
 {
 	spl_unregister_shrinker(&arc_shrinker);
@@ -375,6 +385,52 @@ param_set_arc_int(const char *buf, zfs_kernel_param_t *kp)
 
 	return (0);
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/* ARGSUSED */
+static int
+arc_hotplug_callback(struct notifier_block *self, unsigned long action,
+    void *arg)
+{
+#ifdef __LP64__
+	const uint64_t dirty_cap = 4ULL * 1024 * 1024 * 1024;
+#else
+	const uint64_t dirty_cap = 1ULL * 1024 * 1024 * 1024;
+#endif
+	uint64_t total = arc_all_memory();
+
+	/* Only a MEM_ONLINE event changes the limits; all else is ignored. */
+	if (action == MEM_ONLINE) {
+		arc_set_limits(total);
+		/* Fill in the dirty-data ceiling only if it is still unset. */
+		if (zfs_dirty_data_max_max == 0)
+			zfs_dirty_data_max_max = MIN(dirty_cap,
+			    total * zfs_dirty_data_max_max_percent / 100);
+		arc_set_sys_free(total);
+	}
+
+	return (NOTIFY_OK);
+}
+#endif
+
+void
+arc_register_hotplug(void)
+{
+#ifdef CONFIG_MEMORY_HOTPLUG
+	struct notifier_block *nb = &arc_hotplug_callback_mem_nb;
+	nb->priority = 100;	/* arbitrary; the value has no significance */
+	nb->notifier_call = arc_hotplug_callback;
+	register_memory_notifier(nb);
+#endif
+}
+
+void
+arc_unregister_hotplug(void)
+{	/* Undoes arc_register_hotplug(); empty without memory hotplug. */
+#ifdef CONFIG_MEMORY_HOTPLUG
+	unregister_memory_notifier(&arc_hotplug_callback_mem_nb);
+#endif
+}
 #else /* _KERNEL */
 int64_t
 arc_available_memory(void)
@@ -405,6 +461,16 @@ arc_free_memory(void)
 {
 	return (spa_get_random(arc_all_memory() * 20 / 100));
 }
+
+void
+arc_register_hotplug(void)
+{	/* No-op in the non-_KERNEL (userland) build. */
+}
+
+void
+arc_unregister_hotplug(void)
+{	/* No-op in the non-_KERNEL (userland) build. */
+}
 #endif /* _KERNEL */
 
 /*
diff --git a/module/zfs/aggsum.c b/module/zfs/aggsum.c
index e38f4a66c..e46da95f6 100644
--- a/module/zfs/aggsum.c
+++ b/module/zfs/aggsum.c
@@ -70,6 +70,11 @@
  * zeroing out the borrowed value (forcing that thread to borrow on its next
  * request, which will also be expensive).  This is what makes aggsums well
  * suited for write-many read-rarely operations.
+ *
+ * Note that the aggsums do not expand if more CPUs are hot-added. In that
+ * case, we will have less fanout than boot_ncpus, but we don't want to always
+ * reserve the RAM necessary to create the extra slots for additional CPUs up
+ * front, and dynamically adding them is a complex task.
  */
 
 /*
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index efc6bb138..1bc27391c 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -7593,6 +7593,15 @@ arc_target_bytes(void)
 }
 
 void
+arc_set_limits(uint64_t allmem)
+{
+	/* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */
+	arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT);
+	/* The default max is platform-specific; derive it from the new min. */
+	arc_c_max = arc_default_max(arc_c_min, allmem);
+}
+
+void
 arc_init(void)
 {
 	uint64_t percent, allmem = arc_all_memory();
@@ -7607,11 +7616,7 @@ arc_init(void)
 	arc_lowmem_init();
 #endif
 
-	/* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */
-	arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT);
-
-	/* How to set default max varies by platform. */
-	arc_c_max = arc_default_max(arc_c_min, allmem);
+	arc_set_limits(allmem);
 
 #ifndef _KERNEL
 	/*
@@ -7648,6 +7653,8 @@ arc_init(void)
 	if (arc_c < arc_c_min)
 		arc_c = arc_c_min;
 
+	arc_register_hotplug();
+
 	arc_state_init();
 
 	buf_init();
@@ -7656,8 +7663,9 @@ arc_init(void)
 	    offsetof(arc_prune_t, p_node));
 	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
 
-	arc_prune_taskq = taskq_create("arc_prune", boot_ncpus, defclsyspri,
-	    boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+	arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri,
+	    boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+	    TASKQ_THREADS_CPU_PCT);
 
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
@@ -7754,6 +7762,8 @@ arc_fini(void)
 	buf_fini();
 	arc_state_fini();
 
+	arc_unregister_hotplug();
+
 	/*
 	 * We destroy the zthrs after all the ARC state has been
 	 * torn down to avoid the case of them receiving any
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 3a2028625..8f6675820 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -220,11 +220,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
-	dp->dp_zrele_taskq = taskq_create("z_zrele", boot_ncpus, defclsyspri,
-	    boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+	dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri,
+	    boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+	    TASKQ_THREADS_CPU_PCT);
 	dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
-	    boot_ncpus, defclsyspri, boot_ncpus, INT_MAX,
-	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+	    100, defclsyspri, boot_ncpus, INT_MAX,
+	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 
 	return (dp);
 }
diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c
index a3adfd317..36c0d33bf 100644
--- a/module/zfs/multilist.c
+++ b/module/zfs/multilist.c
@@ -96,9 +96,12 @@ multilist_create_impl(size_t size, size_t offset,
 }
 
 /*
- * Allocate a new multilist, using the default number of sublists
- * (the number of CPUs, or at least 4, or the tunable
- * zfs_multilist_num_sublists).
+ * Allocate a new multilist, using the default number of sublists (the number
+ * of CPUs, or at least 4, or the tunable zfs_multilist_num_sublists). Note
+ * that the multilists do not expand if more CPUs are hot-added. In that case,
+ * we will have less fanout than boot_ncpus, but we don't want to always
+ * reserve the RAM necessary to create the extra slots for additional CPUs up
+ * front, and dynamically adding them is a complex task.
  */
 multilist_t *
 multilist_create(size_t size, size_t offset,
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 65c907d9d..50822cfae 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1281,15 +1281,15 @@ spa_activate(spa_t *spa, spa_mode_t mode)
 	 * pool traverse code from monopolizing the global (and limited)
 	 * system_taskq by inappropriately scheduling long running tasks on it.
 	 */
-	spa->spa_prefetch_taskq = taskq_create("z_prefetch", boot_ncpus,
-	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC);
+	spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
+	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 
 	/*
 	 * The taskq to upgrade datasets in this pool. Currently used by
 	 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
 	 */
-	spa->spa_upgrade_taskq = taskq_create("z_upgrade", boot_ncpus,
-	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC);
+	spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
+	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 }
 
 /*
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index 420244abb..3efd26155 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -446,8 +446,9 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
 			 * Commit callback taskq hasn't been created yet.
 			 */
 			tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
-			    boot_ncpus, defclsyspri, boot_ncpus, boot_ncpus * 2,
-			    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+			    100, defclsyspri, boot_ncpus, boot_ncpus * 2,
+			    TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+			    TASKQ_THREADS_CPU_PCT);
 		}
 
 		cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);