diff options
-rw-r--r-- | config/kernel-hotplug.m4 | 26 | ||||
-rw-r--r-- | config/kernel.m4 | 2 | ||||
-rw-r--r-- | include/os/linux/spl/sys/taskq.h | 5 | ||||
-rw-r--r-- | include/sys/arc.h | 1 | ||||
-rw-r--r-- | include/sys/arc_impl.h | 2 | ||||
-rw-r--r-- | module/os/freebsd/zfs/arc_os.c | 10 | ||||
-rw-r--r-- | module/os/linux/spl/spl-taskq.c | 132 | ||||
-rw-r--r-- | module/os/linux/zfs/arc_os.c | 88 | ||||
-rw-r--r-- | module/zfs/aggsum.c | 5 | ||||
-rw-r--r-- | module/zfs/arc.c | 24 | ||||
-rw-r--r-- | module/zfs/dsl_pool.c | 9 | ||||
-rw-r--r-- | module/zfs/multilist.c | 9 | ||||
-rw-r--r-- | module/zfs/spa.c | 8 | ||||
-rw-r--r-- | module/zfs/txg.c | 5 |
14 files changed, 290 insertions, 36 deletions
diff --git a/config/kernel-hotplug.m4 b/config/kernel-hotplug.m4 new file mode 100644 index 000000000..e796a6d2e --- /dev/null +++ b/config/kernel-hotplug.m4 @@ -0,0 +1,26 @@ +dnl # +dnl # 4.6 API change +dnl # Added CPU hotplug APIs +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_CPU_HOTPLUG], [ + ZFS_LINUX_TEST_SRC([cpu_hotplug], [ + #include <linux/cpuhotplug.h> + ],[ + enum cpuhp_state state = CPUHP_ONLINE; + int (*fp)(unsigned int, struct hlist_node *) = NULL; + cpuhp_state_add_instance_nocalls(0, (struct hlist_node *)NULL); + cpuhp_state_remove_instance_nocalls(0, (struct hlist_node *)NULL); + cpuhp_setup_state_multi(state, "", fp, fp); + cpuhp_remove_multi_state(0); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CPU_HOTPLUG], [ + AC_MSG_CHECKING([whether CPU hotplug APIs exist]) + ZFS_LINUX_TEST_RESULT([cpu_hotplug], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_CPU_HOTPLUG, 1, [yes]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 3e01daa5e..14a8d4c58 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -124,6 +124,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_TOTALHIGH_PAGES ZFS_AC_KERNEL_SRC_KSTRTOUL ZFS_AC_KERNEL_SRC_PERCPU + ZFS_AC_KERNEL_SRC_CPU_HOTPLUG AC_MSG_CHECKING([for available kernel interfaces]) ZFS_LINUX_TEST_COMPILE_ALL([kabi]) @@ -221,6 +222,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_TOTALHIGH_PAGES ZFS_AC_KERNEL_KSTRTOUL ZFS_AC_KERNEL_PERCPU + ZFS_AC_KERNEL_CPU_HOTPLUG ]) dnl # diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index 16f4349e7..b50175a10 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -84,6 +84,8 @@ typedef struct taskq { int tq_nthreads; /* # of existing threads */ int tq_nspawn; /* # of threads being spawned */ int tq_maxthreads; /* # of threads maximum */ + /* If PERCPU flag is set, percent of NCPUs to have as threads */ + int tq_cpu_pct; int tq_pri; /* priority */ int tq_minalloc; /* min taskq_ent_t pool size */ int tq_maxalloc; /* max taskq_ent_t pool size */ @@ -99,6 +101,9 @@ typedef struct taskq { spl_wait_queue_head_t tq_work_waitq; /* new work waitq */ spl_wait_queue_head_t tq_wait_waitq; /* wait waitq */ tq_lock_role_t tq_lock_class; /* class when taking tq_lock */ + /* list node for the cpu hotplug callback */ + struct hlist_node tq_hp_cb_node; + boolean_t tq_hp_support; } taskq_t; typedef struct taskq_ent { diff --git a/include/sys/arc.h b/include/sys/arc.h index 9ade1a432..f58fa53b6 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -310,6 +310,7 @@ int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); uint64_t arc_all_memory(void); uint64_t arc_default_max(uint64_t min, uint64_t allmem); uint64_t arc_target_bytes(void); +void arc_set_limits(uint64_t); void arc_init(void); void arc_fini(void); diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index b08a42c75..6adcd4b54 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -926,6 +926,8 @@ extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg); extern uint64_t arc_free_memory(void); extern int64_t arc_available_memory(void); extern void arc_tuning_update(boolean_t); +extern void arc_register_hotplug(void); +extern void arc_unregister_hotplug(void); extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS); diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index 94df75003..4fc7468bf 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -243,3 +243,13 @@ arc_lowmem_fini(void) if (arc_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); } + +void +arc_register_hotplug(void) +{ +} + +void +arc_unregister_hotplug(void) +{ +} diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index fafadffc7..e8d89bfea 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -28,6 +28,9 @@ #include <sys/kmem.h> #include <sys/tsd.h> #include <sys/trace_spl.h> +#ifdef HAVE_CPU_HOTPLUG +#include <linux/cpuhotplug.h> +#endif int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); @@ -35,7 +38,7 @@ MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); int spl_taskq_thread_dynamic = 1; -module_param(spl_taskq_thread_dynamic, int, 0644); +module_param(spl_taskq_thread_dynamic, int, 0444); MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads"); int spl_taskq_thread_priority = 1; @@ -59,6 +62,11 @@ EXPORT_SYMBOL(system_delay_taskq); static taskq_t *dynamic_taskq; static taskq_thread_t *taskq_thread_create(taskq_t *); +#ifdef HAVE_CPU_HOTPLUG +/* Multi-callback id for cpu hotplugging. */ +static int spl_taskq_cpuhp_state; +#endif + /* List of all taskqs */ LIST_HEAD(tq_list); struct rw_semaphore tq_list_sem; @@ -1024,13 +1032,14 @@ taskq_thread_create(taskq_t *tq) } taskq_t * -taskq_create(const char *name, int nthreads, pri_t pri, +taskq_create(const char *name, int threads_arg, pri_t pri, int minalloc, int maxalloc, uint_t flags) { taskq_t *tq; taskq_thread_t *tqt; int count = 0, rc = 0, i; unsigned long irqflags; + int nthreads = threads_arg; ASSERT(name != NULL); ASSERT(minalloc >= 0); @@ -1041,15 +1050,27 @@ taskq_create(const char *name, int nthreads, pri_t pri, if (flags & TASKQ_THREADS_CPU_PCT) { ASSERT(nthreads <= 100); ASSERT(nthreads >= 0); - nthreads = MIN(nthreads, 100); + nthreads = MIN(threads_arg, 100); nthreads = MAX(nthreads, 0); - nthreads = MAX((num_online_cpus() * nthreads) / 100, 1); + nthreads = MAX((num_online_cpus() * nthreads) /100, 1); } tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE); if (tq == NULL) return (NULL); + tq->tq_hp_support = B_FALSE; +#ifdef HAVE_CPU_HOTPLUG + if (flags & TASKQ_THREADS_CPU_PCT) { + tq->tq_hp_support = B_TRUE; + if (cpuhp_state_add_instance_nocalls(spl_taskq_cpuhp_state, + &tq->tq_hp_cb_node) != 0) { + kmem_free(tq, sizeof (*tq)); + return (NULL); + } + } +#endif + spin_lock_init(&tq->tq_lock); INIT_LIST_HEAD(&tq->tq_thread_list); INIT_LIST_HEAD(&tq->tq_active_list); @@ -1058,6 +1079,7 @@ taskq_create(const char *name, int nthreads, pri_t pri, tq->tq_nthreads = 0; tq->tq_nspawn = 0; tq->tq_maxthreads = nthreads; + tq->tq_cpu_pct = threads_arg; tq->tq_pri = pri; tq->tq_minalloc = minalloc; tq->tq_maxalloc = maxalloc; @@ -1131,6 +1153,12 @@ taskq_destroy(taskq_t *tq) tq->tq_flags &= ~TASKQ_ACTIVE; spin_unlock_irqrestore(&tq->tq_lock, flags); +#ifdef HAVE_CPU_HOTPLUG + if (tq->tq_hp_support) { + VERIFY0(cpuhp_state_remove_instance_nocalls( + spl_taskq_cpuhp_state, &tq->tq_hp_cb_node)); + } +#endif /* * When TASKQ_ACTIVE is clear new tasks may not be added nor may * new worker threads be spawned for dynamic taskq. @@ -1198,7 +1226,6 @@ taskq_destroy(taskq_t *tq) } EXPORT_SYMBOL(taskq_destroy); - static unsigned int spl_taskq_kick = 0; /* @@ -1255,12 +1282,96 @@ module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint, MODULE_PARM_DESC(spl_taskq_kick, "Write nonzero to kick stuck taskqs to spawn more threads"); +#ifdef HAVE_CPU_HOTPLUG +/* + * This callback will be called exactly once for each core that comes online, + * for each dynamic taskq. We attempt to expand taskqs that have + * TASKQ_THREADS_CPU_PCT set. We need to redo the percentage calculation every + * time, to correctly determine whether or not to add a thread. + */ +static int +spl_taskq_expand(unsigned int cpu, struct hlist_node *node) +{ + taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node); + unsigned long flags; + int err = 0; + + ASSERT(tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); + int nthreads = MIN(tq->tq_cpu_pct, 100); + nthreads = MAX(((num_online_cpus() + 1) * nthreads) / 100, 1); + tq->tq_maxthreads = nthreads; + + if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) && + tq->tq_maxthreads > tq->tq_nthreads) { + ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads + 1); + taskq_thread_t *tqt = taskq_thread_create(tq); + if (tqt == NULL) + err = -1; + } + +out: + spin_unlock_irqrestore(&tq->tq_lock, flags); + return (err); +} + +/* + * While we don't support offlining CPUs, it is possible that CPUs will fail + * to online successfully. We do need to be able to handle this case + * gracefully. + */ +static int +spl_taskq_prepare_down(unsigned int cpu, struct hlist_node *node) +{ + taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node); + unsigned long flags; + + ASSERT(tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); + int nthreads = MIN(tq->tq_cpu_pct, 100); + nthreads = MAX(((num_online_cpus()) * nthreads) / 100, 1); + tq->tq_maxthreads = nthreads; + + if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) && + tq->tq_maxthreads < tq->tq_nthreads) { + ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads - 1); + taskq_thread_t *tqt = list_entry(tq->tq_thread_list.next, + taskq_thread_t, tqt_thread_list); + struct task_struct *thread = tqt->tqt_thread; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + kthread_stop(thread); + + return (0); + } + +out: + spin_unlock_irqrestore(&tq->tq_lock, flags); + return (0); +} +#endif + int spl_taskq_init(void) { init_rwsem(&tq_list_sem); tsd_create(&taskq_tsd, NULL); +#ifdef HAVE_CPU_HOTPLUG + spl_taskq_cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "fs/spl_taskq:online", spl_taskq_expand, spl_taskq_prepare_down); +#endif + system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); if (system_taskq == NULL) @@ -1269,6 +1380,9 @@ spl_taskq_init(void) system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); if (system_delay_taskq == NULL) { +#ifdef HAVE_CPU_HOTPLUG + cpuhp_remove_multi_state(spl_taskq_cpuhp_state); +#endif taskq_destroy(system_taskq); return (1); } @@ -1276,6 +1390,9 @@ spl_taskq_init(void) dynamic_taskq = taskq_create("spl_dynamic_taskq", 1, maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE); if (dynamic_taskq == NULL) { +#ifdef HAVE_CPU_HOTPLUG + cpuhp_remove_multi_state(spl_taskq_cpuhp_state); +#endif taskq_destroy(system_taskq); taskq_destroy(system_delay_taskq); return (1); @@ -1304,4 +1421,9 @@ spl_taskq_fini(void) system_taskq = NULL; tsd_destroy(&taskq_tsd); + +#ifdef HAVE_CPU_HOTPLUG + cpuhp_remove_multi_state(spl_taskq_cpuhp_state); + spl_taskq_cpuhp_state = 0; +#endif } diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index 792c75d46..83d4a3d84 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -48,6 +48,8 @@ #include <sys/vmsystm.h> #include <sys/zpl.h> #include <linux/page_compat.h> +#include <linux/notifier.h> +#include <linux/memory.h> #endif #include <sys/callb.h> #include <sys/kstat.h> @@ -73,6 +75,9 @@ */ int zfs_arc_shrinker_limit = 10000; +#ifdef CONFIG_MEMORY_HOTPLUG +static struct notifier_block arc_hotplug_callback_mem_nb; +#endif /* * Return a default max arc size based on the amount of physical memory. @@ -278,18 +283,9 @@ arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) return (0); } -void -arc_lowmem_init(void) +static void +arc_set_sys_free(uint64_t allmem) { - uint64_t allmem = arc_all_memory(); - - /* - * Register a shrinker to support synchronous (direct) memory - * reclaim from the arc. This is done to prevent kswapd from - * swapping out pages when it is preferable to shrink the arc. - */ - spl_register_shrinker(&arc_shrinker); - /* * The ARC tries to keep at least this much memory available for the * system. This gives the ARC time to shrink in response to memory @@ -343,6 +339,20 @@ arc_lowmem_init(void) } void +arc_lowmem_init(void) +{ + uint64_t allmem = arc_all_memory(); + + /* + * Register a shrinker to support synchronous (direct) memory + * reclaim from the arc. This is done to prevent kswapd from + * swapping out pages when it is preferable to shrink the arc. + */ + spl_register_shrinker(&arc_shrinker); + arc_set_sys_free(allmem); +} + +void arc_lowmem_fini(void) { spl_unregister_shrinker(&arc_shrinker); @@ -375,6 +385,52 @@ param_set_arc_int(const char *buf, zfs_kernel_param_t *kp) return (0); } + +#ifdef CONFIG_MEMORY_HOTPLUG +/* ARGSUSED */ +static int +arc_hotplug_callback(struct notifier_block *self, unsigned long action, + void *arg) +{ + uint64_t allmem = arc_all_memory(); + if (action != MEM_ONLINE) + return (NOTIFY_OK); + + arc_set_limits(allmem); + +#ifdef __LP64__ + if (zfs_dirty_data_max_max == 0) + zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, + allmem * zfs_dirty_data_max_max_percent / 100); +#else + if (zfs_dirty_data_max_max == 0) + zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, + allmem * zfs_dirty_data_max_max_percent / 100); +#endif + + arc_set_sys_free(allmem); + return (NOTIFY_OK); +} +#endif + +void +arc_register_hotplug(void) +{ +#ifdef CONFIG_MEMORY_HOTPLUG + arc_hotplug_callback_mem_nb.notifier_call = arc_hotplug_callback; + /* There is no significance to the value 100 */ + arc_hotplug_callback_mem_nb.priority = 100; + register_memory_notifier(&arc_hotplug_callback_mem_nb); +#endif +} + +void +arc_unregister_hotplug(void) +{ +#ifdef CONFIG_MEMORY_HOTPLUG + unregister_memory_notifier(&arc_hotplug_callback_mem_nb); +#endif +} #else /* _KERNEL */ int64_t arc_available_memory(void) @@ -405,6 +461,16 @@ arc_free_memory(void) { return (spa_get_random(arc_all_memory() * 20 / 100)); } + +void +arc_register_hotplug(void) +{ +} + +void +arc_unregister_hotplug(void) +{ +} #endif /* _KERNEL */ /* diff --git a/module/zfs/aggsum.c b/module/zfs/aggsum.c index e38f4a66c..e46da95f6 100644 --- a/module/zfs/aggsum.c +++ b/module/zfs/aggsum.c @@ -70,6 +70,11 @@ * zeroing out the borrowed value (forcing that thread to borrow on its next * request, which will also be expensive). This is what makes aggsums well * suited for write-many read-rarely operations. + * + * Note that the aggsums do not expand if more CPUs are hot-added. In that + * case, we will have less fanout than boot_ncpus, but we don't want to always + * reserve the RAM necessary to create the extra slots for additional CPUs up + * front, and dynamically adding them is a complex task. */ /* diff --git a/module/zfs/arc.c b/module/zfs/arc.c index efc6bb138..1bc27391c 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -7593,6 +7593,15 @@ arc_target_bytes(void) } void +arc_set_limits(uint64_t allmem) +{ + /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */ + arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); + + /* How to set default max varies by platform. */ + arc_c_max = arc_default_max(arc_c_min, allmem); +} +void arc_init(void) { uint64_t percent, allmem = arc_all_memory(); @@ -7607,11 +7616,7 @@ arc_init(void) arc_lowmem_init(); #endif - /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */ - arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); - - /* How to set default max varies by platform. */ - arc_c_max = arc_default_max(arc_c_min, allmem); + arc_set_limits(allmem); #ifndef _KERNEL /* @@ -7648,6 +7653,8 @@ arc_init(void) if (arc_c < arc_c_min) arc_c = arc_c_min; + arc_register_hotplug(); + arc_state_init(); buf_init(); @@ -7656,8 +7663,9 @@ arc_init(void) offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); - arc_prune_taskq = taskq_create("arc_prune", boot_ncpus, defclsyspri, - boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri, + boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | + TASKQ_THREADS_CPU_PCT); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -7754,6 +7762,8 @@ arc_fini(void) buf_fini(); arc_state_fini(); + arc_unregister_hotplug(); + /* * We destroy the zthrs after all the ARC state has been * torn down to avoid the case of them receiving any diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 3a2028625..8f6675820 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -220,11 +220,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); - dp->dp_zrele_taskq = taskq_create("z_zrele", boot_ncpus, defclsyspri, - boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri, + boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | + TASKQ_THREADS_CPU_PCT); dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain", - boot_ncpus, defclsyspri, boot_ncpus, INT_MAX, - TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + 100, defclsyspri, boot_ncpus, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); return (dp); } diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c index a3adfd317..36c0d33bf 100644 --- a/module/zfs/multilist.c +++ b/module/zfs/multilist.c @@ -96,9 +96,12 @@ multilist_create_impl(size_t size, size_t offset, } /* - * Allocate a new multilist, using the default number of sublists - * (the number of CPUs, or at least 4, or the tunable - * zfs_multilist_num_sublists). + * Allocate a new multilist, using the default number of sublists (the number + * of CPUs, or at least 4, or the tunable zfs_multilist_num_sublists). Note + * that the multilists do not expand if more CPUs are hot-added. In that case, + * we will have less fanout than boot_ncpus, but we don't want to always + * reserve the RAM necessary to create the extra slots for additional CPUs up + * front, and dynamically adding them is a complex task. */ multilist_t * multilist_create(size_t size, size_t offset, diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 65c907d9d..50822cfae 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1281,15 +1281,15 @@ spa_activate(spa_t *spa, spa_mode_t mode) * pool traverse code from monopolizing the global (and limited) * system_taskq by inappropriately scheduling long running tasks on it. */ - spa->spa_prefetch_taskq = taskq_create("z_prefetch", boot_ncpus, - defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); + spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, + defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); /* * The taskq to upgrade datasets in this pool. Currently used by * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. */ - spa->spa_upgrade_taskq = taskq_create("z_upgrade", boot_ncpus, - defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); + spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, + defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); } /* diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 420244abb..3efd26155 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -446,8 +446,9 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) * Commit callback taskq hasn't been created yet. */ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", - boot_ncpus, defclsyspri, boot_ncpus, boot_ncpus * 2, - TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + 100, defclsyspri, boot_ncpus, boot_ncpus * 2, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC | + TASKQ_THREADS_CPU_PCT); } cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); |