author     Matthew Ahrens <[email protected]>      2017-03-20 18:36:00 -0700
committer  Brian Behlendorf <[email protected]>    2017-03-20 18:36:00 -0700
commit     64fc776208ad14b0078b89317b0f3b24338e10c1 (patch)
tree       b8c229ca8b052f3aa718a27b97c759a564c8fd78 /module/zfs/dsl_pool.c
parent     a3478c074752610814f894375c3d947ece4938fe (diff)
OpenZFS 7968 - multi-threaded spa_sync()
Reviewed by: Pavel Zakharov <[email protected]>
Reviewed by: Brad Lewis <[email protected]>
Reviewed by: Saso Kiselkov <[email protected]>
Reviewed by: Brian Behlendorf <[email protected]>
Ported-by: Matthew Ahrens <[email protected]>
spa_sync() iterates over all the dirty dnodes and processes each of them
by calling dnode_sync(). If there are many dirty dnodes (e.g. because we
created or removed a lot of files), the single thread of spa_sync()
calling dnode_sync() can become a bottleneck. Additionally, if many
dnodes are dirtied concurrently in open context (e.g. due to concurrent
file creation), the os_lock will experience lock contention via
dnode_setdirty().
The solution is to track dirty dnodes on a multilist_t, and for
spa_sync() to use separate threads to process each of the sublists in
the multilist.
OpenZFS-issue: https://www.illumos.org/issues/7968
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/4a2a54c
Closes #5752
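For readers unfamiliar with the pattern, here is a minimal, self-contained sketch of the idea described above. It is plain C with pthreads, not the actual ZFS code; multilist_t, dp_sync_taskq, and dnode_sync() appear only in the comments as the things each piece stands in for. Dirty objects go onto one of several independently locked sublists, which removes the contention on a single lock in open context, and at "sync" time one worker per sublist drains its sublist in parallel before the caller waits for all of them, just as spa_sync() now waits on dp_sync_taskq.

/*
 * Illustrative sketch only (not ZFS code): the pattern this commit describes.
 * Dirty objects are appended to one of several independently locked sublists
 * (standing in for a multilist_t, instead of one global os_lock), and the
 * sync phase runs one worker per sublist (standing in for dp_sync_taskq).
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define	NUM_SUBLISTS	4

typedef struct dirty_node {
	struct dirty_node *next;
	int id;
} dirty_node_t;

typedef struct sublist {
	pthread_mutex_t lock;	/* per-sublist lock, not one global lock */
	dirty_node_t *head;
} sublist_t;

static sublist_t sublists[NUM_SUBLISTS];

/* Open-context path: pick a sublist (here by object id) and insert. */
static void
mark_dirty(int id)
{
	sublist_t *sl = &sublists[id % NUM_SUBLISTS];
	dirty_node_t *dn = malloc(sizeof (*dn));

	dn->id = id;
	pthread_mutex_lock(&sl->lock);
	dn->next = sl->head;
	sl->head = dn;
	pthread_mutex_unlock(&sl->lock);
}

/* Sync-context path: each worker processes exactly one sublist. */
static void *
sync_sublist(void *arg)
{
	sublist_t *sl = arg;
	dirty_node_t *dn = sl->head;

	while (dn != NULL) {
		dirty_node_t *next = dn->next;
		printf("syncing dirty object %d\n", dn->id); /* dnode_sync() */
		free(dn);
		dn = next;
	}
	sl->head = NULL;
	return (NULL);
}

int
main(void)
{
	pthread_t workers[NUM_SUBLISTS];
	int i;

	for (i = 0; i < NUM_SUBLISTS; i++)
		pthread_mutex_init(&sublists[i].lock, NULL);

	for (i = 0; i < 32; i++)	/* simulate concurrent dirtying */
		mark_dirty(i);

	/* "spa_sync()": one task per sublist, then wait for all of them. */
	for (i = 0; i < NUM_SUBLISTS; i++)
		pthread_create(&workers[i], NULL, sync_sublist, &sublists[i]);
	for (i = 0; i < NUM_SUBLISTS; i++)
		pthread_join(workers[i], NULL);

	return (0);
}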
Diffstat (limited to 'module/zfs/dsl_pool.c')
-rw-r--r--   module/zfs/dsl_pool.c   28
1 file changed, 23 insertions, 5 deletions
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index d1aeb8017..2386484c8 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
@@ -132,6 +132,11 @@ unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
 hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
 hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
 
+/*
+ * This determines the number of threads used by the dp_sync_taskq.
+ */
+int zfs_sync_taskq_batch_pct = 75;
+
 int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
@@ -168,6 +173,10 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	txg_list_create(&dp->dp_sync_tasks,
 	    offsetof(dsl_sync_task_t, dst_node));
 
+	dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
+	    zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
+	    TASKQ_THREADS_CPU_PCT);
+
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
@@ -326,6 +335,8 @@ dsl_pool_close(dsl_pool_t *dp)
 	txg_list_destroy(&dp->dp_sync_tasks);
 	txg_list_destroy(&dp->dp_dirty_dirs);
 
+	taskq_destroy(dp->dp_sync_taskq);
+
 	/*
 	 * We can't set retry to TRUE since we're explicitly specifying
 	 * a spa to flush. This is good enough; any missed buffers for
@@ -514,12 +525,15 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 
 	/*
 	 * After the data blocks have been written (ensured by the zio_wait()
-	 * above), update the user/group space accounting.
+	 * above), update the user/group space accounting.  This happens
+	 * in tasks dispatched to dp_sync_taskq, so wait for them before
+	 * continuing.
 	 */
 	for (ds = list_head(&synced_datasets); ds != NULL;
 	    ds = list_next(&synced_datasets, ds)) {
 		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 	}
+	taskq_wait(dp->dp_sync_taskq);
 
 	/*
 	 * Sync the datasets again to push out the changes due to
@@ -567,8 +581,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 		dp->dp_mos_uncompressed_delta = 0;
 	}
 
-	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
-	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
+	if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) {
 		dsl_pool_sync_mos(dp, tx);
 	}
 
@@ -619,7 +632,8 @@ int
 dsl_pool_sync_context(dsl_pool_t *dp)
 {
 	return (curthread == dp->dp_tx.tx_sync_thread ||
-	    spa_is_initializing(dp->dp_spa));
+	    spa_is_initializing(dp->dp_spa) ||
+	    taskq_member(dp->dp_sync_taskq, curthread));
 }
 
 uint64_t
@@ -1116,5 +1130,9 @@ MODULE_PARM_DESC(zfs_dirty_data_sync, "sync txg when this much dirty data");
 
 module_param(zfs_delay_scale, ulong, 0644);
 MODULE_PARM_DESC(zfs_delay_scale, "how quickly delay approaches infinity");
+
+module_param(zfs_sync_taskq_batch_pct, int, 0644);
+MODULE_PARM_DESC(zfs_sync_taskq_batch_pct,
+	"max percent of CPUs that are used to sync dirty data");
 /* END CSTYLED */
 #endif
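A note on the new tunable: taskq_create() is passed TASKQ_THREADS_CPU_PCT, so zfs_sync_taskq_batch_pct is interpreted as a percentage of the available CPUs rather than an absolute thread count, and the module_param() hunk above exposes it for tuning. The small user-space sketch below only illustrates that percentage-to-thread-count mapping; the exact rounding and the minimum of one thread are assumptions for illustration, not the taskq implementation itself.

/*
 * Hedged illustration (not the taskq implementation): how a CPU-percentage
 * tunable such as zfs_sync_taskq_batch_pct = 75 might map to a thread count.
 * The floor division and the minimum of one thread are assumptions.
 */
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	int pct = 75;			/* zfs_sync_taskq_batch_pct default */
	long nthreads = (ncpus * pct) / 100;

	if (nthreads < 1)
		nthreads = 1;
	printf("%ld CPUs, %d%% -> %ld sync worker threads\n",
	    ncpus, pct, nthreads);
	return (0);
}

On an 8-CPU machine, for example, the default of 75 works out to 6 worker threads under this arithmetic.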