diff options
author | Jinshan Xiong <[email protected]> | 2016-09-21 13:49:47 -0700 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2016-10-07 09:45:13 -0700 |
commit | 9b7a83cbb6cae54c127fde4c83e73505ad9c9e73 (patch) | |
tree | 1b6d67d481b0bd4a9e2b36af04ad95d96485516b /module | |
parent | 1de321e6260f5b83eb943b6ce2166a3879f42df4 (diff) |
OpenZFS 6988 spa_sync() spends half its time in dmu_objset_do_userquota_updates
Using a benchmark which creates 2 million files in one TXG, I observe
that the thread running spa_sync() is on CPU almost the entire time we
are syncing, and therefore can be a performance bottleneck. About 50% of
the time in spa_sync() is in dmu_objset_do_userquota_updates().
The problem is that dmu_objset_do_userquota_updates() calls
zap_increment_int(DMU_USERUSED_OBJECT) once for every file that was
modified (or created). In this benchmark, all the files are owned by the
same user/group, so all 2 million calls to zap_increment_int() are
modifying the same entry in the zap. The same issue exists for the
DMU_GROUPUSED_OBJECT.
We should keep an in-memory map from user to space delta while we are
syncing, and when we finish, iterate over the in-memory map and modify
the ZAP once per entry. This reduces the number of calls to
zap_increment_int() from "number of objects modified" to "number of
owners/groups of modified files".
This reduced the time spent in spa_sync() in the file create benchmark
by ~33%, from 11 seconds to 7 seconds.
Upstream bugs: DLPX-44799
Ported by: Ned Bass <[email protected]>
OpenZFS-issue: https://www.illumos.org/issues/6988
ZFSonLinux-issue: https://github.com/zfsonlinux/zfs/issues/4642
OpenZFS-commit: unmerged
Porting notes:
- Added curly braces around declaration of userquota_cache_t cache to
quiet compiler warning;
- Handled the userobj accounting the same way it proposed in this path.
Signed-off-by: Jinshan Xiong <[email protected]>
Diffstat (limited to 'module')
-rw-r--r-- | module/zfs/dmu_objset.c | 140 | ||||
-rw-r--r-- | module/zfs/dnode.c | 3 |
2 files changed, 107 insertions, 36 deletions
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index c6b6eb745..b1f05cb61 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1338,37 +1338,113 @@ dmu_objset_userobjused_enabled(objset_t *os) spa_feature_is_enabled(os->os_spa, SPA_FEATURE_USEROBJ_ACCOUNTING)); } +typedef struct userquota_node { + /* must be in the first filed, see userquota_update_cache() */ + char uqn_id[20 + DMU_OBJACCT_PREFIX_LEN]; + int64_t uqn_delta; + avl_node_t uqn_node; +} userquota_node_t; + +typedef struct userquota_cache { + avl_tree_t uqc_user_deltas; + avl_tree_t uqc_group_deltas; +} userquota_cache_t; + +static int +userquota_compare(const void *l, const void *r) +{ + const userquota_node_t *luqn = l; + const userquota_node_t *ruqn = r; + + /* + * NB: can only access uqn_id because userquota_update_cache() doesn't + * pass in an entire userquota_node_t. + */ + return (strcmp(luqn->uqn_id, ruqn->uqn_id)); +} + +static void +do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) +{ + void *cookie; + userquota_node_t *uqn; + + ASSERT(dmu_tx_is_syncing(tx)); + + cookie = NULL; + while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas, + &cookie)) != NULL) { + VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx)); + kmem_free(uqn, sizeof (*uqn)); + } + avl_destroy(&cache->uqc_user_deltas); + + cookie = NULL; + while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas, + &cookie)) != NULL) { + VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx)); + kmem_free(uqn, sizeof (*uqn)); + } + avl_destroy(&cache->uqc_group_deltas); +} + +static void +userquota_update_cache(avl_tree_t *avl, const char *id, int64_t delta) +{ + userquota_node_t *uqn; + avl_index_t idx; + + ASSERT(strlen(id) < sizeof (uqn->uqn_id)); + /* + * Use id directly for searching because uqn_id is the first field of + * userquota_node_t and fields after uqn_id won't be accessed in + * avl_find(). + */ + uqn = avl_find(avl, (const void *)id, &idx); + if (uqn == NULL) { + uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP); + strcpy(uqn->uqn_id, id); + avl_insert(avl, uqn, idx); + } + uqn->uqn_delta += delta; +} + static void -do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, - uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) +do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags, + uint64_t user, uint64_t group, boolean_t subtract) { if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { int64_t delta = DNODE_MIN_SIZE + used; + char name[20]; + if (subtract) delta = -delta; - VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, - user, delta, tx)); - VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, - group, delta, tx)); + + (void) sprintf(name, "%llx", (longlong_t)user); + userquota_update_cache(&cache->uqc_user_deltas, name, delta); + + (void) sprintf(name, "%llx", (longlong_t)group); + userquota_update_cache(&cache->uqc_group_deltas, name, delta); } } static void -do_userobjquota_update(objset_t *os, uint64_t flags, uint64_t user, - uint64_t group, boolean_t subtract, dmu_tx_t *tx) +do_userobjquota_update(userquota_cache_t *cache, uint64_t flags, + uint64_t user, uint64_t group, boolean_t subtract) { if (flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) { char name[20 + DMU_OBJACCT_PREFIX_LEN]; + int delta = subtract ? -1 : 1; (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx", (longlong_t)user); - VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT, name, - subtract ? -1 : 1, tx)); + userquota_update_cache(&cache->uqc_user_deltas, name, delta); (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx", (longlong_t)group); - VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT, name, - subtract ? -1 : 1, tx)); + userquota_update_cache(&cache->uqc_group_deltas, name, delta); } } @@ -1377,9 +1453,15 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) { dnode_t *dn; list_t *list = &os->os_synced_dnodes; + userquota_cache_t cache = { { 0 } }; ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); + avl_create(&cache.uqc_user_deltas, userquota_compare, + sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); + avl_create(&cache.uqc_group_deltas, userquota_compare, + sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); + while ((dn = list_head(list))) { int flags; ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); @@ -1389,36 +1471,27 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) /* Allocate the user/groupused objects if necessary. */ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { - VERIFY(0 == zap_create_claim(os, - DMU_USERUSED_OBJECT, + VERIFY0(zap_create_claim(os, DMU_USERUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); - VERIFY(0 == zap_create_claim(os, - DMU_GROUPUSED_OBJECT, + VERIFY0(zap_create_claim(os, DMU_GROUPUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } - /* - * We intentionally modify the zap object even if the - * net delta is zero. Otherwise - * the block of the zap obj could be shared between - * datasets but need to be different between them after - * a bprewrite. - */ - flags = dn->dn_id_flags; ASSERT(flags); if (flags & DN_ID_OLD_EXIST) { - do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, - dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); - do_userobjquota_update(os, dn->dn_oldflags, - dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); + do_userquota_update(&cache, + dn->dn_oldused, dn->dn_oldflags, + dn->dn_olduid, dn->dn_oldgid, B_TRUE); + do_userobjquota_update(&cache, dn->dn_oldflags, + dn->dn_olduid, dn->dn_oldgid, B_TRUE); } if (flags & DN_ID_NEW_EXIST) { - do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), - dn->dn_phys->dn_flags, dn->dn_newuid, - dn->dn_newgid, B_FALSE, tx); - do_userobjquota_update(os, dn->dn_phys->dn_flags, - dn->dn_newuid, dn->dn_newgid, B_FALSE, tx); + do_userquota_update(&cache, + DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags, + dn->dn_newuid, dn->dn_newgid, B_FALSE); + do_userobjquota_update(&cache, dn->dn_phys->dn_flags, + dn->dn_newuid, dn->dn_newgid, B_FALSE); } mutex_enter(&dn->dn_mtx); @@ -1439,6 +1512,7 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) list_remove(list, dn); dnode_rele(dn, list); } + do_userquota_cacheflush(os, &cache, tx); } /* diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index f0b03bbba..6ba8207e2 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -2001,9 +2001,6 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, boolean_t hole; int i, inc, error, span; - dprintf("probing object %llu offset %llx level %d of %u\n", - dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); - hole = ((flags & DNODE_FIND_HOLE) != 0); inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; ASSERT(txg == 0 || !hole); |