From ba67d82142bc7034734a49a62998cfc96b1d0038 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Fri, 11 Dec 2020 10:26:02 -0800 Subject: Improve zfs receive performance with lightweight write The performance of `zfs receive` can be bottlenecked on the CPU consumed by the `receive_writer` thread, especially when receiving streams with small compressed block sizes. Much of the CPU is spent creating and destroying dbuf's and arc buf's, one for each `WRITE` record in the send stream. This commit introduces the concept of "lightweight writes", which allows `zfs receive` to write to the DMU by providing an ABD, and instantiating only a new type of `dbuf_dirty_record_t`. The dbuf and arc buf for this "dirty leaf block" are not instantiated. Because there is no dbuf with the dirty data, this mechanism doesn't support reading from "lightweight-dirty" blocks (they would see the on-disk state rather than the dirty data). Since the dedup-receive code has been removed, `zfs receive` is write-only, so this works fine. Because there are no arc bufs for the received data, the received data is no longer cached in the ARC. Testing a receive of a stream with average compressed block size of 4KB, this commit improves performance by 50%, while also reducing CPU usage by 50% of a CPU. On a per-block basis, CPU consumed by receive_writer() and dbuf_evict() is now 1/7th (14%) of what it was. Baseline: 450MB/s, CPU in receive_writer() 40% + dbuf_evict() 35% New: 670MB/s, CPU in receive_writer() 17% + dbuf_evict() 0% The code is also restructured in a few ways: Added a `dr_dnode` field to the dbuf_dirty_record_t. This simplifies some existing code that no longer needs `DB_DNODE_ENTER()` and related routines. The new field is needed by the lightweight-type dirty record. To ensure that the `dr_dnode` field remains valid until the dirty record is freed, we have to ensure that the `dnode_move()` doesn't relocate the dnode_t. To do this we keep a hold on the dnode until it's zio's have completed. This is already done by the user-accounting code (`userquota_updates_task()`), this commit extends that so that it always keeps the dnode hold until zio completion (see `dnode_rele_task()`). `dn_dirty_txg` was previously zeroed when the dnode was synced. This was not necessary, since its meaning can be "when was this dnode last dirtied". This change simplifies the new `dnode_rele_task()` code. Removed some dead code related to `DRR_WRITE_BYREF` (dedup receive). Reviewed-by: Brian Behlendorf Reviewed-by: Paul Dagnelie Reviewed-by: George Wilson Signed-off-by: Matthew Ahrens Closes #11105 --- module/zfs/dmu_objset.c | 135 ++++++++++++++++++++++++++++-------------------- 1 file changed, 80 insertions(+), 55 deletions(-) (limited to 'module/zfs/dmu_objset.c') diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 6c1d23f2b..14cfa4362 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -1235,7 +1235,7 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx) } VERIFY0(zio_wait(rzio)); - dmu_objset_do_userquota_updates(os, tx); + dmu_objset_sync_done(os, tx); taskq_wait(dp->dp_sync_taskq); if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { ASSERT3P(ds->ds_key_mapping, !=, NULL); @@ -1502,23 +1502,13 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) multilist_sublist_remove(list, dn); /* - * If we are not doing useraccounting (os_synced_dnodes == NULL) - * we are done with this dnode for this txg. Unset dn_dirty_txg - * if later txgs aren't dirtying it so that future holders do - * not get a stale value. Otherwise, we will do this in - * userquota_updates_task() when processing has completely - * finished for this txg. + * See the comment above dnode_rele_task() for an explanation + * of why this dnode hold is always needed (even when not + * doing user accounting). */ multilist_t *newlist = dn->dn_objset->os_synced_dnodes; - if (newlist != NULL) { - (void) dnode_add_ref(dn, newlist); - multilist_insert(newlist, dn); - } else { - mutex_enter(&dn->dn_mtx); - if (dn->dn_dirty_txg == tx->tx_txg) - dn->dn_dirty_txg = 0; - mutex_exit(&dn->dn_mtx); - } + (void) dnode_add_ref(dn, newlist); + multilist_insert(newlist, dn); dnode_sync(dn, tx); } @@ -1680,22 +1670,19 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) txgoff = tx->tx_txg & TXG_MASK; - if (dmu_objset_userused_enabled(os) && - (!os->os_encrypted || !dmu_objset_is_receiving(os))) { - /* - * We must create the list here because it uses the - * dn_dirty_link[] of this txg. But it may already - * exist because we call dsl_dataset_sync() twice per txg. - */ - if (os->os_synced_dnodes == NULL) { - os->os_synced_dnodes = - multilist_create(sizeof (dnode_t), - offsetof(dnode_t, dn_dirty_link[txgoff]), - dnode_multilist_index_func); - } else { - ASSERT3U(os->os_synced_dnodes->ml_offset, ==, - offsetof(dnode_t, dn_dirty_link[txgoff])); - } + /* + * We must create the list here because it uses the + * dn_dirty_link[] of this txg. But it may already + * exist because we call dsl_dataset_sync() twice per txg. + */ + if (os->os_synced_dnodes == NULL) { + os->os_synced_dnodes = + multilist_create(sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[txgoff]), + dnode_multilist_index_func); + } else { + ASSERT3U(os->os_synced_dnodes->ml_offset, ==, + offsetof(dnode_t, dn_dirty_link[txgoff])); } ml = os->os_dirty_dnodes[txgoff]; @@ -2002,8 +1989,6 @@ userquota_updates_task(void *arg) dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); - if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa)) - dn->dn_dirty_txg = 0; mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); @@ -2014,13 +1999,44 @@ userquota_updates_task(void *arg) kmem_free(uua, sizeof (*uua)); } -void -dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) +/* + * Release dnode holds from dmu_objset_sync_dnodes(). When the dnode is being + * synced (i.e. we have issued the zio's for blocks in the dnode), it can't be + * evicted because the block containing the dnode can't be evicted until it is + * written out. However, this hold is necessary to prevent the dnode_t from + * being moved (via dnode_move()) while it's still referenced by + * dbuf_dirty_record_t:dr_dnode. And dr_dnode is needed for + * dirty_lightweight_leaf-type dirty records. + * + * If we are doing user-object accounting, the dnode_rele() happens from + * userquota_updates_task() instead. + */ +static void +dnode_rele_task(void *arg) { - int num_sublists; + userquota_updates_arg_t *uua = arg; + objset_t *os = uua->uua_os; + + multilist_sublist_t *list = + multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx); + dnode_t *dn; + while ((dn = multilist_sublist_head(list)) != NULL) { + multilist_sublist_remove(list, dn); + dnode_rele(dn, os->os_synced_dnodes); + } + multilist_sublist_unlock(list); + kmem_free(uua, sizeof (*uua)); +} + +/* + * Return TRUE if userquota updates are needed. + */ +static boolean_t +dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx) +{ if (!dmu_objset_userused_enabled(os)) - return; + return (B_FALSE); /* * If this is a raw receive just return and handle accounting @@ -2030,10 +2046,10 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) * used for recovery. */ if (os->os_encrypted && dmu_objset_is_receiving(os)) - return; + return (B_FALSE); if (tx->tx_txg <= os->os_spa->spa_claim_max_txg) - return; + return (B_FALSE); /* Allocate the user/group/project used objects if necessary. */ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { @@ -2050,23 +2066,39 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } + return (B_TRUE); +} - num_sublists = multilist_get_num_sublists(os->os_synced_dnodes); +/* + * Dispatch taskq tasks to dp_sync_taskq to update the user accounting, and + * also release the holds on the dnodes from dmu_objset_sync_dnodes(). + * The caller must taskq_wait(dp_sync_taskq). + */ +void +dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx) +{ + boolean_t need_userquota = dmu_objset_do_userquota_updates_prep(os, tx); + + int num_sublists = multilist_get_num_sublists(os->os_synced_dnodes); for (int i = 0; i < num_sublists; i++) { - if (multilist_sublist_is_empty_idx(os->os_synced_dnodes, i)) - continue; userquota_updates_arg_t *uua = kmem_alloc(sizeof (*uua), KM_SLEEP); uua->uua_os = os; uua->uua_sublist_idx = i; uua->uua_tx = tx; - /* note: caller does taskq_wait() */ + + /* + * If we don't need to update userquotas, use + * dnode_rele_task() to call dnode_rele() + */ (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - userquota_updates_task, uua, 0); + need_userquota ? userquota_updates_task : dnode_rele_task, + uua, 0); /* callback frees uua */ } } + /* * Returns a pointer to data to find uid/gid from * @@ -2088,18 +2120,11 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dr == NULL) { data = NULL; } else { - dnode_t *dn; - - DB_DNODE_ENTER(dr->dr_dbuf); - dn = DB_DNODE(dr->dr_dbuf); - - if (dn->dn_bonuslen == 0 && + if (dr->dr_dnode->dn_bonuslen == 0 && dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) data = dr->dt.dl.dr_data->b_data; else data = dr->dt.dl.dr_data; - - DB_DNODE_EXIT(dr->dr_dbuf); } return (data); @@ -2990,7 +3015,7 @@ EXPORT_SYMBOL(dmu_objset_create_impl); EXPORT_SYMBOL(dmu_objset_open_impl); EXPORT_SYMBOL(dmu_objset_evict); EXPORT_SYMBOL(dmu_objset_register_type); -EXPORT_SYMBOL(dmu_objset_do_userquota_updates); +EXPORT_SYMBOL(dmu_objset_sync_done); EXPORT_SYMBOL(dmu_objset_userquota_get_ids); EXPORT_SYMBOL(dmu_objset_userused_enabled); EXPORT_SYMBOL(dmu_objset_userspace_upgrade); -- cgit v1.2.3