aboutsummaryrefslogtreecommitdiffstats
path: root/module/zfs/dmu_objset.c
diff options
context:
space:
mode:
authorMatthew Ahrens <[email protected]>2020-12-11 10:26:02 -0800
committerGitHub <[email protected]>2020-12-11 10:26:02 -0800
commitba67d82142bc7034734a49a62998cfc96b1d0038 (patch)
tree571de79e9293abc687e93aea8912431997d00669 /module/zfs/dmu_objset.c
parent7d4b365ce32ecaf97020178f2a847a01f7e35476 (diff)
Improve zfs receive performance with lightweight write
The performance of `zfs receive` can be bottlenecked on the CPU consumed by the `receive_writer` thread, especially when receiving streams with small compressed block sizes. Much of the CPU is spent creating and destroying dbuf's and arc buf's, one for each `WRITE` record in the send stream. This commit introduces the concept of "lightweight writes", which allows `zfs receive` to write to the DMU by providing an ABD, and instantiating only a new type of `dbuf_dirty_record_t`. The dbuf and arc buf for this "dirty leaf block" are not instantiated. Because there is no dbuf with the dirty data, this mechanism doesn't support reading from "lightweight-dirty" blocks (they would see the on-disk state rather than the dirty data). Since the dedup-receive code has been removed, `zfs receive` is write-only, so this works fine. Because there are no arc bufs for the received data, the received data is no longer cached in the ARC. Testing a receive of a stream with average compressed block size of 4KB, this commit improves performance by 50%, while also reducing CPU usage by 50% of a CPU. On a per-block basis, CPU consumed by receive_writer() and dbuf_evict() is now 1/7th (14%) of what it was. Baseline: 450MB/s, CPU in receive_writer() 40% + dbuf_evict() 35% New: 670MB/s, CPU in receive_writer() 17% + dbuf_evict() 0% The code is also restructured in a few ways: Added a `dr_dnode` field to the dbuf_dirty_record_t. This simplifies some existing code that no longer needs `DB_DNODE_ENTER()` and related routines. The new field is needed by the lightweight-type dirty record. To ensure that the `dr_dnode` field remains valid until the dirty record is freed, we have to ensure that the `dnode_move()` doesn't relocate the dnode_t. To do this we keep a hold on the dnode until it's zio's have completed. This is already done by the user-accounting code (`userquota_updates_task()`), this commit extends that so that it always keeps the dnode hold until zio completion (see `dnode_rele_task()`). `dn_dirty_txg` was previously zeroed when the dnode was synced. This was not necessary, since its meaning can be "when was this dnode last dirtied". This change simplifies the new `dnode_rele_task()` code. Removed some dead code related to `DRR_WRITE_BYREF` (dedup receive). Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Paul Dagnelie <[email protected]> Reviewed-by: George Wilson <[email protected]> Signed-off-by: Matthew Ahrens <[email protected]> Closes #11105
Diffstat (limited to 'module/zfs/dmu_objset.c')
-rw-r--r--module/zfs/dmu_objset.c135
1 files changed, 80 insertions, 55 deletions
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 6c1d23f2b..14cfa4362 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@@ -1235,7 +1235,7 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
}
VERIFY0(zio_wait(rzio));
- dmu_objset_do_userquota_updates(os, tx);
+ dmu_objset_sync_done(os, tx);
taskq_wait(dp->dp_sync_taskq);
if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
ASSERT3P(ds->ds_key_mapping, !=, NULL);
@@ -1502,23 +1502,13 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
multilist_sublist_remove(list, dn);
/*
- * If we are not doing useraccounting (os_synced_dnodes == NULL)
- * we are done with this dnode for this txg. Unset dn_dirty_txg
- * if later txgs aren't dirtying it so that future holders do
- * not get a stale value. Otherwise, we will do this in
- * userquota_updates_task() when processing has completely
- * finished for this txg.
+ * See the comment above dnode_rele_task() for an explanation
+ * of why this dnode hold is always needed (even when not
+ * doing user accounting).
*/
multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
- if (newlist != NULL) {
- (void) dnode_add_ref(dn, newlist);
- multilist_insert(newlist, dn);
- } else {
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_dirty_txg == tx->tx_txg)
- dn->dn_dirty_txg = 0;
- mutex_exit(&dn->dn_mtx);
- }
+ (void) dnode_add_ref(dn, newlist);
+ multilist_insert(newlist, dn);
dnode_sync(dn, tx);
}
@@ -1680,22 +1670,19 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
txgoff = tx->tx_txg & TXG_MASK;
- if (dmu_objset_userused_enabled(os) &&
- (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
- /*
- * We must create the list here because it uses the
- * dn_dirty_link[] of this txg. But it may already
- * exist because we call dsl_dataset_sync() twice per txg.
- */
- if (os->os_synced_dnodes == NULL) {
- os->os_synced_dnodes =
- multilist_create(sizeof (dnode_t),
- offsetof(dnode_t, dn_dirty_link[txgoff]),
- dnode_multilist_index_func);
- } else {
- ASSERT3U(os->os_synced_dnodes->ml_offset, ==,
- offsetof(dnode_t, dn_dirty_link[txgoff]));
- }
+ /*
+ * We must create the list here because it uses the
+ * dn_dirty_link[] of this txg. But it may already
+ * exist because we call dsl_dataset_sync() twice per txg.
+ */
+ if (os->os_synced_dnodes == NULL) {
+ os->os_synced_dnodes =
+ multilist_create(sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[txgoff]),
+ dnode_multilist_index_func);
+ } else {
+ ASSERT3U(os->os_synced_dnodes->ml_offset, ==,
+ offsetof(dnode_t, dn_dirty_link[txgoff]));
}
ml = os->os_dirty_dnodes[txgoff];
@@ -2002,8 +1989,6 @@ userquota_updates_task(void *arg)
dn->dn_id_flags |= DN_ID_CHKED_BONUS;
}
dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
- if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa))
- dn->dn_dirty_txg = 0;
mutex_exit(&dn->dn_mtx);
multilist_sublist_remove(list, dn);
@@ -2014,13 +1999,44 @@ userquota_updates_task(void *arg)
kmem_free(uua, sizeof (*uua));
}
-void
-dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
+/*
+ * Release dnode holds from dmu_objset_sync_dnodes(). When the dnode is being
+ * synced (i.e. we have issued the zio's for blocks in the dnode), it can't be
+ * evicted because the block containing the dnode can't be evicted until it is
+ * written out. However, this hold is necessary to prevent the dnode_t from
+ * being moved (via dnode_move()) while it's still referenced by
+ * dbuf_dirty_record_t:dr_dnode. And dr_dnode is needed for
+ * dirty_lightweight_leaf-type dirty records.
+ *
+ * If we are doing user-object accounting, the dnode_rele() happens from
+ * userquota_updates_task() instead.
+ */
+static void
+dnode_rele_task(void *arg)
{
- int num_sublists;
+ userquota_updates_arg_t *uua = arg;
+ objset_t *os = uua->uua_os;
+
+ multilist_sublist_t *list =
+ multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx);
+ dnode_t *dn;
+ while ((dn = multilist_sublist_head(list)) != NULL) {
+ multilist_sublist_remove(list, dn);
+ dnode_rele(dn, os->os_synced_dnodes);
+ }
+ multilist_sublist_unlock(list);
+ kmem_free(uua, sizeof (*uua));
+}
+
+/*
+ * Return TRUE if userquota updates are needed.
+ */
+static boolean_t
+dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx)
+{
if (!dmu_objset_userused_enabled(os))
- return;
+ return (B_FALSE);
/*
* If this is a raw receive just return and handle accounting
@@ -2030,10 +2046,10 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
* used for recovery.
*/
if (os->os_encrypted && dmu_objset_is_receiving(os))
- return;
+ return (B_FALSE);
if (tx->tx_txg <= os->os_spa->spa_claim_max_txg)
- return;
+ return (B_FALSE);
/* Allocate the user/group/project used objects if necessary. */
if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
@@ -2050,23 +2066,39 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT,
DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
}
+ return (B_TRUE);
+}
- num_sublists = multilist_get_num_sublists(os->os_synced_dnodes);
+/*
+ * Dispatch taskq tasks to dp_sync_taskq to update the user accounting, and
+ * also release the holds on the dnodes from dmu_objset_sync_dnodes().
+ * The caller must taskq_wait(dp_sync_taskq).
+ */
+void
+dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx)
+{
+ boolean_t need_userquota = dmu_objset_do_userquota_updates_prep(os, tx);
+
+ int num_sublists = multilist_get_num_sublists(os->os_synced_dnodes);
for (int i = 0; i < num_sublists; i++) {
- if (multilist_sublist_is_empty_idx(os->os_synced_dnodes, i))
- continue;
userquota_updates_arg_t *uua =
kmem_alloc(sizeof (*uua), KM_SLEEP);
uua->uua_os = os;
uua->uua_sublist_idx = i;
uua->uua_tx = tx;
- /* note: caller does taskq_wait() */
+
+ /*
+ * If we don't need to update userquotas, use
+ * dnode_rele_task() to call dnode_rele()
+ */
(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
- userquota_updates_task, uua, 0);
+ need_userquota ? userquota_updates_task : dnode_rele_task,
+ uua, 0);
/* callback frees uua */
}
}
+
/*
* Returns a pointer to data to find uid/gid from
*
@@ -2088,18 +2120,11 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (dr == NULL) {
data = NULL;
} else {
- dnode_t *dn;
-
- DB_DNODE_ENTER(dr->dr_dbuf);
- dn = DB_DNODE(dr->dr_dbuf);
-
- if (dn->dn_bonuslen == 0 &&
+ if (dr->dr_dnode->dn_bonuslen == 0 &&
dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
data = dr->dt.dl.dr_data->b_data;
else
data = dr->dt.dl.dr_data;
-
- DB_DNODE_EXIT(dr->dr_dbuf);
}
return (data);
@@ -2990,7 +3015,7 @@ EXPORT_SYMBOL(dmu_objset_create_impl);
EXPORT_SYMBOL(dmu_objset_open_impl);
EXPORT_SYMBOL(dmu_objset_evict);
EXPORT_SYMBOL(dmu_objset_register_type);
-EXPORT_SYMBOL(dmu_objset_do_userquota_updates);
+EXPORT_SYMBOL(dmu_objset_sync_done);
EXPORT_SYMBOL(dmu_objset_userquota_get_ids);
EXPORT_SYMBOL(dmu_objset_userused_enabled);
EXPORT_SYMBOL(dmu_objset_userspace_upgrade);