summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorMatthew Ahrens <[email protected]>2020-12-11 10:26:02 -0800
committerGitHub <[email protected]>2020-12-11 10:26:02 -0800
commitba67d82142bc7034734a49a62998cfc96b1d0038 (patch)
tree571de79e9293abc687e93aea8912431997d00669 /include
parent7d4b365ce32ecaf97020178f2a847a01f7e35476 (diff)
Improve zfs receive performance with lightweight write
The performance of `zfs receive` can be bottlenecked on the CPU consumed by the `receive_writer` thread, especially when receiving streams with small compressed block sizes. Much of the CPU is spent creating and destroying dbuf's and arc buf's, one for each `WRITE` record in the send stream. This commit introduces the concept of "lightweight writes", which allows `zfs receive` to write to the DMU by providing an ABD, and instantiating only a new type of `dbuf_dirty_record_t`. The dbuf and arc buf for this "dirty leaf block" are not instantiated. Because there is no dbuf with the dirty data, this mechanism doesn't support reading from "lightweight-dirty" blocks (they would see the on-disk state rather than the dirty data). Since the dedup-receive code has been removed, `zfs receive` is write-only, so this works fine. Because there are no arc bufs for the received data, the received data is no longer cached in the ARC. Testing a receive of a stream with average compressed block size of 4KB, this commit improves performance by 50%, while also reducing CPU usage by 50% of a CPU. On a per-block basis, CPU consumed by receive_writer() and dbuf_evict() is now 1/7th (14%) of what it was. Baseline: 450MB/s, CPU in receive_writer() 40% + dbuf_evict() 35% New: 670MB/s, CPU in receive_writer() 17% + dbuf_evict() 0% The code is also restructured in a few ways: Added a `dr_dnode` field to the dbuf_dirty_record_t. This simplifies some existing code that no longer needs `DB_DNODE_ENTER()` and related routines. The new field is needed by the lightweight-type dirty record. To ensure that the `dr_dnode` field remains valid until the dirty record is freed, we have to ensure that the `dnode_move()` doesn't relocate the dnode_t. To do this we keep a hold on the dnode until it's zio's have completed. This is already done by the user-accounting code (`userquota_updates_task()`), this commit extends that so that it always keeps the dnode hold until zio completion (see `dnode_rele_task()`). `dn_dirty_txg` was previously zeroed when the dnode was synced. This was not necessary, since its meaning can be "when was this dnode last dirtied". This change simplifies the new `dnode_rele_task()` code. Removed some dead code related to `DRR_WRITE_BYREF` (dedup receive). Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Paul Dagnelie <[email protected]> Reviewed-by: George Wilson <[email protected]> Signed-off-by: Matthew Ahrens <[email protected]> Closes #11105
Diffstat (limited to 'include')
-rw-r--r--include/sys/dbuf.h28
-rw-r--r--include/sys/dmu_objset.h2
2 files changed, 28 insertions, 2 deletions
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index ca2154e12..d221eac4c 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@@ -130,6 +130,16 @@ typedef struct dbuf_dirty_record {
/* list link for dbuf dirty records */
list_node_t dr_dbuf_node;
+ /*
+ * The dnode we are part of. Note that the dnode can not be moved or
+ * evicted due to the hold that's added by dnode_setdirty() or
+ * dmu_objset_sync_dnodes(), and released by dnode_rele_task() or
+ * userquota_updates_task(). This hold is necessary for
+ * dirty_lightweight_leaf-type dirty records, which don't have a hold
+ * on a dbuf.
+ */
+ dnode_t *dr_dnode;
+
/* pointer to parent dirty record */
struct dbuf_dirty_record *dr_parent;
@@ -171,6 +181,17 @@ typedef struct dbuf_dirty_record {
uint8_t dr_iv[ZIO_DATA_IV_LEN];
uint8_t dr_mac[ZIO_DATA_MAC_LEN];
} dl;
+ struct dirty_lightweight_leaf {
+ /*
+ * This dirty record refers to a leaf (level=0)
+ * block, whose dbuf has not been instantiated for
+ * performance reasons.
+ */
+ uint64_t dr_blkid;
+ abd_t *dr_abd;
+ zio_prop_t dr_props;
+ enum zio_flag dr_flags;
+ } dll;
} dt;
} dbuf_dirty_record_t;
@@ -349,11 +370,16 @@ void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
+ dmu_tx_t *tx);
arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
bp_embedded_type_t etype, enum zio_compress comp,
int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
+int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
+ const struct zio_prop *zp, enum zio_flag flags, dmu_tx_t *tx);
+
void dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx);
void dbuf_destroy(dmu_buf_impl_t *db);
diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h
index f27417c1f..a8cb81271 100644
--- a/include/sys/dmu_objset.h
+++ b/include/sys/dmu_objset.h
@@ -242,7 +242,7 @@ objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
objset_t **osp);
void dmu_objset_evict(objset_t *os);
-void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
+void dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx);
void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
boolean_t dmu_objset_userused_enabled(objset_t *os);
void dmu_objset_userspace_upgrade(objset_t *os);