path: root/module/zfs/dmu.c
author     Matthew Ahrens <[email protected]>  2020-12-11 10:26:02 -0800
committer  GitHub <[email protected]>  2020-12-11 10:26:02 -0800
commit     ba67d82142bc7034734a49a62998cfc96b1d0038 (patch)
tree       571de79e9293abc687e93aea8912431997d00669 /module/zfs/dmu.c
parent     7d4b365ce32ecaf97020178f2a847a01f7e35476 (diff)
Improve zfs receive performance with lightweight write
The performance of `zfs receive` can be bottlenecked on the CPU consumed by the `receive_writer` thread, especially when receiving streams with small compressed block sizes. Much of the CPU is spent creating and destroying dbufs and arc bufs, one for each `WRITE` record in the send stream.

This commit introduces the concept of "lightweight writes", which allows `zfs receive` to write to the DMU by providing an ABD and instantiating only a new type of `dbuf_dirty_record_t`. The dbuf and arc buf for this "dirty leaf block" are not instantiated. Because there is no dbuf with the dirty data, this mechanism doesn't support reading from "lightweight-dirty" blocks (they would see the on-disk state rather than the dirty data). Since the dedup-receive code has been removed, `zfs receive` is write-only, so this works fine. Because there are no arc bufs for the received data, the received data is no longer cached in the ARC.

Testing a receive of a stream with an average compressed block size of 4KB, this commit improves performance by 50%, while also reducing CPU usage by 50% of a CPU. On a per-block basis, CPU consumed by receive_writer() and dbuf_evict() is now 1/7th (14%) of what it was.

Baseline: 450MB/s, CPU in receive_writer() 40% + dbuf_evict() 35%
New:      670MB/s, CPU in receive_writer() 17% + dbuf_evict() 0%

The code is also restructured in a few ways:

Added a `dr_dnode` field to the dbuf_dirty_record_t. This simplifies some existing code that no longer needs `DB_DNODE_ENTER()` and related routines. The new field is needed by the lightweight-type dirty record.

To ensure that the `dr_dnode` field remains valid until the dirty record is freed, we have to ensure that `dnode_move()` doesn't relocate the dnode_t. To do this we keep a hold on the dnode until its zios have completed. This is already done by the user-accounting code (`userquota_updates_task()`); this commit extends that so that it always keeps the dnode hold until zio completion (see `dnode_rele_task()`).

`dn_dirty_txg` was previously zeroed when the dnode was synced. This was not necessary, since its meaning can be "when was this dnode last dirtied". This change simplifies the new `dnode_rele_task()` code.

Removed some dead code related to `DRR_WRITE_BYREF` (dedup receive).

Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Paul Dagnelie <[email protected]>
Reviewed-by: George Wilson <[email protected]>
Signed-off-by: Matthew Ahrens <[email protected]>
Closes #11105
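To make the new interface concrete, here is a minimal sketch (not part of this commit) of how a write-only consumer such as the receive path might call `dmu_lightweight_write_by_dnode()`. The helper name `receive_block_lightweight()`, the ABD setup, and the fixed `zio_prop_t` values below are assumptions for illustration only; the real receive code derives its write properties differently and manages the dnode hold and transaction itself.

#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/abd.h>
#include <sys/zio.h>

/*
 * Hypothetical example: issue one block as a lightweight write.  The
 * dnode hold and the assigned dmu_tx_t are assumed to be managed by
 * the caller, as the receive code does.
 */
static int
receive_block_lightweight(dnode_t *dn, uint64_t offset, const void *buf,
    uint64_t len, dmu_tx_t *tx)
{
	abd_t *abd;
	zio_prop_t zp;
	int err;

	/* Wrap the payload in an ABD; on success the DMU consumes it. */
	abd = abd_alloc_linear(len, B_FALSE);
	abd_copy_from_buf(abd, buf, len);

	/*
	 * Write properties for the block.  A real caller would derive
	 * these from the dataset's policy; fixed values are used here
	 * for brevity.
	 */
	bzero(&zp, sizeof (zp));
	zp.zp_checksum = ZIO_CHECKSUM_INHERIT;
	zp.zp_compress = ZIO_COMPRESS_OFF;

	err = dmu_lightweight_write_by_dnode(dn, offset, abd, &zp, 0, tx);
	if (err != 0)
		abd_free(abd);	/* the abd is only consumed on success */
	return (err);
}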
Diffstat (limited to 'module/zfs/dmu.c')
-rw-r--r--  module/zfs/dmu.c  30
1 file changed, 28 insertions(+), 2 deletions(-)
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index a8288a8b4..a02f43df1 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1396,6 +1396,32 @@ dmu_return_arcbuf(arc_buf_t *buf)
}
/*
+ * A "lightweight" write is faster than a regular write (e.g.
+ * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the
+ * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]t. However, the
+ * data can not be read or overwritten until the transaction's txg has been
+ * synced. This makes it appropriate for workloads that are known to be
+ * (temporarily) write-only, like "zfs receive".
+ *
+ * A single block is written, starting at the specified offset in bytes. If
+ * the call is successful, it returns 0 and the provided abd has been
+ * consumed (the caller should not free it).
+ */
+int
+dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
+ const zio_prop_t *zp, enum zio_flag flags, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr =
+ dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx);
+ if (dr == NULL)
+ return (SET_ERROR(EIO));
+ dr->dt.dll.dr_abd = abd;
+ dr->dt.dll.dr_props = *zp;
+ dr->dt.dll.dr_flags = flags;
+ return (0);
+}
+
+/*
* When possible directly assign passed loaned arc buffer to a dbuf.
* If this is not possible copy the contents of passed arc buf via
* dmu_write().
@@ -1418,8 +1444,8 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
rw_exit(&dn->dn_struct_rwlock);
/*
- * We can only assign if the offset is aligned, the arc buf is the
- * same size as the dbuf, and the dbuf is not metadata.
+ * We can only assign if the offset is aligned and the arc buf is the
+ * same size as the dbuf.
*/
if (offset == db->db.db_offset && blksz == db->db.db_size) {
dbuf_assign_arcbuf(db, buf, tx);