| author | Matthew Ahrens <[email protected]> | 2020-12-11 10:26:02 -0800 |
| --- | --- | --- |
| committer | GitHub <[email protected]> | 2020-12-11 10:26:02 -0800 |
| commit | ba67d82142bc7034734a49a62998cfc96b1d0038 (patch) | |
| tree | 571de79e9293abc687e93aea8912431997d00669 /include | |
| parent | 7d4b365ce32ecaf97020178f2a847a01f7e35476 (diff) | |
Improve zfs receive performance with lightweight write
The performance of `zfs receive` can be bottlenecked on the CPU consumed
by the `receive_writer` thread, especially when receiving streams with
small compressed block sizes. Much of the CPU is spent creating and
destroying dbufs and arc bufs, one for each `WRITE` record in the send
stream.
This commit introduces the concept of "lightweight writes", which allows
`zfs receive` to write to the DMU by providing an ABD, and instantiating
only a new type of `dbuf_dirty_record_t`. The dbuf and arc buf for this
"dirty leaf block" are not instantiated.
Because there is no dbuf with the dirty data, this mechanism doesn't
support reading from "lightweight-dirty" blocks (they would see the
on-disk state rather than the dirty data). Since the dedup-receive code
has been removed, `zfs receive` is write-only, so this works fine.
Because there are no arc bufs for the received data, the received data
is no longer cached in the ARC.
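As a concrete (and purely illustrative) sketch of the new path, the following shows how a caller such as the receive code might issue a lightweight write through the `dmu_lightweight_write_by_dnode()` interface declared in this change. The wrapper name `recv_lightweight_write()`, the use of `dmu_write_policy()` to fill in the write properties, and the ABD-ownership assumption are illustrative guesses, not the actual `dmu_recv.c` code.

```c
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/abd.h>
#include <sys/zio.h>
#include <sys/dbuf.h>

/*
 * Hypothetical illustration (not the actual receive_writer code): write one
 * received block through the lightweight path, without instantiating a dbuf
 * or an arc buf for it.
 */
static int
recv_lightweight_write(dnode_t *dn, uint64_t offset, void *payload,
    uint64_t size, dmu_tx_t *tx)
{
	zio_prop_t zp;
	int err;

	/* Wrap the received payload in an ABD instead of an arc buf. */
	abd_t *abd = abd_get_from_buf(payload, size);

	/* Derive checksum/compression/copies as a normal write would. */
	dmu_write_policy(dn->dn_objset, dn, 0, 0, &zp);

	/*
	 * Hand the ABD to the DMU; this instantiates only a
	 * dirty_lightweight_leaf-type dbuf_dirty_record_t.  We assume the
	 * DMU takes ownership of the ABD on success, so it is freed here
	 * only on failure.
	 */
	err = dmu_lightweight_write_by_dnode(dn, offset, abd, &zp, 0, tx);
	if (err != 0)
		abd_free(abd);

	return (err);
}
```

The real receive path derives the `zio_prop_t` from the send-stream record (compression, checksum, raw/encrypted flags); the point of the sketch is only that a write is now described by an ABD plus properties rather than by a dbuf and arc buf.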
Testing a receive of a stream with average compressed block size of 4KB,
this commit improves performance by 50%, while also reducing CPU usage
by half a CPU. On a per-block basis, the CPU consumed by receive_writer()
and dbuf_evict() is now 1/7th (14%) of what it was.
Baseline: 450MB/s, CPU in receive_writer() 40% + dbuf_evict() 35%
New: 670MB/s, CPU in receive_writer() 17% + dbuf_evict() 0%
The code is also restructured in a few ways:
Added a `dr_dnode` field to `dbuf_dirty_record_t`. This simplifies
some existing code that no longer needs `DB_DNODE_ENTER()` and related
routines. The new field is needed by the lightweight-type dirty record.
To ensure that the `dr_dnode` field remains valid until the dirty record
is freed, we have to ensure that `dnode_move()` doesn't relocate the
dnode_t. To do this we keep a hold on the dnode until its zios have
completed. This was already done by the user-accounting code
(`userquota_updates_task()`); this commit extends that so that the dnode
hold is always kept until zio completion (see `dnode_rele_task()`).
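A minimal sketch of that hold/release pairing, assuming nothing beyond the existing `dnode_add_ref()`/`dnode_rele()` interfaces (the function names below are made up; the real logic lives in `dnode_setdirty()`, `dmu_objset_sync_dnodes()`, `userquota_updates_task()`, and `dnode_rele_task()`):

```c
#include <sys/zfs_context.h>
#include <sys/dnode.h>

/*
 * Sketch: pin the dnode when it is dirtied/queued for syncing so that
 * dnode_move() cannot relocate the dnode_t while dirty records reference it
 * through dr_dnode.
 */
static void
example_hold_for_sync(dnode_t *dn, const void *tag)
{
	VERIFY(dnode_add_ref(dn, (void *)tag));
}

/*
 * Sketch: drop that hold only after all of the dnode's zios for the txg
 * have completed (compare dnode_rele_task()/userquota_updates_task()).
 */
static void
example_rele_after_zios(dnode_t *dn, const void *tag)
{
	dnode_rele(dn, (void *)tag);
}
```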
`dn_dirty_txg` was previously zeroed when the dnode was synced. This
was not necessary, since its meaning can be "when was this dnode last
dirtied". This change simplifies the new `dnode_rele_task()` code.
Removed some dead code related to `DRR_WRITE_BYREF` (dedup receive).
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Paul Dagnelie <[email protected]>
Reviewed-by: George Wilson <[email protected]>
Signed-off-by: Matthew Ahrens <[email protected]>
Closes #11105
Diffstat (limited to 'include')
| mode | path | lines changed |
| --- | --- | --- |
| -rw-r--r-- | include/sys/dbuf.h | 28 |
| -rw-r--r-- | include/sys/dmu_objset.h | 2 |
2 files changed, 28 insertions, 2 deletions
```diff
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index ca2154e12..d221eac4c 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
@@ -130,6 +130,16 @@ typedef struct dbuf_dirty_record {
 	/* list link for dbuf dirty records */
 	list_node_t dr_dbuf_node;
 
+	/*
+	 * The dnode we are part of.  Note that the dnode can not be moved or
+	 * evicted due to the hold that's added by dnode_setdirty() or
+	 * dmu_objset_sync_dnodes(), and released by dnode_rele_task() or
+	 * userquota_updates_task().  This hold is necessary for
+	 * dirty_lightweight_leaf-type dirty records, which don't have a hold
+	 * on a dbuf.
+	 */
+	dnode_t *dr_dnode;
+
 	/* pointer to parent dirty record */
 	struct dbuf_dirty_record *dr_parent;
 
@@ -171,6 +181,17 @@ typedef struct dbuf_dirty_record {
 			uint8_t dr_iv[ZIO_DATA_IV_LEN];
 			uint8_t dr_mac[ZIO_DATA_MAC_LEN];
 		} dl;
+		struct dirty_lightweight_leaf {
+			/*
+			 * This dirty record refers to a leaf (level=0)
+			 * block, whose dbuf has not been instantiated for
+			 * performance reasons.
+			 */
+			uint64_t dr_blkid;
+			abd_t *dr_abd;
+			zio_prop_t dr_props;
+			enum zio_flag dr_flags;
+		} dll;
 	} dt;
 } dbuf_dirty_record_t;
 
@@ -349,11 +370,16 @@ void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
+    dmu_tx_t *tx);
 arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
 void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
     bp_embedded_type_t etype, enum zio_compress comp,
     int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
 
+int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
+    const struct zio_prop *zp, enum zio_flag flags, dmu_tx_t *tx);
+
 void dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx);
 void dbuf_destroy(dmu_buf_impl_t *db);
diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h
index f27417c1f..a8cb81271 100644
--- a/include/sys/dmu_objset.h
+++ b/include/sys/dmu_objset.h
@@ -242,7 +242,7 @@ objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
 int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
     objset_t **osp);
 void dmu_objset_evict(objset_t *os);
-void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
+void dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx);
 void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
 boolean_t dmu_objset_userused_enabled(objset_t *os);
 void dmu_objset_userspace_upgrade(objset_t *os);
```