author	Matthew Ahrens <[email protected]>	2020-12-11 10:26:02 -0800
committer	GitHub <[email protected]>	2020-12-11 10:26:02 -0800
commit	ba67d82142bc7034734a49a62998cfc96b1d0038 (patch)
tree	571de79e9293abc687e93aea8912431997d00669 /module/zfs/dmu_recv.c
parent	7d4b365ce32ecaf97020178f2a847a01f7e35476 (diff)
Improve zfs receive performance with lightweight write
The performance of `zfs receive` can be bottlenecked on the CPU consumed by the `receive_writer` thread, especially when receiving streams with small compressed block sizes. Much of the CPU is spent creating and destroying dbufs and arc bufs, one for each `WRITE` record in the send stream.

This commit introduces the concept of "lightweight writes", which allows `zfs receive` to write to the DMU by providing an ABD and instantiating only a new type of `dbuf_dirty_record_t`. The dbuf and arc buf for this "dirty leaf block" are not instantiated. Because there is no dbuf with the dirty data, this mechanism doesn't support reading from "lightweight-dirty" blocks (they would see the on-disk state rather than the dirty data). Since the dedup-receive code has been removed, `zfs receive` is write-only, so this works fine. Because there are no arc bufs for the received data, the received data is no longer cached in the ARC.

Testing a receive of a stream with an average compressed block size of 4KB, this commit improves performance by 50% while also reducing CPU usage by half of one CPU. On a per-block basis, the CPU consumed by receive_writer() and dbuf_evict() is now 1/7th (14%) of what it was.

Baseline: 450MB/s, CPU in receive_writer() 40% + dbuf_evict() 35%
New:      670MB/s, CPU in receive_writer() 17% + dbuf_evict() 0%

The code is also restructured in a few ways:

Added a `dr_dnode` field to the dbuf_dirty_record_t. This simplifies some existing code that no longer needs `DB_DNODE_ENTER()` and related routines. The new field is needed by the lightweight-type dirty record.

To ensure that the `dr_dnode` field remains valid until the dirty record is freed, we have to ensure that `dnode_move()` doesn't relocate the dnode_t. To do this we keep a hold on the dnode until its zios have completed. This is already done by the user-accounting code (`userquota_updates_task()`); this commit extends that so that the dnode hold is always kept until zio completion (see `dnode_rele_task()`).

`dn_dirty_txg` was previously zeroed when the dnode was synced. This was not necessary, since its meaning can be "when was this dnode last dirtied". This change simplifies the new `dnode_rele_task()` code.

Removed some dead code related to `DRR_WRITE_BYREF` (dedup receive).

Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Paul Dagnelie <[email protected]>
Reviewed-by: George Wilson <[email protected]>
Signed-off-by: Matthew Ahrens <[email protected]>
Closes #11105
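For orientation (not part of the patch): the following is a condensed C sketch of the new write path. The helper name receive_write_one() is invented here for illustration; the real logic lives inline in flush_write_batch_impl() in the diff below, which also handles raw streams, compressed records, byteswapping, and error propagation.

/*
 * Condensed sketch only -- see flush_write_batch_impl() in the diff
 * below for the complete logic. receive_write_one() is a hypothetical
 * helper name; the raw/compressed/byteswap cases are elided.
 */
static int
receive_write_one(struct receive_writer_arg *rwa, dnode_t *dn,
    struct drr_write *drrw, abd_t *abd, dmu_tx_t *tx)
{
	int err = 0;

	if (drrw->drr_logical_size != dn->dn_datablksz) {
		/*
		 * Oversized WRITE record (an incremental large-block
		 * stream received into a dataset that previously did a
		 * non-large-block receive): lightweight writes must be
		 * exactly one block, so fall back to a normal
		 * dmu_write_by_dnode() and free the ABD ourselves.
		 */
		dmu_write_by_dnode(dn, drrw->drr_offset,
		    drrw->drr_logical_size, abd_to_buf(abd), tx);
		abd_free(abd);
	} else {
		/*
		 * Common case: the record is exactly one block. Hand the
		 * ABD to the DMU directly; no dbuf or arc_buf is
		 * instantiated, and the DMU takes ownership of the ABD.
		 */
		zio_prop_t zp;
		dmu_write_policy(rwa->os, dn, 0, 0, &zp);
		err = dmu_lightweight_write_by_dnode(dn, drrw->drr_offset,
		    abd, &zp, 0, tx);
	}
	return (err);
}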
Diffstat (limited to 'module/zfs/dmu_recv.c')
-rw-r--r--	module/zfs/dmu_recv.c	283
1 file changed, 143 insertions(+), 140 deletions(-)
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 2eee19a28..f2d151578 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -79,10 +79,10 @@ struct receive_record_arg {
dmu_replay_record_t header;
void *payload; /* Pointer to a buffer containing the payload */
/*
- * If the record is a write, pointer to the arc_buf_t containing the
+ * If the record is a WRITE or SPILL, pointer to the abd containing the
* payload.
*/
- arc_buf_t *arc_buf;
+ abd_t *abd;
int payload_size;
uint64_t bytes_read; /* bytes read from stream when record created */
boolean_t eos_marker; /* Marks the end of the stream */
@@ -95,8 +95,8 @@ struct receive_writer_arg {
bqueue_t q;
/*
- * These three args are used to signal to the main thread that we're
- * done.
+ * These three members are used to signal to the main thread when
+ * we're done.
*/
kmutex_t mutex;
kcondvar_t cv;
@@ -175,18 +175,6 @@ byteswap_record(dmu_replay_record_t *drr)
DO64(drr_write.drr_key.ddk_prop);
DO64(drr_write.drr_compressed_size);
break;
- case DRR_WRITE_BYREF:
- DO64(drr_write_byref.drr_object);
- DO64(drr_write_byref.drr_offset);
- DO64(drr_write_byref.drr_length);
- DO64(drr_write_byref.drr_toguid);
- DO64(drr_write_byref.drr_refguid);
- DO64(drr_write_byref.drr_refobject);
- DO64(drr_write_byref.drr_refoffset);
- ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref.
- drr_key.ddk_cksum);
- DO64(drr_write_byref.drr_key.ddk_prop);
- break;
case DRR_WRITE_EMBEDDED:
DO64(drr_write_embedded.drr_object);
DO64(drr_write_embedded.drr_offset);
@@ -1903,58 +1891,106 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
struct receive_record_arg *rrd;
while ((rrd = list_head(&rwa->write_batch)) != NULL) {
struct drr_write *drrw = &rrd->header.drr_u.drr_write;
- arc_buf_t *abuf = rrd->arc_buf;
+ abd_t *abd = rrd->abd;
ASSERT3U(drrw->drr_object, ==, rwa->last_object);
- if (rwa->byteswap && !arc_is_encrypted(abuf) &&
- arc_get_compression(abuf) == ZIO_COMPRESS_OFF) {
- dmu_object_byteswap_t byteswap =
- DMU_OT_BYTESWAP(drrw->drr_type);
- dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
- DRR_WRITE_PAYLOAD_SIZE(drrw));
- }
-
- /*
- * If we are receiving an incremental large-block stream into
- * a dataset that previously did a non-large-block receive,
- * the WRITE record may be larger than the object's block
- * size. dmu_assign_arcbuf_by_dnode() handles this as long
- * as the arcbuf is not compressed, so decompress it here if
- * necessary.
- */
- if (drrw->drr_logical_size != dn->dn_datablksz &&
- arc_get_compression(abuf) != ZIO_COMPRESS_OFF) {
+ if (drrw->drr_logical_size != dn->dn_datablksz) {
+ /*
+ * The WRITE record is larger than the object's block
+ * size. We must be receiving an incremental
+ * large-block stream into a dataset that previously did
+ * a non-large-block receive. Lightweight writes must
+ * be exactly one block, so we need to decompress the
+ * data (if compressed) and do a normal dmu_write().
+ */
ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz);
- zbookmark_phys_t zb = {
- .zb_objset = dmu_objset_id(rwa->os),
- .zb_object = rwa->last_object,
- .zb_level = 0,
- .zb_blkid =
- drrw->drr_offset >> dn->dn_datablkshift,
- };
+ if (DRR_WRITE_COMPRESSED(drrw)) {
+ abd_t *decomp_abd =
+ abd_alloc_linear(drrw->drr_logical_size,
+ B_FALSE);
+
+ err = zio_decompress_data(
+ drrw->drr_compressiontype,
+ abd, abd_to_buf(decomp_abd),
+ abd_get_size(abd),
+ abd_get_size(decomp_abd), NULL);
+
+ if (err == 0) {
+ dmu_write_by_dnode(dn,
+ drrw->drr_offset,
+ drrw->drr_logical_size,
+ abd_to_buf(decomp_abd), tx);
+ }
+ abd_free(decomp_abd);
+ } else {
+ dmu_write_by_dnode(dn,
+ drrw->drr_offset,
+ drrw->drr_logical_size,
+ abd_to_buf(abd), tx);
+ }
+ if (err == 0)
+ abd_free(abd);
+ } else {
+ zio_prop_t zp;
+ dmu_write_policy(rwa->os, dn, 0, 0, &zp);
+
+ enum zio_flag zio_flags = 0;
+
+ if (rwa->raw) {
+ zp.zp_encrypt = B_TRUE;
+ zp.zp_compress = drrw->drr_compressiontype;
+ zp.zp_byteorder = ZFS_HOST_BYTEORDER ^
+ !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
+ rwa->byteswap;
+ bcopy(drrw->drr_salt, zp.zp_salt,
+ ZIO_DATA_SALT_LEN);
+ bcopy(drrw->drr_iv, zp.zp_iv,
+ ZIO_DATA_IV_LEN);
+ bcopy(drrw->drr_mac, zp.zp_mac,
+ ZIO_DATA_MAC_LEN);
+ if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) {
+ zp.zp_nopwrite = B_FALSE;
+ zp.zp_copies = MIN(zp.zp_copies,
+ SPA_DVAS_PER_BP - 1);
+ }
+ zio_flags |= ZIO_FLAG_RAW;
+ } else if (DRR_WRITE_COMPRESSED(drrw)) {
+ ASSERT3U(drrw->drr_compressed_size, >, 0);
+ ASSERT3U(drrw->drr_logical_size, >=,
+ drrw->drr_compressed_size);
+ zp.zp_compress = drrw->drr_compressiontype;
+ zio_flags |= ZIO_FLAG_RAW_COMPRESS;
+ } else if (rwa->byteswap) {
+ /*
+ * Note: compressed blocks never need to be
+ * byteswapped, because WRITE records for
+ * metadata blocks are never compressed. The
+ * exception is raw streams, which are written
+ * in the original byteorder, and the byteorder
+ * bit is preserved in the BP by setting
+ * zp_byteorder above.
+ */
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(drrw->drr_type);
+ dmu_ot_byteswap[byteswap].ob_func(
+ abd_to_buf(abd),
+ DRR_WRITE_PAYLOAD_SIZE(drrw));
+ }
/*
- * The size of loaned arc bufs is counted in
- * arc_loaned_bytes. When we untransform
- * (decompress) the buf, its size increases. To
- * ensure that arc_loaned_bytes remains accurate, we
- * need to return (un-loan) the buf (with its
- * compressed size) and then re-loan it (with its
- * new, uncompressed size).
+ * Since this data can't be read until the receive
+ * completes, we can do a "lightweight" write for
+ * improved performance.
*/
- arc_return_buf(abuf, FTAG);
- VERIFY0(arc_untransform(abuf, dmu_objset_spa(rwa->os),
- &zb, B_FALSE));
- arc_loan_inuse_buf(abuf, FTAG);
+ err = dmu_lightweight_write_by_dnode(dn,
+ drrw->drr_offset, abd, &zp, zio_flags, tx);
}
- err = dmu_assign_arcbuf_by_dnode(dn,
- drrw->drr_offset, abuf, tx);
if (err != 0) {
/*
* This rrd is left on the list, so the caller will
- * free it (and the arc_buf).
+ * free it (and the abd).
*/
break;
}
@@ -1987,7 +2023,7 @@ flush_write_batch(struct receive_writer_arg *rwa)
if (err != 0) {
struct receive_record_arg *rrd;
while ((rrd = list_remove_head(&rwa->write_batch)) != NULL) {
- dmu_return_arcbuf(rrd->arc_buf);
+ abd_free(rrd->abd);
kmem_free(rrd, sizeof (*rrd));
}
}
@@ -2090,9 +2126,8 @@ receive_write_embedded(struct receive_writer_arg *rwa,
static int
receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
- arc_buf_t *abuf)
+ abd_t *abd)
{
- dmu_tx_t *tx;
dmu_buf_t *db, *db_spill;
int err;
@@ -2107,7 +2142,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
* the DRR_FLAG_SPILL_BLOCK flag.
*/
if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) {
- dmu_return_arcbuf(abuf);
+ abd_free(abd);
return (0);
}
@@ -2131,7 +2166,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
return (err);
}
- tx = dmu_tx_create(rwa->os);
+ dmu_tx_t *tx = dmu_tx_create(rwa->os);
dmu_tx_hold_spill(tx, db->db_object);
@@ -2150,18 +2185,35 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
*/
if (db_spill->db_size != drrs->drr_length) {
dmu_buf_will_fill(db_spill, tx);
- VERIFY(0 == dbuf_spill_set_blksz(db_spill,
+ VERIFY0(dbuf_spill_set_blksz(db_spill,
drrs->drr_length, tx));
}
- if (rwa->byteswap && !arc_is_encrypted(abuf) &&
- arc_get_compression(abuf) == ZIO_COMPRESS_OFF) {
- dmu_object_byteswap_t byteswap =
- DMU_OT_BYTESWAP(drrs->drr_type);
- dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
- DRR_SPILL_PAYLOAD_SIZE(drrs));
+ arc_buf_t *abuf;
+ if (rwa->raw) {
+ boolean_t byteorder = ZFS_HOST_BYTEORDER ^
+ !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^
+ rwa->byteswap;
+
+ abuf = arc_loan_raw_buf(dmu_objset_spa(rwa->os),
+ drrs->drr_object, byteorder, drrs->drr_salt,
+ drrs->drr_iv, drrs->drr_mac, drrs->drr_type,
+ drrs->drr_compressed_size, drrs->drr_length,
+ drrs->drr_compressiontype, 0);
+ } else {
+ abuf = arc_loan_buf(dmu_objset_spa(rwa->os),
+ DMU_OT_IS_METADATA(drrs->drr_type),
+ drrs->drr_length);
+ if (rwa->byteswap) {
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(drrs->drr_type);
+ dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(abd),
+ DRR_SPILL_PAYLOAD_SIZE(drrs));
+ }
}
+ bcopy(abd_to_buf(abd), abuf->b_data, DRR_SPILL_PAYLOAD_SIZE(drrs));
+ abd_free(abd);
dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx);
dmu_buf_rele(db, FTAG);
@@ -2451,53 +2503,19 @@ receive_read_record(dmu_recv_cookie_t *drc)
case DRR_WRITE:
{
struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write;
- arc_buf_t *abuf;
- boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type);
-
- if (drc->drc_raw) {
- boolean_t byteorder = ZFS_HOST_BYTEORDER ^
- !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
- drc->drc_byteswap;
-
- abuf = arc_loan_raw_buf(dmu_objset_spa(drc->drc_os),
- drrw->drr_object, byteorder, drrw->drr_salt,
- drrw->drr_iv, drrw->drr_mac, drrw->drr_type,
- drrw->drr_compressed_size, drrw->drr_logical_size,
- drrw->drr_compressiontype, 0);
- } else if (DRR_WRITE_COMPRESSED(drrw)) {
- ASSERT3U(drrw->drr_compressed_size, >, 0);
- ASSERT3U(drrw->drr_logical_size, >=,
- drrw->drr_compressed_size);
- ASSERT(!is_meta);
- abuf = arc_loan_compressed_buf(
- dmu_objset_spa(drc->drc_os),
- drrw->drr_compressed_size, drrw->drr_logical_size,
- drrw->drr_compressiontype, 0);
- } else {
- abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os),
- is_meta, drrw->drr_logical_size);
- }
-
- err = receive_read_payload_and_next_header(drc,
- DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data);
+ int size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+ abd_t *abd = abd_alloc_linear(size, B_FALSE);
+ err = receive_read_payload_and_next_header(drc, size,
+ abd_to_buf(abd));
if (err != 0) {
- dmu_return_arcbuf(abuf);
+ abd_free(abd);
return (err);
}
- drc->drc_rrd->arc_buf = abuf;
+ drc->drc_rrd->abd = abd;
receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset,
drrw->drr_logical_size);
return (err);
}
- case DRR_WRITE_BYREF:
- {
- struct drr_write_byref *drrwb =
- &drc->drc_rrd->header.drr_u.drr_write_byref;
- err = receive_read_payload_and_next_header(drc, 0, NULL);
- receive_read_prefetch(drc, drrwb->drr_object, drrwb->drr_offset,
- drrwb->drr_length);
- return (err);
- }
case DRR_WRITE_EMBEDDED:
{
struct drr_write_embedded *drrwe =
@@ -2536,29 +2554,14 @@ receive_read_record(dmu_recv_cookie_t *drc)
case DRR_SPILL:
{
struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill;
- arc_buf_t *abuf;
- /* DRR_SPILL records are either raw or uncompressed */
- if (drc->drc_raw) {
- boolean_t byteorder = ZFS_HOST_BYTEORDER ^
- !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^
- drc->drc_byteswap;
-
- abuf = arc_loan_raw_buf(dmu_objset_spa(drc->drc_os),
- drrs->drr_object, byteorder, drrs->drr_salt,
- drrs->drr_iv, drrs->drr_mac, drrs->drr_type,
- drrs->drr_compressed_size, drrs->drr_length,
- drrs->drr_compressiontype, 0);
- } else {
- abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os),
- DMU_OT_IS_METADATA(drrs->drr_type),
- drrs->drr_length);
- }
- err = receive_read_payload_and_next_header(drc,
- DRR_SPILL_PAYLOAD_SIZE(drrs), abuf->b_data);
+ int size = DRR_SPILL_PAYLOAD_SIZE(drrs);
+ abd_t *abd = abd_alloc_linear(size, B_FALSE);
+ err = receive_read_payload_and_next_header(drc, size,
+ abd_to_buf(abd));
if (err != 0)
- dmu_return_arcbuf(abuf);
+ abd_free(abd);
else
- drc->drc_rrd->arc_buf = abuf;
+ drc->drc_rrd->abd = abd;
return (err);
}
case DRR_OBJECT_RANGE:
@@ -2687,9 +2690,9 @@ receive_process_record(struct receive_writer_arg *rwa,
if (rrd->header.drr_type != DRR_WRITE) {
err = flush_write_batch(rwa);
if (err != 0) {
- if (rrd->arc_buf != NULL) {
- dmu_return_arcbuf(rrd->arc_buf);
- rrd->arc_buf = NULL;
+ if (rrd->abd != NULL) {
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
rrd->payload = NULL;
} else if (rrd->payload != NULL) {
kmem_free(rrd->payload, rrd->payload_size);
@@ -2726,8 +2729,8 @@ receive_process_record(struct receive_writer_arg *rwa,
* the rrd or arc_buf.
*/
ASSERT(err != 0);
- dmu_return_arcbuf(rrd->arc_buf);
- rrd->arc_buf = NULL;
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
}
break;
}
@@ -2749,10 +2752,10 @@ receive_process_record(struct receive_writer_arg *rwa,
case DRR_SPILL:
{
struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
- err = receive_spill(rwa, drrs, rrd->arc_buf);
+ err = receive_spill(rwa, drrs, rrd->abd);
if (err != 0)
- dmu_return_arcbuf(rrd->arc_buf);
- rrd->arc_buf = NULL;
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
rrd->payload = NULL;
break;
}
@@ -2800,9 +2803,9 @@ receive_writer_thread(void *arg)
int err = 0;
if (rwa->err == 0) {
err = receive_process_record(rwa, rrd);
- } else if (rrd->arc_buf != NULL) {
- dmu_return_arcbuf(rrd->arc_buf);
- rrd->arc_buf = NULL;
+ } else if (rrd->abd != NULL) {
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
rrd->payload = NULL;
} else if (rrd->payload != NULL) {
kmem_free(rrd->payload, rrd->payload_size);