author    Chunwei Chen <[email protected]>    2024-11-06 11:52:01 -0800
committer GitHub <[email protected]>    2024-11-06 11:52:01 -0800
commit    5945676bcc5d8b45554c93ea08a0d1f654c7075e (patch)
tree      6631f47851ff5e81aed233a6577a088e0d32bda1
parent    7b6e9675daefcc94393293c46e4672e66490757d (diff)
ZFS send should use spill block prefetched from send_reader_thread
Currently, even though send_reader_thread prefetches the spill block, do_dump() will not use it and issues its own blocking arc_read. This causes significant performance degradation when sending datasets with lots of spill blocks.

For unmodified spill blocks, we also create send_range structs for them in send_reader_thread and issue prefetches for them. We piggyback them on the dnode send_range instead of enqueueing them so we don't break the send_range_after check.

Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Alexander Motin <[email protected]>
Signed-off-by: Chunwei Chen <[email protected]>
Co-authored-by: david.chen <[email protected]>
Closes #16701
-rw-r--r--  module/zfs/dmu_send.c  126
1 file changed, 64 insertions(+), 62 deletions(-)
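To make the queueing change easier to follow before reading the diff, here is a minimal standalone sketch of the piggyback idea. It is written against toy types, not the real dmu_send.c structures; every name below (toy_range, toy_piggyback_spill, toy_dump) is invented for illustration only. The point it demonstrates: the reader attaches the prefetched spill record to its parent OBJECT record instead of enqueueing it separately, and the consumer dumps it immediately after the dnode, so the queue's ordering check (send_range_after in the real code) never sees an out-of-order entry.

/*
 * Toy sketch of the piggyback pattern used by this commit.
 * Not OpenZFS code; all identifiers are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

enum toy_type { TOY_OBJECT, TOY_DATA };

struct toy_range {
	enum toy_type type;
	unsigned long long object;
	unsigned long long start_blkid;
	struct toy_range *spill_range;	/* piggybacked spill record */
};

/* Reader side: attach a spill record to its dnode record, don't enqueue it. */
static void
toy_piggyback_spill(struct toy_range *obj, unsigned long long spill_blkid)
{
	struct toy_range *spill = calloc(1, sizeof (*spill));

	spill->type = TOY_DATA;
	spill->object = obj->object;
	spill->start_blkid = spill_blkid;
	obj->spill_range = spill;
}

/* Writer side: dump the dnode, then any piggybacked spill block. */
static void
toy_dump(const struct toy_range *range)
{
	if (range->type == TOY_OBJECT) {
		printf("OBJECT %llu\n", range->object);
		if (range->spill_range != NULL)
			toy_dump(range->spill_range);
	} else {
		printf("DATA   obj %llu blkid %llu\n", range->object,
		    range->start_blkid);
	}
}

int
main(void)
{
	struct toy_range obj = { .type = TOY_OBJECT, .object = 42 };

	toy_piggyback_spill(&obj, ~0ULL);	/* sentinel blkid for spill */
	toy_dump(&obj);
	free(obj.spill_range);
	return (0);
}

In the actual patch the same shape appears as the spill_range pointer added to struct send_range, the piggyback_unmodified_spill() helper in send_reader_thread, and the recursive do_dump() call right after dump_dnode().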
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index a174972e9..aeb8ff3b6 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -180,6 +180,8 @@ struct send_range {
*/
dnode_phys_t *dnp;
blkptr_t bp;
+ /* Piggyback unmodified spill block */
+ struct send_range *spill_range;
} object;
struct srr {
uint32_t datablksz;
@@ -231,6 +233,8 @@ range_free(struct send_range *range)
size_t size = sizeof (dnode_phys_t) *
(range->sru.object.dnp->dn_extra_slots + 1);
kmem_free(range->sru.object.dnp, size);
+ if (range->sru.object.spill_range)
+ range_free(range->sru.object.spill_range);
} else if (range->type == DATA) {
mutex_enter(&range->sru.data.lock);
while (range->sru.data.io_outstanding)
@@ -617,7 +621,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
drrs->drr_length = blksz;
drrs->drr_toguid = dscp->dsc_toguid;
- /* See comment in dump_dnode() for full details */
+ /* See comment in piggyback_unmodified_spill() for full details */
if (zfs_send_unmodified_spill_blocks &&
(BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) {
drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
@@ -793,35 +797,6 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
(dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
return (SET_ERROR(EINTR));
- /*
- * Send DRR_SPILL records for unmodified spill blocks. This is useful
- * because changing certain attributes of the object (e.g. blocksize)
- * can cause old versions of ZFS to incorrectly remove a spill block.
- * Including these records in the stream forces an up to date version
- * to always be written ensuring they're never lost. Current versions
- * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
- * ignore these unmodified spill blocks.
- */
- if (zfs_send_unmodified_spill_blocks &&
- (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
- (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) {
- struct send_range record;
- blkptr_t *bp = DN_SPILL_BLKPTR(dnp);
-
- memset(&record, 0, sizeof (struct send_range));
- record.type = DATA;
- record.object = object;
- record.eos_marker = B_FALSE;
- record.start_blkid = DMU_SPILL_BLKID;
- record.end_blkid = record.start_blkid + 1;
- record.sru.data.bp = *bp;
- record.sru.data.obj_type = dnp->dn_type;
- record.sru.data.datablksz = BP_GET_LSIZE(bp);
-
- if (do_dump(dscp, &record) != 0)
- return (SET_ERROR(EINTR));
- }
-
if (dscp->dsc_err != 0)
return (SET_ERROR(EINTR));
@@ -911,6 +886,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
case OBJECT:
err = dump_dnode(dscp, &range->sru.object.bp, range->object,
range->sru.object.dnp);
+ /* Dump piggybacked unmodified spill block */
+ if (!err && range->sru.object.spill_range)
+ err = do_dump(dscp, range->sru.object.spill_range);
return (err);
case OBJECT_RANGE: {
ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
@@ -939,34 +917,7 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp));
ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
- if (BP_GET_TYPE(bp) == DMU_OT_SA) {
- arc_flags_t aflags = ARC_FLAG_WAIT;
- zio_flag_t zioflags = ZIO_FLAG_CANFAIL;
- if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
- ASSERT(BP_IS_PROTECTED(bp));
- zioflags |= ZIO_FLAG_RAW;
- }
-
- zbookmark_phys_t zb;
- ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID);
- zb.zb_objset = dmu_objset_id(dscp->dsc_os);
- zb.zb_object = range->object;
- zb.zb_level = 0;
- zb.zb_blkid = range->start_blkid;
-
- arc_buf_t *abuf = NULL;
- if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa,
- bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
- zioflags, &aflags, &zb) != 0)
- return (SET_ERROR(EIO));
-
- err = dump_spill(dscp, bp, zb.zb_object,
- (abuf == NULL ? NULL : abuf->b_data));
- if (abuf != NULL)
- arc_buf_destroy(abuf, &abuf);
- return (err);
- }
if (send_do_embed(bp, dscp->dsc_featureflags)) {
err = dump_write_embedded(dscp, range->object,
range->start_blkid * srdp->datablksz,
@@ -975,8 +926,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
}
ASSERT(range->object > dscp->dsc_resume_object ||
(range->object == dscp->dsc_resume_object &&
+ (range->start_blkid == DMU_SPILL_BLKID ||
range->start_blkid * srdp->datablksz >=
- dscp->dsc_resume_offset));
+ dscp->dsc_resume_offset)));
/* it's a level-0 block of a regular object */
mutex_enter(&srdp->lock);
@@ -1006,8 +958,6 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
ASSERT(dscp->dsc_dso->dso_dryrun ||
srdp->abuf != NULL || srdp->abd != NULL);
- uint64_t offset = range->start_blkid * srdp->datablksz;
-
char *data = NULL;
if (srdp->abd != NULL) {
data = abd_to_buf(srdp->abd);
@@ -1016,6 +966,14 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
data = srdp->abuf->b_data;
}
+ if (BP_GET_TYPE(bp) == DMU_OT_SA) {
+ ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID);
+ err = dump_spill(dscp, bp, range->object, data);
+ return (err);
+ }
+
+ uint64_t offset = range->start_blkid * srdp->datablksz;
+
/*
* If we have large blocks stored on disk but the send flags
* don't allow us to send large blocks, we split the data from
@@ -1098,6 +1056,8 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid,
range->sru.data.io_outstanding = 0;
range->sru.data.io_err = 0;
range->sru.data.io_compressed = B_FALSE;
+ } else if (type == OBJECT) {
+ range->sru.object.spill_range = NULL;
}
return (range);
}
@@ -1743,6 +1703,45 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn,
}
/*
+ * Send DRR_SPILL records for unmodified spill blocks. This is useful
+ * because changing certain attributes of the object (e.g. blocksize)
+ * can cause old versions of ZFS to incorrectly remove a spill block.
+ * Including these records in the stream forces an up to date version
+ * to always be written ensuring they're never lost. Current versions
+ * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
+ * ignore these unmodified spill blocks.
+ *
+ * We piggyback the spill_range to dnode range instead of enqueueing it
+ * so send_range_after won't complain.
+ */
+static uint64_t
+piggyback_unmodified_spill(struct send_reader_thread_arg *srta,
+ struct send_range *range)
+{
+ ASSERT3U(range->type, ==, OBJECT);
+
+ dnode_phys_t *dnp = range->sru.object.dnp;
+ uint64_t fromtxg = srta->smta->to_arg->fromtxg;
+
+ if (!zfs_send_unmodified_spill_blocks ||
+ !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ||
+ !(BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= fromtxg))
+ return (0);
+
+ blkptr_t *bp = DN_SPILL_BLKPTR(dnp);
+ struct send_range *spill_range = range_alloc(DATA, range->object,
+ DMU_SPILL_BLKID, DMU_SPILL_BLKID+1, B_FALSE);
+ spill_range->sru.data.bp = *bp;
+ spill_range->sru.data.obj_type = dnp->dn_type;
+ spill_range->sru.data.datablksz = BP_GET_LSIZE(bp);
+
+ issue_data_read(srta, spill_range);
+ range->sru.object.spill_range = spill_range;
+
+ return (BP_GET_LSIZE(bp));
+}
+
+/*
* This thread is responsible for two things: First, it retrieves the correct
* blkptr in the to ds if we need to send the data because of something from
* the from thread. As a result of this, we're the first ones to discover that
@@ -1773,17 +1772,20 @@ send_reader_thread(void *arg)
uint64_t last_obj_exists = B_TRUE;
while (!range->eos_marker && !srta->cancel && smta->error == 0 &&
err == 0) {
+ uint64_t spill = 0;
switch (range->type) {
case DATA:
issue_data_read(srta, range);
bqueue_enqueue(outq, range, range->sru.data.datablksz);
range = get_next_range_nofree(inq, range);
break;
- case HOLE:
case OBJECT:
+ spill = piggyback_unmodified_spill(srta, range);
+ zfs_fallthrough;
+ case HOLE:
case OBJECT_RANGE:
case REDACT: // Redacted blocks must exist
- bqueue_enqueue(outq, range, sizeof (*range));
+ bqueue_enqueue(outq, range, sizeof (*range) + spill);
range = get_next_range_nofree(inq, range);
break;
case PREVIOUSLY_REDACTED: {