author     Brian Behlendorf <[email protected]>  2019-05-07 15:18:44 -0700
committer  GitHub <[email protected]>           2019-05-07 15:18:44 -0700
commit     caf9dd209fdcfccabc2f32b3f23c5386ccfb896c (patch)
tree       8df4542698b146a103602702113a55afba053e27 /module/zfs
parent     9c53e51616c99592973ebf94b4fd54a5f8c8756d (diff)
Fix send/recv lost spill block
When receiving a DRR_OBJECT record the receive_object() function needs to determine how to handle a spill block associated with the object. It may need to be removed or kept depending on how the object was modified at the source.

This determination is currently accomplished using a heuristic which takes into account the DRR_OBJECT record and the existing object properties. This is a problem because there isn't quite enough information available to do the right thing under all circumstances. For example, when only the block size changes, the spill block is removed when it should be kept.

What's needed to resolve this is an additional flag in the DRR_OBJECT record which indicates whether the object being received references a spill block. The DRR_OBJECT_SPILL flag was added for this purpose. When set, the object references a spill block and it must be kept: either it is already up to date, or it will be replaced by a subsequent DRR_SPILL record. Conversely, if the object being received doesn't reference a spill block, then any existing spill block should always be removed.

Since previous versions of ZFS do not understand this new flag, additional DRR_SPILL records will be inserted into the stream. This has the advantage of being fully backward compatible. Existing ZFS systems receiving this stream will recreate the spill block if it was incorrectly removed. Updated ZFS versions will correctly ignore the additional spill blocks, which can be identified by checking for the DRR_SPILL_UNMODIFIED flag.

The small downside to this approach is that it may increase the size of the stream and of the received snapshot on previous versions of ZFS. Additionally, when receiving streams generated by previous unpatched versions of ZFS, spill blocks may still be lost.

OpenZFS-issue: https://www.illumos.org/issues/9952
FreeBSD-issue: https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=233277

Reviewed-by: Paul Dagnelie <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Tom Caputi <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #8668
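For orientation, the stream-format flags this commit introduces live in include/sys/zfs_ioctl.h and therefore fall outside the module/zfs diff below. A sketch of their likely shape (bit positions and exact definitions are illustrative, not copied from the header):

    /* DRR_BEGIN drr_flags bit: sender understands spill-block semantics. */
    #define DRR_FLAG_SPILL_BLOCK    (1 << 3)    /* illustrative bit */

    /* DRR_OBJECT drr_flags bit: the object references a spill block. */
    #define DRR_OBJECT_SPILL        (1 << 0)    /* illustrative bit */

    /* DRR_SPILL drr_flags bit: record sent only for backward compat. */
    #define DRR_SPILL_UNMODIFIED    (1 << 2)    /* illustrative bit */

    #define DRR_OBJECT_HAS_SPILL(flags)     (((flags) & DRR_OBJECT_SPILL) != 0)
    #define DRR_SPILL_IS_UNMODIFIED(flags)  (((flags) & DRR_SPILL_UNMODIFIED) != 0)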
Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/dbuf.c         2
-rw-r--r--  module/zfs/dmu_object.c  31
-rw-r--r--  module/zfs/dmu_recv.c    67
-rw-r--r--  module/zfs/dmu_send.c    52
-rw-r--r--  module/zfs/dnode.c        5
5 files changed, 142 insertions, 15 deletions
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index d52a520fa..07e616f6f 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -2466,7 +2466,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
ASSERT(db->db_level == 0);
ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
ASSERT(buf != NULL);
- ASSERT(arc_buf_lsize(buf) == db->db.db_size);
+ ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
ASSERT(tx->tx_txg != 0);
arc_return_buf(buf, db);
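The dbuf.c hunk above is purely diagnostic: ASSERT3U() evaluates and reports both operands when it fires, while plain ASSERT() can only echo the expression text. Roughly (failure messages paraphrased, not exact SPL output):

    /* Before: a failure reports only the expression. */
    ASSERT(arc_buf_lsize(buf) == db->db.db_size);
    /*   -> assertion failed: arc_buf_lsize(buf) == db->db.db_size */

    /* After: a failure also reports the two values being compared. */
    ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
    /*   -> arc_buf_lsize(buf) == db->db.db_size (0x20000 == 0x4000) failed */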
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
index e77ebeca5..ec78ebbdc 100644
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -24,6 +24,7 @@
* Copyright 2014 HybridCluster. All rights reserved.
*/
+#include <sys/dbuf.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
@@ -304,13 +305,13 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
- bonuslen, DNODE_MIN_SIZE, tx));
+ bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
}
int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
- dmu_tx_t *tx)
+ boolean_t keep_spill, dmu_tx_t *tx)
{
dnode_t *dn;
int dn_slots = dnodesize >> DNODE_SHIFT;
@@ -327,7 +328,30 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
if (err)
return (err);
- dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);
+ dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
+ keep_spill, tx);
+
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
+ FTAG, &dn);
+ if (err)
+ return (err);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ dbuf_rm_spill(dn, tx);
+ dnode_rm_spill(dn, tx);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
return (err);
@@ -489,6 +513,7 @@ EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
+EXPORT_SYMBOL(dmu_object_rm_spill);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
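The new dmu_object_rm_spill() expects its caller to supply an assigned transaction, as receive_object() does below. A minimal sketch of that calling pattern, assuming dmu_tx_hold_spill() is the appropriate hold and trimming error paths:

    static int
    rm_spill_example(objset_t *os, uint64_t object)
    {
            dmu_tx_t *tx = dmu_tx_create(os);
            int err;

            dmu_tx_hold_spill(tx, object);  /* declare intent to modify spill */
            err = dmu_tx_assign(tx, TXG_WAIT);
            if (err != 0) {
                    dmu_tx_abort(tx);
                    return (err);
            }

            err = dmu_object_rm_spill(os, object, tx);
            dmu_tx_commit(tx);
            return (err);
    }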
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index fc5d47f5f..976b1bd46 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -274,6 +274,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/* embedded data is incompatible with encryption and raw recv */
if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
return (SET_ERROR(EINVAL));
+
+ /* raw receives require spill block allocation flag */
+ if (!(flags & DRR_FLAG_SPILL_BLOCK))
+ return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
} else {
dsflags |= DS_HOLD_FLAG_DECRYPT;
}
@@ -615,8 +619,13 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
(void) snprintf(recvname, sizeof (recvname), "%s/%s",
tofs, recv_clone_name);
- if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0)
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ /* raw receives require spill block allocation flag */
+ if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK))
+ return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
+ } else {
dsflags |= DS_HOLD_FLAG_DECRYPT;
+ }
if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
/* %recv does not exist; continue in tofs */
@@ -764,6 +773,9 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
return (SET_ERROR(EINVAL));
}
+ if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)
+ drc->drc_spill = B_TRUE;
+
drba.drba_origin = origin;
drba.drba_cookie = drc;
drba.drba_cred = CRED();
@@ -835,7 +847,8 @@ struct receive_writer_arg {
/* A map from guid to dataset to help handle dedup'd streams. */
avl_tree_t *guid_to_ds_map;
boolean_t resumable;
- boolean_t raw;
+ boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */
+ boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */
uint64_t last_object;
uint64_t last_offset;
uint64_t max_object; /* highest object ID referenced in stream */
@@ -1151,10 +1164,19 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
drro->drr_raw_bonuslen)
return (SET_ERROR(EINVAL));
} else {
- if (drro->drr_flags != 0 || drro->drr_raw_bonuslen != 0 ||
- drro->drr_indblkshift != 0 || drro->drr_nlevels != 0 ||
- drro->drr_nblkptr != 0)
+ /*
+ * The DRR_OBJECT_SPILL flag is valid only when the DRR_BEGIN
+ * record has indicated support by setting DRR_FLAG_SPILL_BLOCK.
+ */
+ if (((drro->drr_flags & ~(DRR_OBJECT_SPILL))) ||
+ (!rwa->spill && DRR_OBJECT_HAS_SPILL(drro->drr_flags))) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (drro->drr_raw_bonuslen != 0 || drro->drr_nblkptr != 0 ||
+ drro->drr_indblkshift != 0 || drro->drr_nlevels != 0) {
return (SET_ERROR(EINVAL));
+ }
}
err = dmu_object_info(rwa->os, drro->drr_object, &doi);
@@ -1312,7 +1334,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
}
if (object == DMU_NEW_OBJECT) {
- /* currently free, want to be allocated */
+ /* Currently free, wants to be allocated */
err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen,
@@ -1321,11 +1343,19 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
drro->drr_blksz != doi.doi_data_block_size ||
drro->drr_bonustype != doi.doi_bonus_type ||
drro->drr_bonuslen != doi.doi_bonus_size) {
- /* currently allocated, but with different properties */
+ /* Currently allocated, but with different properties */
err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen,
- dn_slots << DNODE_SHIFT, tx);
+ dn_slots << DNODE_SHIFT, rwa->spill ?
+ DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx);
+ } else if (rwa->spill && !DRR_OBJECT_HAS_SPILL(drro->drr_flags)) {
+ /*
+ * Currently allocated, the existing version of this object
+ * may reference a spill block that is no longer allocated
+ * at the source and needs to be freed.
+ */
+ err = dmu_object_rm_spill(rwa->os, drro->drr_object, tx);
}
if (err != 0) {
@@ -1665,6 +1695,17 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
return (SET_ERROR(EINVAL));
+ /*
+ * This is an unmodified spill block which was added to the stream
+ * to resolve an issue with incorrectly removing spill blocks. It
+ * should be ignored by current versions of the code which support
+ * the DRR_FLAG_SPILL_BLOCK flag.
+ */
+ if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) {
+ dmu_return_arcbuf(abuf);
+ return (0);
+ }
+
if (rwa->raw) {
if (!DMU_OT_IS_VALID(drrs->drr_type) ||
drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS ||
@@ -1699,9 +1740,16 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
return (err);
}
- if (db_spill->db_size < drrs->drr_length)
+ /*
+ * Spill blocks may both grow and shrink. When a change in size
+ * occurs any existing dbuf must be updated to match the logical
+ * size of the provided arc_buf_t.
+ */
+ if (db_spill->db_size != drrs->drr_length) {
+ dmu_buf_will_fill(db_spill, tx);
VERIFY(0 == dbuf_spill_set_blksz(db_spill,
drrs->drr_length, tx));
+ }
if (rwa->byteswap && !arc_is_encrypted(abuf) &&
arc_get_compression(abuf) == ZIO_COMPRESS_OFF) {
@@ -2575,6 +2623,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
rwa->byteswap = drc->drc_byteswap;
rwa->resumable = drc->drc_resumable;
rwa->raw = drc->drc_raw;
+ rwa->spill = drc->drc_spill;
rwa->os->os_raw_receive = drc->drc_raw;
(void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc,
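Taken together, the receive_object() changes above reduce spill handling to a three-way decision. Restated in isolation (the enum and helper are illustrative, not part of the commit):

    typedef enum {
            SPILL_HEURISTIC,        /* old stream: keep the pre-patch behavior */
            SPILL_KEEP,             /* keep; may be replaced by a DRR_SPILL */
            SPILL_REMOVE            /* free any spill block left at the target */
    } spill_action_t;

    static spill_action_t
    recv_spill_action(boolean_t stream_spill_flag, uint8_t drr_flags)
    {
            /* Streams without DRR_FLAG_SPILL_BLOCK carry no spill intent. */
            if (!stream_spill_flag)
                    return (SPILL_HEURISTIC);

            /* New streams are definitive: the flag's presence decides. */
            return (DRR_OBJECT_HAS_SPILL(drr_flags) ?
                SPILL_KEEP : SPILL_REMOVE);
    }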
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index ad64d666b..a6ff5ce3e 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -64,6 +64,8 @@ int zfs_send_corrupt_data = B_FALSE;
int zfs_send_queue_length = SPA_MAXBLOCKSIZE;
/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
int zfs_send_set_freerecords_bit = B_TRUE;
+/* Set this tunable to FALSE to disable sending unmodified spill blocks. */
+int zfs_send_unmodified_spill_blocks = B_TRUE;
/*
* Use this to override the recordsize calculation for fast zfs send estimates.
@@ -99,6 +101,8 @@ typedef struct dump_bytes_io {
int dbi_len;
} dump_bytes_io_t;
+static int do_dump(dmu_sendarg_t *dsa, struct send_block_record *data);
+
static void
dump_bytes_cb(void *arg)
{
@@ -436,6 +440,12 @@ dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data)
drrs->drr_length = blksz;
drrs->drr_toguid = dsp->dsa_toguid;
+ /* See comment in dump_dnode() for full details */
+ if (zfs_send_unmodified_spill_blocks &&
+ (bp->blk_birth <= dsp->dsa_fromtxg)) {
+ drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
+ }
+
/* handle raw send fields */
if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
ASSERT(BP_IS_PROTECTED(bp));
@@ -587,6 +597,14 @@ dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object,
}
}
+ /*
+ * DRR_OBJECT_SPILL is set for every dnode which references a
+ * spill block. This allows the receiving pool to definitively
+ * determine when a spill block should be kept or freed.
+ */
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+ drro->drr_flags |= DRR_OBJECT_SPILL;
+
if (dump_record(dsp, DN_BONUS(dnp), bonuslen) != 0)
return (SET_ERROR(EINTR));
@@ -594,8 +612,34 @@ dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object,
if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
(dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
return (SET_ERROR(EINTR));
+
+ /*
+ * Send DRR_SPILL records for unmodified spill blocks. This is useful
+ * because changing certain attributes of the object (e.g. blocksize)
+ * can cause old versions of ZFS to incorrectly remove a spill block.
+ * Including these records in the stream forces an up-to-date version
+ * to always be written, ensuring they're never lost. Current versions
+ * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
+ * ignore these unmodified spill blocks.
+ */
+ if (zfs_send_unmodified_spill_blocks &&
+ (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
+ (DN_SPILL_BLKPTR(dnp)->blk_birth <= dsp->dsa_fromtxg)) {
+ struct send_block_record record;
+
+ bzero(&record, sizeof (struct send_block_record));
+ record.eos_marker = B_FALSE;
+ record.bp = *DN_SPILL_BLKPTR(dnp);
+ SET_BOOKMARK(&(record.zb), dmu_objset_id(dsp->dsa_os),
+ object, 0, DMU_SPILL_BLKID);
+
+ if (do_dump(dsp, &record) != 0)
+ return (SET_ERROR(EINTR));
+ }
+
if (dsp->dsa_err != 0)
return (SET_ERROR(EINTR));
+
return (0);
}
@@ -1036,6 +1080,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
/* raw send implies compressok */
if (compressok || rawok)
featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;
+
if (rawok && os->os_encrypted)
featureflags |= DMU_BACKUP_FEATURE_RAW;
@@ -1064,6 +1109,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
if (zfs_send_set_freerecords_bit)
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
+ drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK;
+
if (ancestor_zb != NULL) {
drr->drr_u.drr_begin.drr_fromguid =
ancestor_zb->zbm_guid;
@@ -1084,6 +1131,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
dsp->dsa_os = os;
dsp->dsa_off = off;
dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+ dsp->dsa_fromtxg = fromtxg;
dsp->dsa_pending_op = PENDING_NONE;
dsp->dsa_featureflags = featureflags;
dsp->dsa_resume_object = resumeobj;
@@ -1552,4 +1600,8 @@ MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data");
module_param(zfs_send_queue_length, int, 0644);
MODULE_PARM_DESC(zfs_send_queue_length, "Maximum send queue length");
+
+module_param(zfs_send_unmodified_spill_blocks, int, 0644);
+MODULE_PARM_DESC(zfs_send_unmodified_spill_blocks,
+ "Send unmodified spill blocks");
#endif
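Both new dmu_send.c hunks guard on the same predicate: the dnode references a spill block and that block predates the incremental source txg. As a standalone helper (illustrative; the commit inlines the test):

    static boolean_t
    spill_is_unmodified(dnode_phys_t *dnp, uint64_t fromtxg)
    {
            /* No spill block at all: nothing to send for compatibility. */
            if (!(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
                    return (B_FALSE);

            /* Born at or before the "from" txg: unchanged in this increment. */
            return (DN_SPILL_BLKPTR(dnp)->blk_birth <= fromtxg);
    }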
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 78a90f68f..38ec646ba 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -660,7 +660,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots,
+ boolean_t keep_spill, dmu_tx_t *tx)
{
int nblkptr;
@@ -710,7 +711,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = bonustype;
if (dn->dn_nblkptr != nblkptr)
dn->dn_next_nblkptr[tx->tx_txg & TXG_MASK] = nblkptr;
- if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
dbuf_rm_spill(dn, tx);
dnode_rm_spill(dn, tx);
}
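Net effect of the dnode.c change: dnode_reallocate() previously freed an existing spill block unconditionally; it now consults the caller. A condensed view of the resulting control flow (sketch, not additional committed code):

    /* Spill disposition in dnode_reallocate() after this patch: */
    if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
            if (keep_spill) {
                    /*
                     * Receiver saw DRR_OBJECT_SPILL: the block survives,
                     * either up to date or soon replaced by a DRR_SPILL.
                     */
            } else {
                    dbuf_rm_spill(dn, tx);  /* drop the cached spill dbuf */
                    dnode_rm_spill(dn, tx); /* clear the spill blkptr */
            }
    }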