diff options
author | Matthew Ahrens <[email protected]> | 2020-06-09 10:41:01 -0700 |
---|---|---|
committer | GitHub <[email protected]> | 2020-06-09 10:41:01 -0700 |
commit | 7bcb7f0840d1857370dd1f9ee0ad48f9b7939dfd (patch) | |
tree | 5582990412f2058fe8b796dbe240205bba027dd0 /module/zfs/dmu_recv.c | |
parent | 6722be2823b5ef39d647e440541806c72b3dbf9b (diff) |
File incorrectly zeroed when receiving incremental stream that toggles -L
Background:
By increasing the recordsize property above the default of 128KB, a
filesystem may have "large" blocks. By default, a send stream of such a
filesystem does not contain large WRITE records; instead, it decreases
objects' block sizes to 128KB and splits the large blocks into 128KB
blocks, allowing the large-block filesystem to be received by a system
that does not support the `large_blocks` feature. A send stream
generated by `zfs send -L` (or `--large-block`) preserves the large
block size on the receiving system, by using large WRITE records.
When receiving an incremental send stream for a filesystem with large
blocks, if the send stream's -L flag was toggled, a bug is encountered
in which the file's contents are incorrectly zeroed out. The contents
of any blocks that were not modified by this send stream will be lost.
"Toggled" means that the previous send used `-L`, but this incremental
does not use `-L` (-L to no-L); or that the previous send did not use
`-L`, but this incremental does use `-L` (no-L to -L).
Changes:
This commit addresses the problem with several changes to the semantics
of zfs send/receive:
1. "-L to no-L" incrementals are rejected. If the previous send used
`-L`, but this incremental does not use `-L`, the `zfs receive` will
fail with this error message:
    incremental send stream requires -L (--large-block), to match
    previous receive.
2. "no-L to -L" incrementals are handled correctly, preserving the
smaller (128KB) block size of any already-received files that used large
blocks on the sending system but were split by `zfs send` without the
`-L` flag.
3. A new send stream format flag is added, `SWITCH_TO_LARGE_BLOCKS`.
This feature indicates that we can correctly handle "no-L to -L"
incrementals. This flag is currently not set on any send streams. In
the future, we intend for incremental send streams of snapshots that
have large blocks to use `-L` by default, and these streams will also
have the `SWITCH_TO_LARGE_BLOCKS` feature set. This ensures that streams
from the default use of `zfs send` won't encounter the bug mentioned
above, because they can't be received by software with the bug.
Implementation notes:
To facilitate accessing the ZPL's generation number,
`zfs_space_delta_cb()` has been renamed to `zpl_get_file_info()` and
restructured to fill in a struct with ZPL-specific info including owner
and generation.
In the "no-L to -L" case, if this is a compressed send stream (from
`zfs send -cL`), large WRITE records that are being written to small
(128KB) blocksize files need to be decompressed so that they can be
split up and written as multiple blocks. The zio pipeline will recompress
each smaller block individually.
A new test case, `send-L_toggle`, is added, which tests the "no-L to -L"
case and verifies that we get an error for the "-L to no-L" case.
Reviewed-by: Paul Dagnelie <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Matthew Ahrens <[email protected]>
Closes #6224
Closes #10383
Diffstat (limited to 'module/zfs/dmu_recv.c')
-rw-r--r-- | module/zfs/dmu_recv.c | 410 |
1 files changed, 321 insertions, 89 deletions
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 29fbe854d..17ebeb682 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -104,6 +104,7 @@ struct receive_writer_arg { boolean_t resumable; boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */ boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */ + boolean_t full; /* this is a full send stream */ uint64_t last_object; uint64_t last_offset; uint64_t max_object; /* highest object ID referenced in stream */ @@ -333,6 +334,21 @@ redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin) return (ret); } +/* + * If we previously received a stream with --large-block, we don't support + * receiving an incremental on top of it without --large-block. This avoids + * forcing a read-modify-write or trying to re-aggregate a string of WRITE + * records. + */ +static int +recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags) +{ + if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_BLOCKS) && + !(featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH)); + return (0); +} + static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid, uint64_t featureflags) @@ -445,6 +461,12 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, return (SET_ERROR(EINVAL)); } + error = recv_check_large_blocks(snap, featureflags); + if (error != 0) { + dsl_dataset_rele(snap, FTAG); + return (error); + } + dsl_dataset_rele(snap, FTAG); } else { /* if full, then must be forced */ @@ -479,7 +501,6 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, } return (0); - } /* @@ -725,6 +746,13 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) } } + error = recv_check_large_blocks(ds, featureflags); + if (error != 0) { + dsl_dataset_rele_flags(origin, dsflags, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (error); + } + dsl_dataset_rele_flags(origin, dsflags, 
FTAG); } @@ -1050,6 +1078,13 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) } } } + + error = recv_check_large_blocks(ds, drc->drc_featureflags); + if (error != 0) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (error); + } + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (0); } @@ -1289,14 +1324,251 @@ save_resume_state(struct receive_writer_arg *rwa, rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; } +static int +receive_object_is_same_generation(objset_t *os, uint64_t object, + dmu_object_type_t old_bonus_type, dmu_object_type_t new_bonus_type, + const void *new_bonus, boolean_t *samegenp) +{ + zfs_file_info_t zoi; + int err; + + dmu_buf_t *old_bonus_dbuf; + err = dmu_bonus_hold(os, object, FTAG, &old_bonus_dbuf); + if (err != 0) + return (err); + err = dmu_get_file_info(os, old_bonus_type, old_bonus_dbuf->db_data, + &zoi); + dmu_buf_rele(old_bonus_dbuf, FTAG); + if (err != 0) + return (err); + uint64_t old_gen = zoi.zfi_generation; + + err = dmu_get_file_info(os, new_bonus_type, new_bonus, &zoi); + if (err != 0) + return (err); + uint64_t new_gen = zoi.zfi_generation; + + *samegenp = (old_gen == new_gen); + return (0); +} + +static int +receive_handle_existing_object(const struct receive_writer_arg *rwa, + const struct drr_object *drro, const dmu_object_info_t *doi, + const void *bonus_data, + uint64_t *object_to_hold, uint32_t *new_blksz) +{ + uint32_t indblksz = drro->drr_indblkshift ? + 1ULL << drro->drr_indblkshift : 0; + int nblkptr = deduce_nblkptr(drro->drr_bonustype, + drro->drr_bonuslen); + uint8_t dn_slots = drro->drr_dn_slots != 0 ? 
+ drro->drr_dn_slots : DNODE_MIN_SLOTS; + boolean_t do_free_range = B_FALSE; + int err; + + *object_to_hold = drro->drr_object; + + /* nblkptr should be bounded by the bonus size and type */ + if (rwa->raw && nblkptr != drro->drr_nblkptr) + return (SET_ERROR(EINVAL)); + + /* + * After the previous send stream, the sending system may + * have freed this object, and then happened to re-allocate + * this object number in a later txg. In this case, we are + * receiving a different logical file, and the block size may + * appear to be different. i.e. we may have a different + * block size for this object than what the send stream says. + * In this case we need to remove the object's contents, + * so that its structure can be changed and then its contents + * entirely replaced by subsequent WRITE records. + * + * If this is a -L (--large-block) incremental stream, and + * the previous stream was not -L, the block size may appear + * to increase. i.e. we may have a smaller block size for + * this object than what the send stream says. In this case + * we need to keep the object's contents and block size + * intact, so that we don't lose parts of the object's + * contents that are not changed by this incremental send + * stream. + * + * We can distinguish between the two above cases by using + * the ZPL's generation number (see + * receive_object_is_same_generation()). However, we only + * want to rely on the generation number when absolutely + * necessary, because with raw receives, the generation is + * encrypted. We also want to minimize dependence on the + * ZPL, so that other types of datasets can also be received + * (e.g. ZVOLs, although note that ZVOLS currently do not + * reallocate their objects or change their structure). + * Therefore, we check a number of different cases where we + * know it is safe to discard the object's contents, before + * using the ZPL's generation number to make the above + * distinction. 
+ */ + if (drro->drr_blksz != doi->doi_data_block_size) { + if (rwa->raw) { + /* + * RAW streams always have large blocks, so + * we are sure that the data is not needed + * due to changing --large-block to be on. + * Which is fortunate since the bonus buffer + * (which contains the ZPL generation) is + * encrypted, and the key might not be + * loaded. + */ + do_free_range = B_TRUE; + } else if (rwa->full) { + /* + * This is a full send stream, so it always + * replaces what we have. Even if the + * generation numbers happen to match, this + * can not actually be the same logical file. + * This is relevant when receiving a full + * send as a clone. + */ + do_free_range = B_TRUE; + } else if (drro->drr_type != + DMU_OT_PLAIN_FILE_CONTENTS || + doi->doi_type != DMU_OT_PLAIN_FILE_CONTENTS) { + /* + * PLAIN_FILE_CONTENTS are the only type of + * objects that have ever been stored with + * large blocks, so we don't need the special + * logic below. ZAP blocks can shrink (when + * there's only one block), so we don't want + * to hit the error below about block size + * only increasing. + */ + do_free_range = B_TRUE; + } else if (doi->doi_max_offset <= + doi->doi_data_block_size) { + /* + * There is only one block. We can free it, + * because its contents will be replaced by a + * WRITE record. This can not be the no-L -> + * -L case, because the no-L case would have + * resulted in multiple blocks. If we + * supported -L -> no-L, it would not be safe + * to free the file's contents. Fortunately, + * that is not allowed (see + * recv_check_large_blocks()). + */ + do_free_range = B_TRUE; + } else { + boolean_t is_same_gen; + err = receive_object_is_same_generation(rwa->os, + drro->drr_object, doi->doi_bonus_type, + drro->drr_bonustype, bonus_data, &is_same_gen); + if (err != 0) + return (SET_ERROR(EINVAL)); + + if (is_same_gen) { + /* + * This is the same logical file, and + * the block size must be increasing. 
+ * It could only decrease if + * --large-block was changed to be + * off, which is checked in + * recv_check_large_blocks(). + */ + if (drro->drr_blksz <= + doi->doi_data_block_size) + return (SET_ERROR(EINVAL)); + /* + * We keep the existing blocksize and + * contents. + */ + *new_blksz = + doi->doi_data_block_size; + } else { + do_free_range = B_TRUE; + } + } + } + + /* nblkptr can only decrease if the object was reallocated */ + if (nblkptr < doi->doi_nblkptr) + do_free_range = B_TRUE; + + /* number of slots can only change on reallocation */ + if (dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) + do_free_range = B_TRUE; + + /* + * For raw sends we also check a few other fields to + * ensure we are preserving the objset structure exactly + * as it was on the receive side: + * - A changed indirect block size + * - A smaller nlevels + */ + if (rwa->raw) { + if (indblksz != doi->doi_metadata_block_size) + do_free_range = B_TRUE; + if (drro->drr_nlevels < doi->doi_indirection) + do_free_range = B_TRUE; + } + + if (do_free_range) { + err = dmu_free_long_range(rwa->os, drro->drr_object, + 0, DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); + } + + /* + * The dmu does not currently support decreasing nlevels + * or changing the number of dnode slots on an object. For + * non-raw sends, this does not matter and the new object + * can just use the previous one's nlevels. For raw sends, + * however, the structure of the received dnode (including + * nlevels and dnode slots) must match that of the send + * side. Therefore, instead of using dmu_object_reclaim(), + * we must free the object completely and call + * dmu_object_claim_dnsize() instead. 
+ */ + if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) || + dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) { + err = dmu_free_long_object(rwa->os, drro->drr_object); + if (err != 0) + return (SET_ERROR(EINVAL)); + + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + *object_to_hold = DMU_NEW_OBJECT; + } + + /* + * For raw receives, free everything beyond the new incoming + * maxblkid. Normally this would be done with a DRR_FREE + * record that would come after this DRR_OBJECT record is + * processed. However, for raw receives we manually set the + * maxblkid from the drr_maxblkid and so we must first free + * everything above that blkid to ensure the DMU is always + * consistent with itself. We will never free the first block + * of the object here because a maxblkid of 0 could indicate + * an object with a single block or one with no blocks. This + * free may be skipped when dmu_free_long_range() was called + * above since it covers the entire object's contents. + */ + if (rwa->raw && *object_to_hold != DMU_NEW_OBJECT && !do_free_range) { + err = dmu_free_long_range(rwa->os, drro->drr_object, + (drro->drr_maxblkid + 1) * doi->doi_data_block_size, + DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); + } + return (0); +} + noinline static int receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, void *data) { dmu_object_info_t doi; dmu_tx_t *tx; - uint64_t object; int err; + uint32_t new_blksz = drro->drr_blksz; uint8_t dn_slots = drro->drr_dn_slots != 0 ? drro->drr_dn_slots : DNODE_MIN_SLOTS; @@ -1360,86 +1632,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, * Raw receives will also check that the indirect structure of the * dnode hasn't changed. */ + uint64_t object_to_hold; if (err == 0) { - uint32_t indblksz = drro->drr_indblkshift ? 
- 1ULL << drro->drr_indblkshift : 0; - int nblkptr = deduce_nblkptr(drro->drr_bonustype, - drro->drr_bonuslen); - boolean_t did_free = B_FALSE; - - object = drro->drr_object; - - /* nblkptr should be bounded by the bonus size and type */ - if (rwa->raw && nblkptr != drro->drr_nblkptr) - return (SET_ERROR(EINVAL)); - - /* - * Check for indicators that the object was freed and - * reallocated. For all sends, these indicators are: - * - A changed block size - * - A smaller nblkptr - * - A changed dnode size - * For raw sends we also check a few other fields to - * ensure we are preserving the objset structure exactly - * as it was on the receive side: - * - A changed indirect block size - * - A smaller nlevels - */ - if (drro->drr_blksz != doi.doi_data_block_size || - nblkptr < doi.doi_nblkptr || - dn_slots != doi.doi_dnodesize >> DNODE_SHIFT || - (rwa->raw && - (indblksz != doi.doi_metadata_block_size || - drro->drr_nlevels < doi.doi_indirection))) { - err = dmu_free_long_range(rwa->os, drro->drr_object, - 0, DMU_OBJECT_END); - if (err != 0) - return (SET_ERROR(EINVAL)); - else - did_free = B_TRUE; - } - - /* - * The dmu does not currently support decreasing nlevels - * or changing the number of dnode slots on an object. For - * non-raw sends, this does not matter and the new object - * can just use the previous one's nlevels. For raw sends, - * however, the structure of the received dnode (including - * nlevels and dnode slots) must match that of the send - * side. Therefore, instead of using dmu_object_reclaim(), - * we must free the object completely and call - * dmu_object_claim_dnsize() instead. 
- */ - if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) || - dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) { - err = dmu_free_long_object(rwa->os, drro->drr_object); - if (err != 0) - return (SET_ERROR(EINVAL)); - - txg_wait_synced(dmu_objset_pool(rwa->os), 0); - object = DMU_NEW_OBJECT; - } - - /* - * For raw receives, free everything beyond the new incoming - * maxblkid. Normally this would be done with a DRR_FREE - * record that would come after this DRR_OBJECT record is - * processed. However, for raw receives we manually set the - * maxblkid from the drr_maxblkid and so we must first free - * everything above that blkid to ensure the DMU is always - * consistent with itself. We will never free the first block - * of the object here because a maxblkid of 0 could indicate - * an object with a single block or one with no blocks. This - * free may be skipped when dmu_free_long_range() was called - * above since it covers the entire object's contents. - */ - if (rwa->raw && object != DMU_NEW_OBJECT && !did_free) { - err = dmu_free_long_range(rwa->os, drro->drr_object, - (drro->drr_maxblkid + 1) * doi.doi_data_block_size, - DMU_OBJECT_END); - if (err != 0) - return (SET_ERROR(EINVAL)); - } + err = receive_handle_existing_object(rwa, drro, &doi, data, + &object_to_hold, &new_blksz); } else if (err == EEXIST) { /* * The object requested is currently an interior slot of a @@ -1454,10 +1650,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, return (SET_ERROR(EINVAL)); /* object was freed and we are about to allocate a new one */ - object = DMU_NEW_OBJECT; + object_to_hold = DMU_NEW_OBJECT; } else { /* object is free and we are about to allocate a new one */ - object = DMU_NEW_OBJECT; + object_to_hold = DMU_NEW_OBJECT; } /* @@ -1492,27 +1688,27 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, } tx = dmu_tx_create(rwa->os); - dmu_tx_hold_bonus(tx, object); - dmu_tx_hold_write(tx, object, 0, 0); + 
dmu_tx_hold_bonus(tx, object_to_hold); + dmu_tx_hold_write(tx, object_to_hold, 0, 0); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } - if (object == DMU_NEW_OBJECT) { + if (object_to_hold == DMU_NEW_OBJECT) { /* Currently free, wants to be allocated */ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, - drro->drr_type, drro->drr_blksz, + drro->drr_type, new_blksz, drro->drr_bonustype, drro->drr_bonuslen, dn_slots << DNODE_SHIFT, tx); } else if (drro->drr_type != doi.doi_type || - drro->drr_blksz != doi.doi_data_block_size || + new_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* Currently allocated, but with different properties */ err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object, - drro->drr_type, drro->drr_blksz, + drro->drr_type, new_blksz, drro->drr_bonustype, drro->drr_bonuslen, dn_slots << DNODE_SHIFT, rwa->spill ? DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx); @@ -1578,6 +1774,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, * For non-new objects block size and indirect block * shift cannot change and nlevels can only increase. */ + ASSERT3U(new_blksz, ==, drro->drr_blksz); VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object, drro->drr_blksz, drro->drr_indblkshift, tx)); VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object, @@ -1707,6 +1904,40 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) DRR_WRITE_PAYLOAD_SIZE(drrw)); } + /* + * If we are receiving an incremental large-block stream into + * a dataset that previously did a non-large-block receive, + * the WRITE record may be larger than the object's block + * size. dmu_assign_arcbuf_by_dnode() handles this as long + * as the arcbuf is not compressed, so decompress it here if + * necessary. 
+ */ + if (drrw->drr_logical_size != dn->dn_datablksz && + arc_get_compression(abuf) != ZIO_COMPRESS_OFF) { + ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz); + zbookmark_phys_t zb = { + .zb_objset = dmu_objset_id(rwa->os), + .zb_object = rwa->last_object, + .zb_level = 0, + .zb_blkid = + drrw->drr_offset >> dn->dn_datablkshift, + }; + + /* + * The size of loaned arc bufs is counted in + * arc_loaned_bytes. When we untransform + * (decompress) the buf, its size increases. To + * ensure that arc_loaned_bytes remains accurate, we + * need to return (un-loan) the buf (with its + * compressed size) and then re-loan it (with its + * new, uncompressed size). + */ + arc_return_buf(abuf, FTAG); + VERIFY0(arc_untransform(abuf, dmu_objset_spa(rwa->os), + &zb, B_FALSE)); + arc_loan_inuse_buf(abuf, FTAG); + } + err = dmu_assign_arcbuf_by_dnode(dn, drrw->drr_offset, abuf, tx); if (err != 0) { @@ -2710,6 +2941,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) rwa->resumable = drc->drc_resumable; rwa->raw = drc->drc_raw; rwa->spill = drc->drc_spill; + rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0); rwa->os->os_raw_receive = drc->drc_raw; list_create(&rwa->write_batch, sizeof (struct receive_record_arg), offsetof(struct receive_record_arg, node.bqn_node)); |