diff options
author | Matthew Ahrens <[email protected]> | 2019-06-13 13:12:39 -0700 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2019-06-13 13:12:39 -0700 |
commit | 53dce5acc652800fcfca1b83e22a00c5e4fc9e87 (patch) | |
tree | 209821821bba809af812e0ad209ce3a600fc1067 /module/zfs/vdev_removal.c | |
parent | be89734a29fda5a0f5780d953789fb7e91b2a529 (diff) |
panic in removal_remap test on 4K devices
If the zfs_remove_max_segment tunable is changed to be not a multiple of
the sector size, then the device removal code will malfunction and try
to create mappings that are smaller than one sector, leading to a panic.
On debug bits this assertion will fail in spa_vdev_copy_segment():
ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
On nondebug, the system panics with a stack like:
metaslab_free_concrete()
metaslab_free_impl()
metaslab_free_impl_cb()
vdev_indirect_remap()
free_from_removing_vdev()
metaslab_free_impl()
metaslab_free_dva()
metaslab_free()
Fortunately, the default for zfs_remove_max_segment is 1MB, so this
can't occur by default. We hit it during this test because
removal_remap.ksh changes zfs_remove_max_segment to 1KB. When testing on
4KB-sector disks, we hit the bug.
This change makes the zfs_remove_max_segment tunable more robust,
automatically rounding it up to a multiple of the sector size. We also
turn some key assertions into VERIFY's so that similar bugs would be
caught before they are encoded on disk (and thus avoid a
panic-reboot-loop).
Reviewed-by: Sean Eric Fagan <[email protected]>
Reviewed-by: Pavel Zakharov <[email protected]>
Reviewed-by: Serapheim Dimitropoulos <[email protected]>
Reviewed-by: Sebastien Roy <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Matthew Ahrens <[email protected]>
External-issue: DLPX-61342
Closes #8893
Diffstat (limited to 'module/zfs/vdev_removal.c')
-rw-r--r-- | module/zfs/vdev_removal.c | 33 |
1 files changed, 26 insertions, 7 deletions
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 536a982ec..6f64edd8c 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -100,6 +100,8 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If * there is a performance problem with attempting to allocate large blocks, * consider decreasing this. + * + * See also the accessor function spa_remove_max_segment(). */ int zfs_remove_max_segment = SPA_MAXBLOCKSIZE; @@ -951,8 +953,10 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, vdev_indirect_mapping_entry_t *entry; dva_t dst = {{ 0 }}; uint64_t start = range_tree_min(segs); + ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift)); ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); + ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift)); uint64_t size = range_tree_span(segs); if (range_tree_span(segs) > maxalloc) { @@ -983,6 +987,7 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, } } ASSERT3U(size, <=, maxalloc); + ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift)); /* * An allocation class might not have any remaining vdevs or space @@ -1026,11 +1031,11 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, /* * We can't have any padding of the allocated size, otherwise we will - * misunderstand what's allocated, and the size of the mapping. - * The caller ensures this will be true by passing in a size that is - * aligned to the worst (highest) ashift in the pool. + * misunderstand what's allocated, and the size of the mapping. We + * prevent padding by ensuring that all devices in the pool have the + * same ashift, and the allocation size is a multiple of the ashift. */ - ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); + VERIFY3U(DVA_GET_ASIZE(&dst), ==, size); entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); @@ -1364,6 +1369,20 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, } /* + * The size of each removal mapping is limited by the tunable + * zfs_remove_max_segment, but we must adjust this to be a multiple of the + * pool's ashift, so that we don't try to split individual sectors regardless + * of the tunable value. (Note that device removal requires that all devices + * have the same ashift, so there's no difference between spa_min_ashift and + * spa_max_ashift.) The raw tunable should not be used elsewhere. + */ +uint64_t +spa_remove_max_segment(spa_t *spa) +{ + return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift)); +} + +/* * The removal thread operates in open context. It iterates over all * allocated space in the vdev, by loading each metaslab's spacemap. * For each contiguous segment of allocated space (capping the segment @@ -1385,7 +1404,7 @@ spa_vdev_remove_thread(void *arg) spa_t *spa = arg; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_copy_arg_t vca; - uint64_t max_alloc = zfs_remove_max_segment; + uint64_t max_alloc = spa_remove_max_segment(spa); uint64_t last_txg = 0; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -1511,7 +1530,7 @@ spa_vdev_remove_thread(void *arg) vd = vdev_lookup_top(spa, svr->svr_vdev_id); if (txg != last_txg) - max_alloc = zfs_remove_max_segment; + max_alloc = spa_remove_max_segment(spa); last_txg = txg; spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); |