author		Mike Gerdts <[email protected]>	2019-06-30 23:38:07 +0000
committer	Brian Behlendorf <[email protected]>	2019-07-05 15:35:15 -0700
commit		341166c8435f54936cdea366d096b1f5556292c1 (patch)
tree		1b353d7f68cefe36a1c08655120a3eb78f86a849 /tests
parent		6dbca94f0c0276108748487bbfbaa7140ebb000b (diff)
OpenZFS 9318 - vol_volsize_to_reservation does not account for raidz skip blocks
When a volume is created in a pool with raidz vdevs and
volblocksize != 128k, the volume can reference more space than is
reserved with the automatically calculated refreservation. There
are two deficiencies in vol_volsize_to_reservation that contribute
to this:
1) Skip blocks may be added to keep each allocation a multiple
of parity + 1. This is the dominating factor when volblocksize
is close to 2^ashift.
2) raidz deflation for 128 KB blocks is different than for most
other block sizes.
See "The theory of raidz space accounting" comment in
libzfs_dataset.c for a full explanation.
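To make deficiency 1 concrete, here is a small ksh sketch of the accounting
described above. It is an illustration only, not the libzfs implementation;
the function name is made up. It estimates how many 2^ashift sectors one
volblocksize block consumes on a raidz vdev, including parity and skip
sectors:

	# Illustrative sketch only -- not the libzfs code.
	function raidz_alloc_sectors {
		typeset parity=$1 ndisks=$2 ashift=$3 vbs=$4
		typeset data rows total mult
		data=$(( (vbs + (1 << ashift) - 1) >> ashift ))
		# one run of parity sectors per row of (ndisks - parity)
		# data sectors
		rows=$(( (data + ndisks - parity - 1) / (ndisks - parity) ))
		total=$(( data + rows * parity ))
		# skip blocks pad each allocation to a multiple of parity + 1
		mult=$(( parity + 1 ))
		echo $(( (total + mult - 1) / mult * mult ))
	}
	# raidz1, 3 disks, ashift=12: an 8k block takes 2 data + 1 parity
	# sectors, padded to 4 (100% overhead); a 128k block takes 48
	# sectors for 32 of data (50% overhead).
	raidz_alloc_sectors 1 3 12 8192		# prints 4
	raidz_alloc_sectors 1 3 12 131072	# prints 48

A refreservation derived only from the 128 KB deflation ratio therefore
undercounts small-block volumes, which is what the new tests below measure.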
Authored by: Mike Gerdts <[email protected]>
Reviewed by: Richard Elling <[email protected]>
Reviewed by: Sanjay Nadkarni <[email protected]>
Reviewed by: Jerry Jelinek <[email protected]>
Reviewed by: Matt Ahrens <[email protected]>
Reviewed by: Kody Kantor <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Approved by: Dan McDonald <[email protected]>
Ported-by: Mike Gerdts <[email protected]>
Porting Notes:
* ZTS: wait for zvols to exist before writing
* ZTS: use log_must_busy with {zpool|zfs} destroy
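Both notes correspond to patterns visible in the scripts below. In
isolation, using the ZTS helpers from libtest.shlib and the $TESTPOOL and
$TESTVOL names the tests use, the pattern is:

	# Wait for udev to create the zvol device node before writing to it.
	block_device_wait "/dev/zvol/$TESTPOOL/$TESTVOL"
	# Retry the destroy if the dataset is transiently busy.
	log_must_busy zfs destroy "$TESTPOOL/$TESTVOL"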
OpenZFS-issue: https://www.illumos.org/issues/9318
OpenZFS-commit: https://github.com/illumos/illumos-gate/commit/b73ccab0
Closes #8973
Diffstat (limited to 'tests')
4 files changed, 332 insertions, 2 deletions
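The symptom the new tests guard against can also be reproduced by hand. A
hypothetical sequence mirroring refreserv_raidz.ksh (the pool name and the
disks sdb/sdc/sdd are placeholders; an ashift=12 raidz1 pool shows the
effect most clearly):

	# Hypothetical repro; tank, sdb, sdc, sdd are placeholders.
	zpool create tank raidz1 sdb sdc sdd
	zfs create -V 100m -o volblocksize=8k tank/vol
	dd if=/dev/zero of=/dev/zvol/tank/vol bs=1024k count=100
	sync
	zfs get -Hp referenced,refreservation tank/vol

Before this change, referenced could exceed refreservation on such a pool;
afterwards, refreservation is at least as large as referenced.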
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 5d33b2058..d8b2a0b42 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -756,7 +756,8 @@ tags = ['functional', 'refquota']
 
 [tests/functional/refreserv]
 tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
-    'refreserv_004_pos', 'refreserv_005_pos']
+    'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz',
+    'refreserv_raidz']
 tags = ['functional', 'refreserv']
 
 [tests/functional/removal]
diff --git a/tests/zfs-tests/tests/functional/refreserv/Makefile.am b/tests/zfs-tests/tests/functional/refreserv/Makefile.am
index 96f25d444..bd760a1f0 100644
--- a/tests/zfs-tests/tests/functional/refreserv/Makefile.am
+++ b/tests/zfs-tests/tests/functional/refreserv/Makefile.am
@@ -6,7 +6,9 @@ dist_pkgdata_SCRIPTS = \
 	refreserv_002_pos.ksh \
 	refreserv_003_pos.ksh \
 	refreserv_004_pos.ksh \
-	refreserv_005_pos.ksh
+	refreserv_005_pos.ksh \
+	refreserv_multi_raidz.ksh \
+	refreserv_raidz.ksh
 
 dist_pkgdata_DATA = \
 	refreserv.cfg
diff --git a/tests/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh b/tests/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh
new file mode 100755
index 000000000..803e391c9
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh
@@ -0,0 +1,197 @@
+#!/bin/ksh -p
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/refreserv/refreserv.cfg
+
+#
+# DESCRIPTION:
+#	raidz refreservation=auto picks worst raidz vdev
+#
+# STRATEGY:
+#	1. Create a pool with a single raidz vdev
+#	2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k]
+#	    - create a volume
+#	    - remember its refreservation
+#	    - destroy the volume
+#	3. Destroy the pool
+#	4. Recreate the pool with one more disk in the vdev, then repeat
+#	   steps 2 and 3.
+#
+# NOTES:
+#	1. This test will use up to 14 disks but can cover the key concepts
+#	   with 5 disks.
+#	2. If the disks are a mixture of 4Kn and 512n/512e, failures are
+#	   likely.
+#
+
+verify_runnable "global"
+
+typeset -a alldisks=($DISKS)
+
+# The larger the volsize, the better zvol_volsize_to_reservation() is at
+# guessing the right number - though it is horrible with tiny blocks. At
+# 10M on ashift=12, the estimate may be over 26% too high.
+volsize=100
+
+function cleanup
+{
+	default_cleanup_noexit
+	default_setup_noexit "${alldisks[0]}"
+}
+
+log_assert "raidz refreservation=auto picks worst raidz vdev"
+log_onexit cleanup
+
+poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL"
+
+# Testing tiny block sizes on ashift=12 pools causes so much size inflation
+# that small test disks may fill before creating small volumes. However,
+# testing 512b and 1K blocks on ashift=9 pools is an ok approximation for
+# testing the problems that arise from 4K and 8K blocks on ashift=12 pools.
+bps=$(lsblk -nrdo min-io /dev/${alldisks[0]})
+case "$bps" in
+512)
+	allshifts=(9 10 17)
+	;;
+4096)
+	allshifts=(12 13 17)
+	;;
+*)
+	log_fail "bytes/sector: $bps != (512|4096)"
+	;;
+esac
+log_note "Testing in ashift=${allshifts[0]} mode"
+
+typeset -A sizes=
+
+#
+# Determine the refreservation for a $volsize MiB volume on each raidz
+# type at various block sizes.
+#
+for parity in 1 2 3; do
+	raid=raidz$parity
+	typeset -A sizes["$raid"]
+
+	# Ensure we hit scenarios with and without skip blocks
+	for ndisks in $((parity * 2)) $((parity * 2 + 1)); do
+		typeset -a disks=(${alldisks[0..$((ndisks - 1))]})
+
+		if (( ${#disks[@]} < ndisks )); then
+			log_note "Too few disks to test $raid-$ndisks"
+			continue
+		fi
+
+		typeset -A sizes["$raid"]["$ndisks"]
+
+		log_must zpool create "$TESTPOOL" "$raid" "${disks[@]}"
+
+		for bits in "${allshifts[@]}"; do
+			vbs=$((1 << bits))
+			log_note "Gathering refreservation for $raid-$ndisks" \
+			    "volblocksize=$vbs"
+
+			vol=$TESTPOOL/$TESTVOL
+			log_must zfs create -V ${volsize}m \
+			    -o volblocksize=$vbs "$vol"
+
+			refres=$(zfs get -Hpo value refreservation "$vol")
+			log_must test -n "$refres"
+			sizes["$raid"]["$ndisks"]["$vbs"]=$refres
+
+			log_must_busy zfs destroy "$vol"
+		done
+
+		log_must_busy zpool destroy "$TESTPOOL"
+	done
+done
+
+# A little extra info is always helpful when diagnosing problems. To
+# pretty-print what you find in the log, do this in ksh:
+#	typeset -A sizes=(...)
+#	print -v sizes
+log_note "sizes=$(print -C sizes)"
+
+#
+# Helper function for checking that refreservation is calculated properly
+# in multi-vdev pools. "Properly" is defined as assuming that all vdevs
+# are as space inefficient as the worst one.
+#
+function check_vdevs {
+	typeset raid=$1
+	typeset nd1=$2
+	typeset nd2=$3
+	typeset -a disks1 disks2
+	typeset vbs vol refres refres1 refres2 expect
+
+	disks1=(${alldisks[0..$((nd1 - 1))]})
+	disks2=(${alldisks[$nd1..$((nd1 + nd2 - 1))]})
+	if (( ${#disks2[@]} < nd2 )); then
+		log_note "Too few disks to test $raid-$nd1 + $raid-$nd2"
+		return
+	fi
+
+	log_must zpool create -f "$TESTPOOL" \
+	    "$raid" "${disks1[@]}" "$raid" "${disks2[@]}"
+
+	for bits in "${allshifts[@]}"; do
+		vbs=$((1 << bits))
+		log_note "Verifying $raid-$nd1 $raid-$nd2 volblocksize=$vbs"
+
+		vol=$TESTPOOL/$TESTVOL
+		log_must zfs create -V ${volsize}m -o volblocksize=$vbs "$vol"
+		refres=$(zfs get -Hpo value refreservation "$vol")
+		log_must test -n "$refres"
+
+		refres1=${sizes["$raid"]["$nd1"]["$vbs"]}
+		refres2=${sizes["$raid"]["$nd2"]["$vbs"]}
+
+		if (( refres1 > refres2 )); then
+			log_note "Expecting refres ($refres) to match refres" \
+			    "from $raid-$nd1 ($refres1)"
+			log_must test "$refres" -eq "$refres1"
+		else
+			log_note "Expecting refres ($refres) to match refres" \
+			    "from $raid-$nd2 ($refres2)"
+			log_must test "$refres" -eq "$refres2"
+		fi
+
+		log_must zfs destroy "$vol"
+	done
+
+	log_must zpool destroy "$TESTPOOL"
+}
+
+#
+# Verify that multi-vdev pools use the least optimistic size for all the
+# permutations within a particular raidz variant.
+#
+for raid in "${!sizes[@]}"; do
+	# ksh likes to create a [0] item for us. Thanks, ksh!
+	[[ $raid == "0" ]] && continue
+
+	for nd1 in "${!sizes["$raid"][@]}"; do
+		# And with an empty array we get one key, ''. Thanks, ksh!
+ [[ $nd1 == "0" || -z "$nd1" ]] && continue + + for nd2 in "${!sizes["$raid"][@]}"; do + [[ $nd2 == "0" || -z "$nd2" ]] && continue + + check_vdevs "$raid" "$nd1" "$nd2" + done + done +done + +log_pass "raidz refreservation=auto picks worst raidz vdev" diff --git a/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh b/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh new file mode 100755 index 000000000..7b1f84afe --- /dev/null +++ b/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh @@ -0,0 +1,130 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/refreserv/refreserv.cfg + +# +# DESCRIPTION: +# raidz refreservation=auto accounts for extra parity and skip blocks +# +# STRATEGY: +# 1. Create a pool with a single raidz vdev +# 2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k] +# - create a volume +# - fully overwrite it +# - verify that referenced is less than or equal to reservation +# - destroy the volume +# 3. Destroy the pool +# 4. Recreate the pool with one more disk in the vdev, then repeat steps +# 2 and 3. +# 5. Repeat all steps above for raidz2 and raidz3. +# +# NOTES: +# 1. This test will use up to 14 disks but can cover the key concepts with +# 5 disks. +# 2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely. +# + +verify_runnable "global" + +typeset -a alldisks=($DISKS) + +# The larger the volsize, the better zvol_volsize_to_reservation() is at +# guessing the right number. At 10M on ashift=12, the estimate may be over 26% +# too high. +volsize=100 + +function cleanup +{ + default_cleanup_noexit + default_setup_noexit "${alldisks[0]}" +} + +log_assert "raidz refreservation=auto accounts for extra parity and skip blocks" +log_onexit cleanup + +poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + +# Testing tiny block sizes on ashift=12 pools causes so much size inflation +# that small test disks may fill before creating small volumes. However, +# testing 512b and 1K blocks on ashift=9 pools is an ok approximation for +# testing the problems that arise from 4K and 8K blocks on ashift=12 pools. 
+bps=$(lsblk -nrdo min-io /dev/${alldisks[0]})
+log_must test "$bps" -eq 512 -o "$bps" -eq 4096
+case "$bps" in
+512)
+	allshifts=(9 10 17)
+	maxpct=151
+	;;
+4096)
+	allshifts=(12 13 17)
+	maxpct=110
+	;;
+*)
+	log_fail "bytes/sector: $bps != (512|4096)"
+	;;
+esac
+log_note "Testing in ashift=${allshifts[0]} mode"
+
+# This loop handles all iterations of steps 1 through 4 described in the
+# strategy comment above.
+for parity in 1 2 3; do
+	raid=raidz$parity
+
+	# Ensure we hit scenarios with and without skip blocks
+	for ndisks in $((parity * 2)) $((parity * 2 + 1)); do
+		typeset -a disks=(${alldisks[0..$((ndisks - 1))]})
+
+		if (( ${#disks[@]} < ndisks )); then
+			log_note "Too few disks to test $raid-$ndisks"
+			continue
+		fi
+
+		log_must zpool create "$TESTPOOL" "$raid" "${disks[@]}"
+
+		for bits in "${allshifts[@]}"; do
+			vbs=$((1 << bits))
+			log_note "Testing $raid-$ndisks volblocksize=$vbs"
+
+			vol=$TESTPOOL/$TESTVOL
+			log_must zfs create -V ${volsize}m \
+			    -o volblocksize=$vbs "$vol"
+			block_device_wait "/dev/zvol/$vol"
+			log_must dd if=/dev/zero of=/dev/zvol/$vol \
+			    bs=1024k count=$volsize
+			sync
+
+			ref=$(zfs get -Hpo value referenced "$vol")
+			refres=$(zfs get -Hpo value refreservation "$vol")
+			log_must test -n "$ref"
+			log_must test -n "$refres"
+
+			typeset -F2 deltapct=$((refres * 100.0 / ref))
+			log_note "$raid-$ndisks refreservation $refres" \
+			    "is $deltapct% of referenced $ref"
+
+			log_must test "$ref" -le "$refres"
+			log_must test "$deltapct" -le $maxpct
+
+			log_must_busy zfs destroy "$vol"
+		done
+
+		log_must_busy zpool destroy "$TESTPOOL"
+	done
+done
+
+log_pass "raidz refreservation=auto accounts for extra parity and skip blocks"
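The new scripts run under the ZFS Test Suite like the other refreserv
tests. One plausible invocation from an OpenZFS source checkout (exact
paths and flags vary by installation) is:

	# Plausible invocation; adjust the path for your installation.
	./scripts/zfs-tests.sh -T refreserv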