summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoradilger <[email protected]>2020-06-18 12:22:11 -0600
committerGitHub <[email protected]>2020-06-18 11:22:11 -0700
commitf734301d2267cbb33eaffbca195fc93f1dae7b74 (patch)
tree9cca50f0352df34f2ccf7142e09be836b46c9565
parentd553fb9b9e18e04d1c85bf1930bcf0f46757d32e (diff)
linux: add basic fallocate(mode=0/2) compatibility
Implement semi-compatible functionality for mode=0 (preallocation) and mode=FALLOC_FL_KEEP_SIZE (preallocation beyond EOF) for ZPL. Since ZFS does COW and snapshots, preallocating blocks for a file cannot guarantee that writes to the file will not run out of space. Even if the first overwrite was guaranteed, it would not handle any later overwrite of blocks due to COW, so strict compliance is futile. Instead, make a best-effort check that at least enough free space is currently available in the pool (with a bit of margin), then create a sparse file of the requested size and continue on with life. This does not handle all cases (e.g. several fallocate() calls before writing into the files when the filesystem is nearly full), which would require a more complex mechanism to be implemented, probably based on a modified version of dmu_prealloc(), but is usable as-is. A new module option zfs_fallocate_reserve_percent is used to control the reserve margin for any single fallocate call. By default, this is 110% of the requested preallocation size, so an additional 10% of available space is reserved for overhead to allow the application a good chance of finishing the write when the fallocate() succeeds. If the heuristics of this basic fallocate implementation are not desirable, the old non-functional behavior of returning EOPNOTSUPP for calls can be restored by setting zfs_fallocate_reserve_percent=0. The parameter of zfs_statvfs() is changed to take an inode instead of a dentry, since no dentry is available in zfs_fallocate_common(). A few tests from @behlendorf cover basic fallocate functionality. Reviewed-by: Richard Laager <[email protected]> Reviewed-by: Arshad Hussain <[email protected]> Reviewed-by: Matthew Ahrens <[email protected]> Co-authored-by: Brian Behlendorf <[email protected]> Signed-off-by: Andreas Dilger <[email protected]> Issue #326 Closes #10408
-rw-r--r--configure.ac1
-rw-r--r--include/os/linux/zfs/sys/zfs_vfsops.h2
-rw-r--r--man/man5/zfs-module-parameters.519
-rw-r--r--module/os/linux/zfs/zfs_vfsops.c6
-rw-r--r--module/os/linux/zfs/zpl_file.c83
-rw-r--r--module/os/linux/zfs/zpl_super.c2
-rw-r--r--tests/runfiles/linux.run4
-rw-r--r--tests/zfs-tests/tests/functional/Makefile.am1
-rw-r--r--tests/zfs-tests/tests/functional/fallocate/Makefile.am6
-rwxr-xr-xtests/zfs-tests/tests/functional/fallocate/cleanup.ksh27
-rwxr-xr-xtests/zfs-tests/tests/functional/fallocate/fallocate_prealloc.ksh63
-rwxr-xr-xtests/zfs-tests/tests/functional/fallocate/fallocate_punch-hole.ksh97
-rwxr-xr-xtests/zfs-tests/tests/functional/fallocate/setup.ksh29
13 files changed, 317 insertions, 23 deletions
diff --git a/configure.ac b/configure.ac
index 867c3351d..79246833d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -325,6 +325,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/devices/Makefile
tests/zfs-tests/tests/functional/events/Makefile
tests/zfs-tests/tests/functional/exec/Makefile
+ tests/zfs-tests/tests/functional/fallocate/Makefile
tests/zfs-tests/tests/functional/fault/Makefile
tests/zfs-tests/tests/functional/features/Makefile
tests/zfs-tests/tests/functional/features/async_destroy/Makefile
diff --git a/include/os/linux/zfs/sys/zfs_vfsops.h b/include/os/linux/zfs/sys/zfs_vfsops.h
index 0cc659918..4e60a081a 100644
--- a/include/os/linux/zfs/sys/zfs_vfsops.h
+++ b/include/os/linux/zfs/sys/zfs_vfsops.h
@@ -210,7 +210,7 @@ extern int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent);
extern void zfs_preumount(struct super_block *sb);
extern int zfs_umount(struct super_block *sb);
extern int zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm);
-extern int zfs_statvfs(struct dentry *dentry, struct kstatfs *statp);
+extern int zfs_statvfs(struct inode *ip, struct kstatfs *statp);
extern int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp);
extern int zfs_prune(struct super_block *sb, unsigned long nr_to_scan,
int *objects);
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 1ab592390..687b85d0b 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1661,6 +1661,25 @@ Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR.
.sp
.ne 2
.na
+\fBzfs_fallocate_reserve_percent\fR (uint)
+.ad
+.RS 12n
+Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be
+preallocated for a file in order to guarantee that later writes will not
+run out of space. Instead, fallocate() space preallocation only checks
+that sufficient space is currently available in the pool or the user's
+project quota allocation, and then creates a sparse file of the requested
+size. The requested space is multiplied by \fBzfs_fallocate_reserve_percent\fR
+to allow additional space for indirect blocks and other internal metadata.
+Setting this value to 0 disables support for fallocate(2) and returns
+EOPNOTSUPP for fallocate() space preallocation again.
+.sp
+Default value: \fB110\fR%
+.RE
+
+.sp
+.ne 2
+.na
\fBzfs_fletcher_4_impl\fR (string)
.ad
.RS 12n
diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
index ea5971b0c..9561960bc 100644
--- a/module/os/linux/zfs/zfs_vfsops.c
+++ b/module/os/linux/zfs/zfs_vfsops.c
@@ -1088,9 +1088,9 @@ objs:
}
int
-zfs_statvfs(struct dentry *dentry, struct kstatfs *statp)
+zfs_statvfs(struct inode *ip, struct kstatfs *statp)
{
- zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
uint64_t refdbytes, availbytes, usedobjs, availobjs;
int err = 0;
@@ -1148,7 +1148,7 @@ zfs_statvfs(struct dentry *dentry, struct kstatfs *statp)
if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
dmu_objset_projectquota_present(zfsvfs->z_os)) {
- znode_t *zp = ITOZ(dentry->d_inode);
+ znode_t *zp = ITOZ(ip);
if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid &&
zpl_is_valid_projid(zp->z_projid))
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index 0fad63a4f..c26ed5d09 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -34,6 +34,11 @@
#include <sys/zfs_vnops.h>
#include <sys/zfs_project.h>
+/*
+ * When using fallocate(2) to preallocate space, inflate the requested
+ * capacity check by 10% to account for the required metadata blocks.
+ */
+unsigned int zfs_fallocate_reserve_percent = 110;
static int
zpl_open(struct inode *ip, struct file *filp)
@@ -721,20 +726,23 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc)
}
/*
- * The only flag combination which matches the behavior of zfs_space()
- * is FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
+ * The flag combination which matches the behavior of zfs_space() is
+ * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
* flag was introduced in the 2.6.38 kernel.
+ *
+ * The original mode=0 (allocate space) behavior can be reasonably emulated
+ * by checking if enough space exists and creating a sparse file, as real
+ * persistent space reservation is not possible due to COW, snapshots, etc.
*/
static long
zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
{
cred_t *cr = CRED();
- flock64_t bf;
loff_t olen;
fstrans_cookie_t cookie;
- int error;
+ int error = 0;
- if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if ((mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) != 0)
return (-EOPNOTSUPP);
if (offset < 0 || len <= 0)
@@ -743,21 +751,54 @@ zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
spl_inode_lock(ip);
olen = i_size_read(ip);
- if (offset > olen) {
- spl_inode_unlock(ip);
- return (0);
- }
- if (offset + len > olen)
- len = olen - offset;
- bf.l_type = F_WRLCK;
- bf.l_whence = SEEK_SET;
- bf.l_start = offset;
- bf.l_len = len;
- bf.l_pid = 0;
-
crhold(cr);
cookie = spl_fstrans_mark();
- error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ flock64_t bf;
+
+ if (offset > olen)
+ goto out_unmark;
+
+ if (offset + len > olen)
+ len = olen - offset;
+ bf.l_type = F_WRLCK;
+ bf.l_whence = SEEK_SET;
+ bf.l_start = offset;
+ bf.l_len = len;
+ bf.l_pid = 0;
+
+ error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
+ } else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
+ unsigned int percent = zfs_fallocate_reserve_percent;
+ struct kstatfs statfs;
+
+ /* Legacy mode, disable fallocate compatibility. */
+ if (percent == 0) {
+ error = -EOPNOTSUPP;
+ goto out_unmark;
+ }
+
+ /*
+ * Use zfs_statvfs() instead of dmu_objset_space() since it
+ * also checks project quota limits, which are relevant here.
+ */
+ error = zfs_statvfs(ip, &statfs);
+ if (error)
+ goto out_unmark;
+
+ /*
+ * Shrink available space a bit to account for overhead/races.
+ * We know the product previously fit into availbytes from
+ * dmu_objset_space(), so the smaller product will also fit.
+ */
+ if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
+ error = -ENOSPC;
+ goto out_unmark;
+ }
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
+ error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
+ }
+out_unmark:
spl_fstrans_unmark(cookie);
spl_inode_unlock(ip);
@@ -1030,3 +1071,9 @@ const struct file_operations zpl_dir_file_operations = {
.compat_ioctl = zpl_compat_ioctl,
#endif
};
+
+/* BEGIN CSTYLED */
+module_param(zfs_fallocate_reserve_percent, uint, 0644);
+MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
+ "Percentage of length to use for the available capacity check");
+/* END CSTYLED */
diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c
index 08cf75862..75adff517 100644
--- a/module/os/linux/zfs/zpl_super.c
+++ b/module/os/linux/zfs/zpl_super.c
@@ -138,7 +138,7 @@ zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
int error;
cookie = spl_fstrans_mark();
- error = -zfs_statvfs(dentry, statp);
+ error = -zfs_statvfs(dentry->d_inode, statp);
spl_fstrans_unmark(cookie);
ASSERT3S(error, <=, 0);
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 897a6a955..a800e6bb8 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -94,6 +94,10 @@ tags = ['functional', 'devices']
tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter']
tags = ['functional', 'events']
+[tests/functional/fallocate:Linux]
+tests = ['fallocate_prealloc', 'fallocate_punch-hole']
+tags = ['functional', 'fallocate']
+
[tests/functional/fault:Linux]
tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_replace_001_pos',
'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_multiple',
diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am
index 776222f08..2df78d260 100644
--- a/tests/zfs-tests/tests/functional/Makefile.am
+++ b/tests/zfs-tests/tests/functional/Makefile.am
@@ -22,6 +22,7 @@ SUBDIRS = \
devices \
events \
exec \
+ fallocate \
fault \
features \
grow \
diff --git a/tests/zfs-tests/tests/functional/fallocate/Makefile.am b/tests/zfs-tests/tests/functional/fallocate/Makefile.am
new file mode 100644
index 000000000..5ff366d24
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/fallocate/Makefile.am
@@ -0,0 +1,6 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/fallocate
+dist_pkgdata_SCRIPTS = \
+ setup.ksh \
+ cleanup.ksh \
+ fallocate_prealloc.ksh \
+ fallocate_punch-hole.ksh
diff --git a/tests/zfs-tests/tests/functional/fallocate/cleanup.ksh b/tests/zfs-tests/tests/functional/fallocate/cleanup.ksh
new file mode 100755
index 000000000..bdfa61471
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/fallocate/cleanup.ksh
@@ -0,0 +1,27 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/fallocate/fallocate_prealloc.ksh b/tests/zfs-tests/tests/functional/fallocate/fallocate_prealloc.ksh
new file mode 100755
index 000000000..7bb020fe5
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/fallocate/fallocate_prealloc.ksh
@@ -0,0 +1,63 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Test fallocate(2) preallocation.
+#
+# STRATEGY:
+# 1. Verify mode 0 fallocate is supported.
+# 2. Verify default 10% reserve space is honored by setting a quota.
+#
+
+verify_runnable "global"
+
+FILE=$TESTDIR/$TESTFILE0
+
+function cleanup
+{
+ log_must zfs set quota=none $TESTPOOL
+
+ [[ -e $TESTDIR ]] && log_must rm -Rf $TESTDIR/*
+}
+
+log_assert "Ensure sparse files can be preallocated"
+
+log_onexit cleanup
+
+# Pre-allocate a sparse 1GB file.
+log_must fallocate -l $((1024 * 1024 * 1024)) $FILE
+log_must rm -Rf $TESTDIR/*
+
+# Verify that an additional ~10% reserve space is required.
+log_must zfs set quota=100M $TESTPOOL
+log_mustnot fallocate -l $((150 * 1024 * 1024)) $FILE
+log_mustnot fallocate -l $((110 * 1024 * 1024)) $FILE
+log_must fallocate -l $((90 * 1024 * 1024)) $FILE
+
+log_pass "Ensure sparse files can be preallocated"
diff --git a/tests/zfs-tests/tests/functional/fallocate/fallocate_punch-hole.ksh b/tests/zfs-tests/tests/functional/fallocate/fallocate_punch-hole.ksh
new file mode 100755
index 000000000..6a8faa487
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/fallocate/fallocate_punch-hole.ksh
@@ -0,0 +1,97 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Test `fallocate --punch-hole`
+#
+# STRATEGY:
+# 1. Create a dense file
+# 2. Punch an assortment of holes in the file and verify the result.
+#
+
+verify_runnable "global"
+
+FILE=$TESTDIR/$TESTFILE0
+BLKSZ=$(get_prop recordsize $TESTPOOL)
+
+function cleanup
+{
+ [[ -e $TESTDIR ]] && log_must rm -f $FILE
+}
+
+function check_disk_size
+{
+ typeset expected_size=$1
+
+ disk_size=$(du $TESTDIR/file | awk '{print $1}')
+ if [ $disk_size -ne $expected_size ]; then
+ log_fail "Incorrect size: $disk_size != $expected_size"
+ fi
+}
+
+function check_apparent_size
+{
+ typeset expected_size=$1
+
+ apparent_size=$(stat_size)
+ if [ $apparent_size -ne $expected_size ]; then
+ log_fail "Incorrect size: $apparent_size != $expected_size"
+ fi
+}
+
+log_assert "Ensure holes can be punched in files making them sparse"
+
+log_onexit cleanup
+
+# Create a dense file and check it is the correct size.
+log_must file_write -o create -f $FILE -b $BLKSZ -c 8
+log_must check_disk_size $((131072 * 8))
+
+# Punch a hole for the first full block.
+log_must fallocate --punch-hole --offset 0 --length $BLKSZ $FILE
+log_must check_disk_size $((131072 * 7))
+
+# Partially punch a hole in the second block.
+log_must fallocate --punch-hole --offset $BLKSZ --length $((BLKSZ / 2)) $FILE
+log_must check_disk_size $((131072 * 7))
+
+# Punch a hole which overlaps the third and forth block.
+log_must fallocate --punch-hole --offset $(((BLKSZ * 2) + (BLKSZ / 2))) \
+ --length $((BLKSZ)) $FILE
+log_must check_disk_size $((131072 * 7))
+
+# Punch a hole from the fifth block past the end of file. The apparent
+# file size should not change since --keep-size is implied.
+apparent_size=$(stat_size $FILE)
+log_must fallocate --punch-hole --offset $((BLKSZ * 4)) \
+ --length $((BLKSZ * 10)) $FILE
+log_must check_disk_size $((131072 * 4))
+log_must check_apparent_size $apparent_size
+
+log_pass "Ensure holes can be punched in files making them sparse"
diff --git a/tests/zfs-tests/tests/functional/fallocate/setup.ksh b/tests/zfs-tests/tests/functional/fallocate/setup.ksh
new file mode 100755
index 000000000..32334d396
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/fallocate/setup.ksh
@@ -0,0 +1,29 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+
+#
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+DISK=${DISKS%% *}
+default_setup $DISK