4 files changed, 65 insertions, 35 deletions
diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c
index f2d539103..a972c720d 100644
--- a/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/module/os/freebsd/zfs/zfs_vfsops.c
@@ -89,10 +89,6 @@ int zfs_debug_level;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
 	"Debug level");
 
-int zfs_bclone_enabled = 1;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN,
-	&zfs_bclone_enabled, 0, "Enable block cloning");
-
 struct zfs_jailparam {
 	int mount_snapshot;
 };
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index b7b89b8af..a32307c39 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -4255,9 +4255,4 @@ EXPORT_SYMBOL(zfs_map);
 /* CSTYLED */
 module_param(zfs_delete_blocks, ulong, 0644);
 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
-
-/* CSTYLED */
-module_param(zfs_bclone_enabled, uint, 0644);
-MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning");
-
 #endif
diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c
index 73476ff40..3065d54fa 100644
--- a/module/os/linux/zfs/zpl_file_range.c
+++ b/module/os/linux/zfs/zpl_file_range.c
@@ -31,8 +31,6 @@
 #include <sys/zfs_vnops.h>
 #include <sys/zfeature.h>
 
-int zfs_bclone_enabled = 1;
-
 /*
  * Clone part of a file via block cloning.
  *
@@ -40,7 +38,7 @@ int zfs_bclone_enabled = 1;
  * care of that depending on how it was called.
  */
 static ssize_t
-__zpl_clone_file_range(struct file *src_file, loff_t src_off,
+zpl_clone_file_range_impl(struct file *src_file, loff_t src_off,
     struct file *dst_file, loff_t dst_off, size_t len)
 {
 	struct inode *src_i = file_inode(src_file);
@@ -96,11 +94,12 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
 {
 	ssize_t ret;
 
+	/* Flags is reserved for future extensions and must be zero. */
 	if (flags != 0)
 		return (-EINVAL);
 
-	/* Try to do it via zfs_clone_range() */
-	ret = __zpl_clone_file_range(src_file, src_off,
+	/* Try to do it via zfs_clone_range() and allow shortening. */
+	ret = zpl_clone_file_range_impl(src_file, src_off,
 	    dst_file, dst_off, len);
 
 #ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
@@ -137,6 +136,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
  * FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the
  * range in both files and if they're the same, arrange for them to be backed
  * by the same storage.
+ *
+ * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range
+ * if we want. It's designed for filesystems that may need to shorten the
+ * length for alignment, EOF, or any other requirement. ZFS may shorten the
+ * request when there is outstanding dirty data which hasn't been written.
  */
 loff_t
 zpl_remap_file_range(struct file *src_file, loff_t src_off,
@@ -145,24 +149,21 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off,
 	if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
 		return (-EINVAL);
 
-	/*
-	 * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given
-	 * range if we want. Its designed for filesystems that make data past
-	 * EOF available, and don't want it to be visible in both files. ZFS
-	 * doesn't do that, so we just turn the flag off.
-	 */
-	flags &= ~REMAP_FILE_CAN_SHORTEN;
-
+	/* No support for dedup yet */
 	if (flags & REMAP_FILE_DEDUP)
-		/* No support for dedup yet */
 		return (-EOPNOTSUPP);
 
 	/* Zero length means to clone everything to the end of the file */
 	if (len == 0)
 		len = i_size_read(file_inode(src_file)) - src_off;
 
-	return (__zpl_clone_file_range(src_file, src_off,
-	    dst_file, dst_off, len));
+	ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
+	    dst_file, dst_off, len);
+
+	if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len)
+		ret = -EINVAL;
+
+	return (ret);
 }
 #endif /* HAVE_VFS_REMAP_FILE_RANGE */
 
@@ -179,8 +180,14 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off,
 	if (len == 0)
 		len = i_size_read(file_inode(src_file)) - src_off;
 
-	return (__zpl_clone_file_range(src_file, src_off,
-	    dst_file, dst_off, len));
+	/* The entire length must be cloned or this is an error. */
+	ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
+	    dst_file, dst_off, len);
+
+	if (ret >= 0 && ret != len)
+		ret = -EINVAL;
+
+	return (ret);
 }
 #endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */
 
@@ -214,8 +221,7 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg)
 
 	size_t len = i_size_read(file_inode(src_file));
 
-	ssize_t ret =
-	    __zpl_clone_file_range(src_file, 0, dst_file, 0, len);
+	ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len);
 
 	fput(src_file);
 
@@ -253,7 +259,7 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg)
 	if (len == 0)
 		len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset;
 
-	ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset,
+	ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset,
 	    dst_file, fcr.fcr_dest_offset, len);
 
 	fput(src_file);
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index c8ff7b643..7f39ad6fc 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -58,6 +58,26 @@
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_znode.h>
 
+/*
+ * Enable the experimental block cloning feature.  If this setting is 0, then
+ * even if feature@block_cloning is enabled, attempts to clone blocks will act
+ * as though the feature is disabled.
+ */
+int zfs_bclone_enabled = 1;
+
+/*
+ * When set zfs_clone_range() waits for dirty data to be written to disk.
+ * This allows the clone operation to reliably succeed when a file is modified
+ * and then immediately cloned. For small files this may be slower than making
+ * a copy of the file and is therefore not the default.  However, in certain
+ * scenarios this behavior may be desirable so a tunable is provided.
+ */
+static int zfs_bclone_wait_dirty = 0;
+
+/*
+ * Maximum bytes to read per chunk in zfs_read().
+ */
+static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
 
 int
 zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
@@ -182,8 +202,6 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
 	return (error);
 }
 
-static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
-
 /*
  * Read bytes from specified file into supplied buffer.
  *
@@ -1049,6 +1067,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
 	size_t		maxblocks, nbps;
 	uint_t		inblksz;
 	uint64_t	clear_setid_bits_txg = 0;
+	uint64_t	last_synced_txg = 0;
 
 	inoff = *inoffp;
 	outoff = *outoffp;
@@ -1287,15 +1306,23 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
 		}
 
 		nbps = maxblocks;
+		last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
 		error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
 		    &nbps);
 		if (error != 0) {
 			/*
 			 * If we are trying to clone a block that was created
-			 * in the current transaction group, error will be
-			 * EAGAIN here, which we can just return to the caller
-			 * so it can fallback if it likes.
+			 * in the current transaction group, the error will be
+			 * EAGAIN here.  Based on zfs_bclone_wait_dirty either
+			 * return a shortened range to the caller so it can
+			 * fallback, or wait for the next TXG and check again.
 			 */
+			if (error == EAGAIN && zfs_bclone_wait_dirty) {
+				txg_wait_synced(dmu_objset_pool(inos),
+				    last_synced_txg + 1);
+				continue;
+			}
+
 			break;
 		}
 
@@ -1517,3 +1544,9 @@ EXPORT_SYMBOL(zfs_clone_range_replay);
 
 ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
 	"Bytes to read per chunk");
+
+ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
+	"Enable block cloning");
+
+ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
+	"Wait for dirty blocks when cloning");