summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--module/zfs/dmu_object.c4
-rw-r--r--module/zfs/dnode.c68
-rw-r--r--module/zfs/dnode_sync.c54
-rw-r--r--tests/runfiles/linux.run2
-rw-r--r--tests/zfs-tests/tests/functional/rsend/Makefile.am3
-rw-r--r--tests/zfs-tests/tests/functional/rsend/rsend.kshlib47
-rwxr-xr-xtests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh123
7 files changed, 270 insertions, 31 deletions
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
index b9960782e..afee869b6 100644
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -311,6 +311,10 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
return (err);
ASSERT(dn->dn_type != DMU_OT_NONE);
+ /*
+ * If we don't create this free range, we'll leak indirect blocks when
+ * we get to freeing the dnode in syncing context.
+ */
dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
dnode_free(dn, tx);
dnode_rele(dn, FTAG);
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 7939a7ba6..0be72e90a 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1889,6 +1889,72 @@ dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
}
}
+/*
+ * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
+ * and end_blkid.
+ */
+static void
+dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t db_search;
+ dmu_buf_impl_t *db;
+ avl_index_t where;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+
+ db_search.db_level = 1;
+ db_search.db_blkid = start_blkid + 1;
+ db_search.db_state = DB_SEARCH;
+ for (;;) {
+
+ db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ if (db == NULL)
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+ if (db == NULL || db->db_level != 1 ||
+ db->db_blkid >= end_blkid) {
+ break;
+ }
+
+ /*
+ * Setup the next blkid we want to search for.
+ */
+ db_search.db_blkid = db->db_blkid + 1;
+ ASSERT3U(db->db_blkid, >=, start_blkid);
+
+ /*
+ * If the dbuf transitions to DB_EVICTING while we're trying
+ * to dirty it, then we will be unable to discover it in
+ * the dbuf hash table. This will result in a call to
+ * dbuf_create() which needs to acquire the dn_dbufs_mtx
+ * lock. To avoid a deadlock, we drop the lock before
+ * dirtying the level-1 dbuf.
+ */
+ mutex_exit(&dn->dn_dbufs_mtx);
+ dnode_dirty_l1(dn, db->db_blkid, tx);
+ mutex_enter(&dn->dn_dbufs_mtx);
+ }
+
+#ifdef ZFS_DEBUG
+ /*
+ * Walk all the in-core level-1 dbufs and verify they have been dirtied.
+ */
+ db_search.db_level = 1;
+ db_search.db_blkid = start_blkid + 1;
+ db_search.db_state = DB_SEARCH;
+ db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ if (db == NULL)
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+ for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
+ if (db->db_level != 1 || db->db_blkid >= end_blkid)
+ break;
+ ASSERT(db->db_dirtycnt > 0);
+ }
+#endif
+ mutex_exit(&dn->dn_dbufs_mtx);
+}
+
void
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
@@ -2040,6 +2106,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (last != first)
dnode_dirty_l1(dn, last, tx);
+ dnode_dirty_l1range(dn, first, last, tx);
+
int shift = dn->dn_datablkshift + dn->dn_indblkshift -
SPA_BLKPTRSHIFT;
for (uint64_t i = first + 1; i < last; i++) {
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 22b401ab5..3202faf49 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -230,9 +230,24 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
}
#endif
+/*
+ * We don't usually free the indirect blocks here. If in one txg we have a
+ * free_range and a write to the same indirect block, it's important that we
+ * preserve the hole's birth times. Therefore, we don't free any any indirect
+ * blocks in free_children(). If an indirect block happens to turn into all
+ * holes, it will be freed by dbuf_write_children_ready, which happens at a
+ * point in the syncing process where we know for certain the contents of the
+ * indirect block.
+ *
+ * However, if we're freeing a dnode, its space accounting must go to zero
+ * before we actually try to free the dnode, or we will trip an assertion. In
+ * addition, we know the case described above cannot occur, because the dnode is
+ * being freed. Therefore, we free the indirect blocks immediately in that
+ * case.
+ */
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
- dmu_tx_t *tx)
+ boolean_t free_indirects, dmu_tx_t *tx)
{
dnode_t *dn;
blkptr_t *bp;
@@ -284,32 +299,16 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
rw_exit(&dn->dn_struct_rwlock);
ASSERT3P(bp, ==, subdb->db_blkptr);
- free_children(subdb, blkid, nblks, tx);
+ free_children(subdb, blkid, nblks, free_indirects, tx);
dbuf_rele(subdb, FTAG);
}
}
- /* If this whole block is free, free ourself too. */
- for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
- if (!BP_IS_HOLE(bp))
- break;
- }
- if (i == 1 << epbs) {
- /*
- * We only found holes. Grab the rwlock to prevent
- * anybody from reading the blocks we're about to
- * zero out.
- */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (free_indirects) {
+ for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
+ ASSERT(BP_IS_HOLE(bp));
bzero(db->db.db_data, db->db.db_size);
- rw_exit(&dn->dn_struct_rwlock);
free_blocks(dn, db->db_blkptr, 1, tx);
- } else {
- /*
- * Partial block free; must be marked dirty so that it
- * will be written out.
- */
- ASSERT(db->db_dirtycnt > 0);
}
DB_DNODE_EXIT(db);
@@ -322,7 +321,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
*/
static void
dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
- dmu_tx_t *tx)
+ boolean_t free_indirects, dmu_tx_t *tx)
{
blkptr_t *bp = dn->dn_phys->dn_blkptr;
int dnlevel = dn->dn_phys->dn_nlevels;
@@ -362,7 +361,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
TRUE, FALSE, FTAG, &db));
rw_exit(&dn->dn_struct_rwlock);
- free_children(db, blkid, nblks, tx);
+ free_children(db, blkid, nblks, free_indirects, tx);
dbuf_rele(db, FTAG);
}
}
@@ -387,6 +386,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
typedef struct dnode_sync_free_range_arg {
dnode_t *dsfra_dnode;
dmu_tx_t *dsfra_tx;
+ boolean_t dsfra_free_indirects;
} dnode_sync_free_range_arg_t;
static void
@@ -396,7 +396,8 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
dnode_t *dn = dsfra->dsfra_dnode;
mutex_exit(&dn->dn_mtx);
- dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
+ dnode_sync_free_range_impl(dn, blkid, nblks,
+ dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
mutex_enter(&dn->dn_mtx);
}
@@ -712,6 +713,11 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnode_sync_free_range_arg_t dsfra;
dsfra.dsfra_dnode = dn;
dsfra.dsfra_tx = tx;
+ dsfra.dsfra_free_indirects = freeing_dnode;
+ if (freeing_dnode) {
+ ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff],
+ 0, dn->dn_maxblkid + 1));
+ }
mutex_enter(&dn->dn_mtx);
range_tree_vacate(dn->dn_free_ranges[txgoff],
dnode_sync_free_range, &dsfra);
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 89563189f..c3e3887a8 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -743,7 +743,7 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
'send-c_recv_dedup', 'send_encrypted_files', 'send_encrypted_heirarchy',
- 'send_freeobjects', 'send_realloc_dnode_size']
+ 'send_freeobjects', 'send_realloc_dnode_size', 'send_hole_birth']
tags = ['functional', 'rsend']
[tests/functional/scrub_mirror]
diff --git a/tests/zfs-tests/tests/functional/rsend/Makefile.am b/tests/zfs-tests/tests/functional/rsend/Makefile.am
index fd00d6c49..79e01d190 100644
--- a/tests/zfs-tests/tests/functional/rsend/Makefile.am
+++ b/tests/zfs-tests/tests/functional/rsend/Makefile.am
@@ -39,7 +39,8 @@ dist_pkgdata_SCRIPTS = \
send-c_zstreamdump.ksh \
send-cpL_varied_recsize.ksh \
send_freeobjects.ksh \
- send_realloc_dnode_size.ksh
+ send_realloc_dnode_size.ksh \
+ send_hole_birth.ksh
dist_pkgdata_DATA = \
rsend.cfg \
diff --git a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib
index 909cd1677..72d2eb93d 100644
--- a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib
+++ b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib
@@ -153,6 +153,20 @@ function cleanup_pools
destroy_pool $POOL3
}
+function cmp_md5s {
+ typeset file1=$1
+ typeset file2=$2
+
+ eval md5sum $file1 | awk '{ print $1 }' > $BACKDIR/md5_file1
+ eval md5sum $file2 | awk '{ print $1 }' > $BACKDIR/md5_file2
+ diff $BACKDIR/md5_file1 $BACKDIR/md5_file2
+ typeset -i ret=$?
+
+ rm -f $BACKDIR/md5_file1 $BACKDIR/md5_file2
+
+ return $ret
+}
+
#
# Detect if the given two filesystems have same sub-datasets
#
@@ -388,13 +402,36 @@ function mk_files
maxsize=$2
file_id_offset=$3
fs=$4
+ bs=512
for ((i=0; i<$nfiles; i=i+1)); do
- dd if=/dev/urandom \
- of=/$fs/file-$maxsize-$((i+$file_id_offset)) \
- bs=$((($RANDOM * $RANDOM % ($maxsize - 1)) + 1)) \
- count=1 >/dev/null 2>&1 || log_fail \
- "Failed to create /$fs/file-$maxsize-$((i+$file_id_offset))"
+ file_name="/$fs/file-$maxsize-$((i+$file_id_offset))"
+ file_size=$((($RANDOM * $RANDOM % ($maxsize - 1)) + 1))
+
+ #
+ # Create an interesting mix of files which contain both
+ # data blocks and holes for more realistic test coverage.
+ # Half the files are created as sparse then partially filled,
+ # the other half is dense then a hole is punched in the file.
+ #
+ if [ $((RANDOM % 2)) -eq 0 ]; then
+ truncate -s $file_size $file_name || \
+ log_fail "Failed to create $file_name"
+ dd if=/dev/urandom of=$file_name \
+ bs=$bs count=$(($file_size / 2 / $bs)) \
+ seek=$(($RANDOM % (($file_size / 2 / $bs) + 1))) \
+ conv=notrunc >/dev/null 2>&1 || \
+ log_fail "Failed to create $file_name"
+ else
+ dd if=/dev/urandom of=$file_name \
+ bs=$file_size count=1 >/dev/null 2>&1 || \
+ log_fail "Failed to create $file_name"
+ dd if=/dev/zero of=$file_name \
+ bs=$bs count=$(($file_size / 2 / $bs)) \
+ seek=$(($RANDOM % (($file_size / 2 / $bs) + 1))) \
+ conv=notrunc >/dev/null 2>&1 || \
+ log_fail "Failed to create $file_name"
+ fi
done
echo Created $nfiles files of random sizes up to $maxsize bytes
}
diff --git a/tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh b/tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh
new file mode 100755
index 000000000..c2b5ff7a0
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh
@@ -0,0 +1,123 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# DESCRIPTION:
+# Verify send streams which contain holes.
+#
+# STRATEGY:
+# 1. Create an initial file for the full send and snapshot.
+# 2. Permute the file with holes and snapshot.
+# 3. Send the full and incremental snapshots to a new pool.
+# 4. Verify the contents of the files match.
+#
+
+sendpool=$POOL
+sendfs=$sendpool/sendfs
+
+recvpool=$POOL2
+recvfs=$recvpool/recvfs
+
+verify_runnable "both"
+
+log_assert "Test hole_birth"
+log_onexit cleanup
+
+function cleanup
+{
+ cleanup_pool $sendpool
+ cleanup_pool $recvpool
+ set_tunable64 send_holes_without_birth_time 1
+}
+
+function send_and_verify
+{
+ log_must eval "zfs send $sendfs@snap1 > $BACKDIR/pool-snap1"
+ log_must eval "zfs receive -F $recvfs < $BACKDIR/pool-snap1"
+
+ log_must eval "zfs send -i $sendfs@snap1 $sendfs@snap2 " \
+ ">$BACKDIR/pool-snap1-snap2"
+ log_must eval "zfs receive $recvfs < $BACKDIR/pool-snap1-snap2"
+
+ log_must cmp_md5s /$sendfs/file1 /$recvfs/file1
+}
+
+# By default sending hole_birth times is disabled. This functionality needs
+# to be re-enabled for this test case to verify correctness. Once we're
+# comfortable that all hole_birth bugs has been resolved this behavior may
+# be re-enabled by default.
+log_must set_tunable64 send_holes_without_birth_time 0
+
+# Incremental send truncating the file and adding new data.
+log_must zfs create -o recordsize=4k $sendfs
+
+log_must truncate -s 1G /$sendfs/file1
+log_must dd if=/dev/urandom of=/$sendfs/file1 bs=4k count=11264 seek=1152
+log_must zfs snapshot $sendfs@snap1
+
+log_must truncate -s 4194304 /$sendfs/file1
+log_must dd if=/dev/urandom of=/$sendfs/file1 bs=4k count=152 seek=384 \
+ conv=notrunc
+log_must dd if=/dev/urandom of=/$sendfs/file1 bs=4k count=10 seek=1408 \
+ conv=notrunc
+log_must zfs snapshot $sendfs@snap2
+
+send_and_verify
+log_must cleanup_pool $sendpool
+log_must cleanup_pool $recvpool
+
+# Incremental send appending a hole and data.
+log_must zfs create -o recordsize=512 $sendfs
+
+log_must dd if=/dev/urandom of=/$sendfs/file1 bs=128k count=1 seek=1
+log_must zfs snapshot $sendfs@snap1
+
+log_must dd if=/dev/urandom of=/$sendfs/file1 bs=128k count=1
+log_must dd if=/dev/urandom of=/$sendfs/file1 bs=128k count=1 seek=3
+log_must zfs snapshot $sendfs@snap2
+
+send_and_verify
+log_must cleanup_pool $sendpool
+log_must cleanup_pool $recvpool
+
+# Incremental send truncating the file and adding new data.
+log_must zfs create -o recordsize=512 $sendfs
+
+log_must truncate -s 300M /$sendfs/file1
+log_must dd if=/dev/urandom of=/$sendfs/file1 bs=512 count=128k conv=notrunc
+log_must zfs snapshot $sendfs@snap1
+
+log_must truncate -s 10M /$sendfs/file1
+log_must dd if=/dev/urandom of=/$sendfs/file1 bs=512 count=1 seek=96k \
+ conv=notrunc
+log_must zfs snapshot $sendfs@snap2
+
+send_and_verify
+
+log_pass "Test hole_birth"