author     Brian Behlendorf <[email protected]>  2017-08-08 08:38:53 -0700
committer  GitHub <[email protected]>  2017-08-08 08:38:53 -0700
commit     9631681b75336ec6265d8fa5cecb353687c1f373 (patch)
tree       bbfcadbf6ca2a583d27629cf3428401e9f6a85c6
parent     d19a6d5c80fb24451a7d76716eaf38d3a3f933c7 (diff)
Fix dnode allocation race
When performing concurrent object allocations using the new
multi-threaded allocator and large dnodes it's possible to allocate
overlapping large dnodes.

This case should have been handled by detecting an error returned by
dnode_hold_impl(). But that logic only checked that the returned dnp
was non-NULL, and the dnp variable was not reset to NULL when retrying.
Resolve this issue by properly checking the return value of
dnode_hold_impl().

Additionally, it was possible that dnode_hold_impl() would misreport a
dnode as free when it was in fact in use. This could occur for two
reasons:

* The per-slot zrl_lock must be held over the entire critical section,
  which includes the alloc/free, until the new dnode is assigned to
  children_dnodes. Additionally, all of the zrl_locks in the range must
  be held to protect moving dnodes.

* The dn->dn_type cannot be solely relied upon to check the type. When
  allocating a new dnode its type will be DMU_OT_NONE after
  dnode_create(). Only later, when dnode_allocate() is called, will it
  transition to the new type. This means there is a window during
  allocation where it can be mistaken for a free dnode.

Reviewed-by: Giuseppe Di Natale <[email protected]>
Reviewed-by: Ned Bass <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Reviewed-by: Olaf Faaland <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #6414
Closes #6439
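For illustration, here is a minimal stand-alone C sketch of the bug class
described above: inferring success from the out pointer instead of the return
value. This is not ZFS code; try_hold(), slot_t, and the alloc_*() functions
are hypothetical stand-ins for dnode_hold_impl(), dnode_t, and the allocation
loop in dmu_object_alloc_dnsize(). The model is simplified (in the real loop
the stale pointer was left over from an earlier retry), but the unsafe check
is the same.

    #include <errno.h>
    #include <stdio.h>
    #include <stddef.h>

    typedef struct slot { int id; } slot_t;     /* stand-in for dnode_t */

    static slot_t table[4] = { { 0 }, { 1 }, { 2 }, { 3 } };

    /*
     * Stand-in for dnode_hold_impl(): "holds" odd-numbered slots only.
     * It may write *out even on failure and never resets *out to NULL,
     * so the caller cannot infer success from the pointer.
     */
    static int
    try_hold(int id, slot_t **out)
    {
            *out = &table[id % 4];
            return ((id % 2 == 0) ? ENOSPC : 0);
    }

    /* Buggy pattern: tests the out pointer, which may be stale or invalid. */
    static slot_t *
    alloc_buggy(void)
    {
            slot_t *s = NULL;

            for (int id = 0; id < 4; id++) {
                    (void) try_hold(id, &s);
                    if (s != NULL)          /* wrongly accepts the failed hold */
                            return (s);
            }
            return (NULL);
    }

    /* Fixed pattern: trusts only the return value, as the change below does. */
    static slot_t *
    alloc_fixed(void)
    {
            slot_t *s = NULL;

            for (int id = 0; id < 4; id++) {
                    int error = try_hold(id, &s);
                    if (error == 0)
                            return (s);
            }
            return (NULL);
    }

    int
    main(void)
    {
            printf("buggy picks slot %d, fixed picks slot %d\n",
                alloc_buggy()->id, alloc_fixed()->id);
            return (0);
    }

In this toy model the buggy loop returns slot 0 even though its hold failed
with ENOSPC, while the fixed loop correctly skips to slot 1.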
-rw-r--r--  module/zfs/dmu_object.c  6
-rw-r--r--  module/zfs/dmu_tx.c  10
-rw-r--r--  module/zfs/dnode.c  119
-rw-r--r--  module/zfs/zap_micro.c  2
-rw-r--r--  module/zfs/zfs_znode.c  2
-rw-r--r--  tests/runfiles/linux.run  2
-rw-r--r--  tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am  3
-rwxr-xr-x  tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh  60
8 files changed, 136 insertions(+), 68 deletions(-)
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
index 1c7b26d4b..14264ec30 100644
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -61,6 +61,7 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
boolean_t restarted = B_FALSE;
uint64_t *cpuobj = NULL;
int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+ int error;
kpreempt_disable();
cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
@@ -129,7 +130,6 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
uint64_t offset;
uint64_t blkfill;
int minlvl;
- int error;
if (os->os_rescan_dnodes) {
offset = 0;
os->os_rescan_dnodes = B_FALSE;
@@ -163,9 +163,9 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
* dmu_tx_assign(), but there is currently no mechanism
* to do so.
*/
- (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
+ error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
dn_slots, FTAG, &dn);
- if (dn != NULL) {
+ if (error == 0) {
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
/*
* Another thread could have allocated it; check
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 3b96c6bdd..7937615f2 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -1242,11 +1242,13 @@ dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
void
dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
{
- dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
- tx->tx_objset, object, THT_SPILL, 0, 0);
+ dmu_tx_hold_t *txh;
- (void) refcount_add_many(&txh->txh_space_towrite,
- SPA_OLD_MAXBLOCKSIZE, FTAG);
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+ THT_SPILL, 0, 0);
+ if (txh != NULL)
+ (void) refcount_add_many(&txh->txh_space_towrite,
+ SPA_OLD_MAXBLOCKSIZE, FTAG);
}
void
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 627cc8df1..41180bedf 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -391,7 +391,7 @@ dnode_setdblksz(dnode_t *dn, int size)
}
static dnode_t *
-dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db,
uint64_t object, dnode_handle_t *dnh)
{
dnode_t *dn;
@@ -424,11 +424,15 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dn->dn_compress = dnp->dn_compress;
dn->dn_bonustype = dnp->dn_bonustype;
dn->dn_bonuslen = dnp->dn_bonuslen;
- dn->dn_num_slots = dnp->dn_extra_slots + 1;
dn->dn_maxblkid = dnp->dn_maxblkid;
dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
dn->dn_id_flags = 0;
+ if (slots && dn->dn_type == DMU_OT_NONE)
+ dn->dn_num_slots = slots;
+ else
+ dn->dn_num_slots = dnp->dn_extra_slots + 1;
+
dmu_zfetch_init(&dn->dn_zfetch, dn);
ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
@@ -1011,7 +1015,7 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
{
dnode_t *dn;
- dn = dnode_create(os, dnp, NULL, object, dnh);
+ dn = dnode_create(os, dnp, 0, NULL, object, dnh);
zrl_init(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
}
@@ -1062,36 +1066,26 @@ dnode_buf_evict_async(void *dbu)
*
* The dnode_phys_t buffer may not be in sync with the in-core dnode
* structure, so we try to check the dnode structure first and fall back
- * to the dnode_phys_t buffer it doesn't exist.
+ * to the dnode_phys_t buffer it doesn't exist. When an in-core dnode
+ * exists we can always trust dn->dn_num_slots to be accurate, even for
+ * a held dnode which has not yet been fully allocated.
*/
static boolean_t
-dnode_is_consumed(dmu_buf_impl_t *db, int idx)
+dnode_is_consumed(dnode_children_t *children, dnode_phys_t *dn_block, int idx)
{
- dnode_handle_t *dnh;
- dmu_object_type_t ot;
- dnode_children_t *children_dnodes;
- dnode_phys_t *dn_block;
- int skip;
- int i;
-
- children_dnodes = dmu_buf_get_user(&db->db);
- dn_block = (dnode_phys_t *)db->db.db_data;
+ int skip, i;
for (i = 0; i < idx; i += skip) {
- dnh = &children_dnodes->dnc_children[i];
+ dnode_handle_t *dnh = &children->dnc_children[i];
- zrl_add(&dnh->dnh_zrlock);
if (dnh->dnh_dnode != NULL) {
- ot = dnh->dnh_dnode->dn_type;
skip = dnh->dnh_dnode->dn_num_slots;
} else {
- ot = dn_block[i].dn_type;
- skip = dn_block[i].dn_extra_slots + 1;
+ if (dn_block[i].dn_type != DMU_OT_NONE)
+ skip = dn_block[i].dn_extra_slots + 1;
+ else
+ skip = 1;
}
- zrl_remove(&dnh->dnh_zrlock);
-
- if (ot == DMU_OT_NONE)
- skip = 1;
}
return (i > idx);
@@ -1107,27 +1101,19 @@ dnode_is_consumed(dmu_buf_impl_t *db, int idx)
* to the dnode_phys_t buffer it doesn't exist.
*/
static boolean_t
-dnode_is_allocated(dmu_buf_impl_t *db, int idx)
+dnode_is_allocated(dnode_children_t *children, dnode_phys_t *dn_block, int idx)
{
dnode_handle_t *dnh;
dmu_object_type_t ot;
- dnode_children_t *children_dnodes;
- dnode_phys_t *dn_block;
- if (dnode_is_consumed(db, idx))
+ if (dnode_is_consumed(children, dn_block, idx))
return (B_FALSE);
- children_dnodes = dmu_buf_get_user(&db->db);
- dn_block = (dnode_phys_t *)db->db.db_data;
-
- dnh = &children_dnodes->dnc_children[idx];
-
- zrl_add(&dnh->dnh_zrlock);
+ dnh = &children->dnc_children[idx];
if (dnh->dnh_dnode != NULL)
ot = dnh->dnh_dnode->dn_type;
else
ot = dn_block[idx].dn_type;
- zrl_remove(&dnh->dnh_zrlock);
return (ot != DMU_OT_NONE);
}
@@ -1142,32 +1128,27 @@ dnode_is_allocated(dmu_buf_impl_t *db, int idx)
* to the dnode_phys_t buffer it doesn't exist.
*/
static boolean_t
-dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
+dnode_is_free(dnode_children_t *children, dnode_phys_t *dn_block, int idx,
+ int slots)
{
- dnode_handle_t *dnh;
- dmu_object_type_t ot;
- dnode_children_t *children_dnodes;
- dnode_phys_t *dn_block;
- int i;
-
if (idx + slots > DNODES_PER_BLOCK)
return (B_FALSE);
- children_dnodes = dmu_buf_get_user(&db->db);
- dn_block = (dnode_phys_t *)db->db.db_data;
-
- if (dnode_is_consumed(db, idx))
+ if (dnode_is_consumed(children, dn_block, idx))
return (B_FALSE);
- for (i = idx; i < idx + slots; i++) {
- dnh = &children_dnodes->dnc_children[i];
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ dmu_object_type_t ot;
+
+ if (dnh->dnh_dnode != NULL) {
+ if (dnh->dnh_dnode->dn_num_slots > 1)
+ return (B_FALSE);
- zrl_add(&dnh->dnh_zrlock);
- if (dnh->dnh_dnode != NULL)
ot = dnh->dnh_dnode->dn_type;
- else
+ } else {
ot = dn_block[i].dn_type;
- zrl_remove(&dnh->dnh_zrlock);
+ }
if (ot != DMU_OT_NONE)
return (B_FALSE);
@@ -1176,6 +1157,24 @@ dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
return (B_TRUE);
}
+static void
+dnode_hold_slots(dnode_children_t *children, int idx, int slots)
+{
+ for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ zrl_add(&dnh->dnh_zrlock);
+ }
+}
+
+static void
+dnode_rele_slots(dnode_children_t *children, int idx, int slots)
+{
+ for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ zrl_remove(&dnh->dnh_zrlock);
+ }
+}
+
/*
* errors:
* EINVAL - invalid object number.
@@ -1286,36 +1285,42 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
idx = object & (epb - 1);
dn_block_begin = (dnode_phys_t *)db->db.db_data;
- if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) {
+ dnode_hold_slots(children_dnodes, idx, slots);
+
+ if ((flag & DNODE_MUST_BE_FREE) &&
+ !dnode_is_free(children_dnodes, dn_block_begin, idx, slots)) {
+ dnode_rele_slots(children_dnodes, idx, slots);
dbuf_rele(db, FTAG);
return (SET_ERROR(ENOSPC));
} else if ((flag & DNODE_MUST_BE_ALLOCATED) &&
- !dnode_is_allocated(db, idx)) {
+ !dnode_is_allocated(children_dnodes, dn_block_begin, idx)) {
+ dnode_rele_slots(children_dnodes, idx, slots);
dbuf_rele(db, FTAG);
return (SET_ERROR(ENOENT));
}
dnh = &children_dnodes->dnc_children[idx];
- zrl_add(&dnh->dnh_zrlock);
dn = dnh->dnh_dnode;
if (dn == NULL)
- dn = dnode_create(os, dn_block_begin + idx, db, object, dnh);
+ dn = dnode_create(os, dn_block_begin + idx, slots, db,
+ object, dnh);
mutex_enter(&dn->dn_mtx);
type = dn->dn_type;
if (dn->dn_free_txg ||
((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) {
mutex_exit(&dn->dn_mtx);
- zrl_remove(&dnh->dnh_zrlock);
+ dnode_rele_slots(children_dnodes, idx, slots);
dbuf_rele(db, FTAG);
return (SET_ERROR(type == DMU_OT_NONE ? ENOENT : EEXIST));
}
if (refcount_add(&dn->dn_holds, tag) == 1)
dbuf_add_ref(db, dnh);
+
mutex_exit(&dn->dn_mtx);
/* Now we can rely on the hold to prevent the dnode from moving. */
- zrl_remove(&dnh->dnh_zrlock);
+ dnode_rele_slots(children_dnodes, idx, slots);
DNODE_VERIFY(dn);
ASSERT3P(dn->dn_dbuf, ==, db);
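The dnode.c changes above hold every per-slot ZRL across the whole
check-and-create window (dnode_hold_slots()/dnode_rele_slots()) rather than
locking one slot at a time. Below is a generic sketch of that range-lock
pattern, not ZFS code: pthread mutexes stand in for zrl_t, and
SLOTS_PER_BLOCK, slot_table_t, and the slots_*() helpers are hypothetical
names introduced for the example.

    #include <pthread.h>
    #include <stdbool.h>
    #include <string.h>

    #define SLOTS_PER_BLOCK 32      /* stand-in for DNODES_PER_BLOCK */

    typedef struct slot_table {
            pthread_mutex_t st_lock[SLOTS_PER_BLOCK];
            bool            st_used[SLOTS_PER_BLOCK];
    } slot_table_t;

    static void
    slot_table_init(slot_table_t *st)
    {
            memset(st->st_used, 0, sizeof (st->st_used));
            for (int i = 0; i < SLOTS_PER_BLOCK; i++)
                    pthread_mutex_init(&st->st_lock[i], NULL);
    }

    static int
    min_int(int a, int b)
    {
            return (a < b ? a : b);
    }

    /*
     * Analogue of dnode_hold_slots(): lock every slot in [idx, idx + slots).
     * Always locking in ascending index order keeps overlapping claimers
     * from deadlocking against each other.
     */
    static void
    slots_lock(slot_table_t *st, int idx, int slots)
    {
            for (int i = idx; i < min_int(idx + slots, SLOTS_PER_BLOCK); i++)
                    pthread_mutex_lock(&st->st_lock[i]);
    }

    /* Analogue of dnode_rele_slots(): drop the same range of locks. */
    static void
    slots_unlock(slot_table_t *st, int idx, int slots)
    {
            for (int i = idx; i < min_int(idx + slots, SLOTS_PER_BLOCK); i++)
                    pthread_mutex_unlock(&st->st_lock[i]);
    }

    /*
     * Claim a run of free slots.  The free check and the state change
     * happen under the same range of locks, so no other thread can
     * observe (or race into) a half-claimed run -- the property the fix
     * restores for multi-slot (large) dnodes.
     */
    static bool
    slots_claim(slot_table_t *st, int idx, int slots)
    {
            bool ok = true;

            if (idx + slots > SLOTS_PER_BLOCK)
                    return (false);

            slots_lock(st, idx, slots);
            for (int i = idx; i < idx + slots; i++) {
                    if (st->st_used[i]) {
                            ok = false;
                            break;
                    }
            }
            if (ok) {
                    for (int i = idx; i < idx + slots; i++)
                            st->st_used[i] = true;
            }
            slots_unlock(st, idx, slots);
            return (ok);
    }

The key design point mirrored from the patch is that releasing the range locks
only after the claim is recorded (in the patch, after the new dnode is created
and held) is what closes the window in which a concurrent allocator could see
the same slots as free.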
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index 2cb9f42ae..3ebf995c6 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -675,7 +675,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
dmu_buf_t *db;
mzap_phys_t *zp;
- VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
+ VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
#ifdef ZFS_DEBUG
{
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index a3a028583..e4115fef0 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -763,7 +763,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
}
zh = zfs_znode_hold_enter(zfsvfs, obj);
- VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
+ VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
/*
* If this is the root, fix up the half-initialized parent pointer
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 2017affa1..708cb4354 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -364,7 +364,7 @@ tests = ['async_destroy_001_pos']
[tests/functional/features/large_dnode]
tests = ['large_dnode_001_pos', 'large_dnode_002_pos', 'large_dnode_003_pos',
'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_006_pos',
- 'large_dnode_007_neg']
+ 'large_dnode_007_neg', 'large_dnode_008_pos']
[tests/functional/grow_pool]
tests = ['grow_pool_001_pos']
diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am b/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am
index 6afda5362..69ec5e18a 100644
--- a/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am
+++ b/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am
@@ -8,4 +8,5 @@ dist_pkgdata_SCRIPTS = \
large_dnode_004_neg.ksh \
large_dnode_005_pos.ksh \
large_dnode_006_pos.ksh \
- large_dnode_007_neg.ksh
+ large_dnode_007_neg.ksh \
+ large_dnode_008_pos.ksh
diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh
new file mode 100755
index 000000000..1f900b5ef
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh
@@ -0,0 +1,60 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+# Use is subject to license terms.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Run many xattrtests on a dataset with large dnodes and xattr=sa to
+# stress concurrent allocation of large dnodes.
+#
+
+TEST_FS=$TESTPOOL/large_dnode
+
+verify_runnable "both"
+
+function cleanup
+{
+ datasetexists $TEST_FS && log_must zfs destroy $TEST_FS
+}
+
+log_onexit cleanup
+log_assert "xattrtest runs concurrently on dataset with large dnodes"
+
+log_must zfs create $TEST_FS
+log_must zfs set dnsize=auto $TEST_FS
+log_must zfs set xattr=sa $TEST_FS
+
+for ((i=0; i < 100; i++)); do
+ dir="/$TEST_FS/dir.$i"
+ log_must mkdir "$dir"
+ log_must eval "xattrtest -R -r -y -x 1 -f 1024 -k -p $dir &"
+done
+
+log_must wait
+
+log_pass