summaryrefslogtreecommitdiffstats
path: root/module
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2019-01-02 11:46:04 -0800
committerGitHub <[email protected]>2019-01-02 11:46:04 -0800
commit65ca2c1eb9af3c3fb2d221aabc2afa5db7b0f8cc (patch)
tree6c1405e1391f99350b90fcefe41f0d279f02644f /module
parent06f3fc2a4b097545259935d54634c5c6f49ed20f (diff)
Fix 'zpool remap' freeing race
The dmu_objset_remap_indirects_impl() logic depends on dnode_hold() returning ENOENT for dnodes which will be freed and should be skipped. This behavior can only be relied upon when taking a new hold and while the caller has an open transaction. This ensures that the open txg cannot advance and that a concurrent free will end up in the same txg (which is critical). Relying on an existing hold will not prevent dnode_free() from succeeding. The solution is to take an additional dnode_hold() after assigning the transaction. This ensures the remap will never dirty the dnode if it was freed while we were waiting in dmu_tx_assign(, TXG_WAIT). Randomly set zfs_object_remap_one_indirect_delay_ms in ztest. This increases the likelihood of an operation racing with the remap. Converted from ticks to milliseconds. Reviewed by: Matt Ahrens <[email protected]> Reviewed by: Tom Caputi <[email protected]> Reviewed by: Igor Kozhukhov <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #8215
Diffstat (limited to 'module')
-rw-r--r--module/zfs/dmu.c34
1 files changed, 24 insertions, 10 deletions
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 3eed512e5..e8d0ce3be 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -76,9 +76,9 @@ int zfs_dmu_offset_next_sync = 0;
/*
* This can be used for testing, to ensure that certain actions happen
* while in the middle of a remap (which might otherwise complete too
- * quickly).
+ * quickly). Used by ztest(8).
*/
-int zfs_object_remap_one_indirect_delay_ticks = 0;
+int zfs_object_remap_one_indirect_delay_ms = 0;
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
@@ -1075,6 +1075,7 @@ dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
uint64_t last_removal_txg, uint64_t offset)
{
uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
+ dnode_t *dn_tx;
int err = 0;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
@@ -1093,7 +1094,9 @@ dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
/*
* If this L1 was already written after the last removal, then we've
- * already tried to remap it.
+ * already tried to remap it. An additional hold is taken after the
+ * dmu_tx_assign() to handle the case where the dnode is freed while
+ * waiting for the next open txg.
*/
if (birth <= last_removal_txg &&
dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
@@ -1102,7 +1105,11 @@ dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err == 0) {
- (void) dbuf_dirty(dbuf, tx);
+ err = dnode_hold(os, dn->dn_object, FTAG, &dn_tx);
+ if (err == 0) {
+ (void) dbuf_dirty(dbuf, tx);
+ dnode_rele(dn_tx, FTAG);
+ }
dmu_tx_commit(tx);
} else {
dmu_tx_abort(tx);
@@ -1111,7 +1118,7 @@ dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
dbuf_rele(dbuf, FTAG);
- delay(zfs_object_remap_one_indirect_delay_ticks);
+ delay(MSEC_TO_TICK(zfs_object_remap_one_indirect_delay_ms));
return (err);
}
@@ -1133,7 +1140,7 @@ dmu_object_remap_indirects(objset_t *os, uint64_t object,
{
uint64_t offset, l1span;
int err;
- dnode_t *dn;
+ dnode_t *dn, *dn_tx;
err = dnode_hold(os, object, FTAG, &dn);
if (err != 0) {
@@ -1148,13 +1155,20 @@ dmu_object_remap_indirects(objset_t *os, uint64_t object,
/*
* If the dnode has no indirect blocks, we cannot dirty them.
* We still want to remap the blkptr(s) in the dnode if
- * appropriate, so mark it as dirty.
+ * appropriate, so mark it as dirty. An additional hold is
+ * taken after the dmu_tx_assign() to handle the case where
+ * the dnode is freed while waiting for the next open txg.
*/
if (err == 0 && dnode_needs_remap(dn)) {
dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, dn->dn_object);
- if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
- dnode_setdirty(dn, tx);
+ dmu_tx_hold_bonus(tx, object);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err == 0) {
+ err = dnode_hold(os, object, FTAG, &dn_tx);
+ if (err == 0) {
+ dnode_setdirty(dn_tx, tx);
+ dnode_rele(dn_tx, FTAG);
+ }
dmu_tx_commit(tx);
} else {
dmu_tx_abort(tx);