diff options
author | Justin T. Gibbs <[email protected]> | 2015-04-02 14:44:32 +1100 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2015-04-28 16:25:34 -0700 |
commit | 0c66c32d1d8b64a261cceb5f50a9e86777c5d0b2 (patch) | |
tree | 82f5630e8a4e77931e9992db3a7fac1964414716 /module/zfs/dmu_objset.c | |
parent | d683ddbb7272a179da3918cc4f922d92a2195ba2 (diff) |
Illumos 5056 - ZFS deadlock on db_mtx and dn_holds
5056 ZFS deadlock on db_mtx and dn_holds
Author: Justin Gibbs <[email protected]>
Reviewed by: Will Andrews <[email protected]>
Reviewed by: Matt Ahrens <[email protected]>
Reviewed by: George Wilson <[email protected]>
Approved by: Dan McDonald <[email protected]>
References:
https://www.illumos.org/issues/5056
https://github.com/illumos/illumos-gate/commit/bc9014e
Porting Notes:
sa_handle_get_from_db():
- the original patch includes an otherwise unmentioned fix for a
possible usage of an uninitialised variable
dmu_objset_open_impl():
- Under Illumos list_link_init() is the same as filling a list_node_t
with NULLs, so they don't notice if they miss doing list_link_init()
on a zero'd containing structure (e.g. allocated with kmem_zalloc as
here). Under Linux, not so much: an uninitialised list_node_t goes
"Boom!" some time later when it's used or destroyed.
dmu_objset_evict_dbufs():
- reduce stack usage using kmem_alloc()
Ported-by: Chris Dunlop <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Diffstat (limited to 'module/zfs/dmu_objset.c')
-rw-r--r-- | module/zfs/dmu_objset.c | 115 |
1 files changed, 73 insertions, 42 deletions
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index a02591b98..6399a0a75 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -23,6 +23,7 @@ * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -347,7 +348,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), @@ -404,7 +405,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_secondary_cache = ZFS_CACHE_ALL; } - if (ds == NULL || !dsl_dataset_is_snapshot(ds)) + if (ds == NULL || !ds->ds_is_snapshot) os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); @@ -419,20 +420,19 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); + list_link_init(&os->os_evicting_node); + mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - DMU_META_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, - &os->os_meta_dnode); + dnode_special_open(os, &os->os_phys->os_meta_dnode, + DMU_META_DNODE_OBJECT, &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { - DMU_USERUSED_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, - &os->os_userused_dnode); - DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, - &os->os_groupused_dnode); + dnode_special_open(os, &os->os_phys->os_userused_dnode, + DMU_USERUSED_OBJECT, &os->os_userused_dnode); + dnode_special_open(os, &os->os_phys->os_groupused_dnode, + DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); } *osp = os; @@ -520,7 +520,7 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EINVAL)); - } else if (!readonly && dsl_dataset_is_snapshot(ds)) { + } else if (!readonly && ds->ds_is_snapshot) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EROFS)); } @@ -576,41 +576,57 @@ dmu_objset_disown(objset_t *os, void *tag) void dmu_objset_evict_dbufs(objset_t *os) { + dnode_t *dn_marker; dnode_t *dn; - mutex_enter(&os->os_lock); + dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP); - /* process the mdn last, since the other dnodes have holds on it */ - list_remove(&os->os_dnodes, DMU_META_DNODE(os)); - list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); + mutex_enter(&os->os_lock); + dn = list_head(&os->os_dnodes); + while (dn != NULL) { + /* + * Skip dnodes without holds. We have to do this dance + * because dnode_add_ref() only works if there is already a + * hold. If the dnode has no holds, then it has no dbufs. + */ + if (dnode_add_ref(dn, FTAG)) { + list_insert_after(&os->os_dnodes, dn, dn_marker); + mutex_exit(&os->os_lock); - /* - * Find the first dnode with holds. We have to do this dance - * because dnode_add_ref() only works if you already have a - * hold. If there are no holds then it has no dbufs so OK to - * skip. - */ - for (dn = list_head(&os->os_dnodes); - dn && !dnode_add_ref(dn, FTAG); - dn = list_next(&os->os_dnodes, dn)) - continue; + dnode_evict_dbufs(dn); + dnode_rele(dn, FTAG); - while (dn) { - dnode_t *next_dn = dn; + mutex_enter(&os->os_lock); + dn = list_next(&os->os_dnodes, dn_marker); + list_remove(&os->os_dnodes, dn_marker); + } else { + dn = list_next(&os->os_dnodes, dn); + } + } + mutex_exit(&os->os_lock); - do { - next_dn = list_next(&os->os_dnodes, next_dn); - } while (next_dn && !dnode_add_ref(next_dn, FTAG)); + kmem_free(dn_marker, sizeof (dnode_t)); - mutex_exit(&os->os_lock); - dnode_evict_dbufs(dn); - dnode_rele(dn, FTAG); - mutex_enter(&os->os_lock); - dn = next_dn; + if (DMU_USERUSED_DNODE(os) != NULL) { + dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os)); + dnode_evict_dbufs(DMU_USERUSED_DNODE(os)); } - mutex_exit(&os->os_lock); + dnode_evict_dbufs(DMU_META_DNODE(os)); } +/* + * Objset eviction processing is split into into two pieces. + * The first marks the objset as evicting, evicts any dbufs that + * have a refcount of zero, and then queues up the objset for the + * second phase of eviction. Once os->os_dnodes has been cleared by + * dnode_buf_pageout()->dnode_destroy(), the second phase is executed. + * The second phase closes the special dnodes, dequeues the objset from + * the list of those undergoing eviction, and finally frees the objset. + * + * NOTE: Due to asynchronous eviction processing (invocation of + * dnode_buf_pageout()), it is possible for the meta dnode for the + * objset to have no holds even though os->os_dnodes is not empty. + */ void dmu_objset_evict(objset_t *os) { @@ -622,7 +638,7 @@ dmu_objset_evict(objset_t *os) ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os)); @@ -656,8 +672,24 @@ dmu_objset_evict(objset_t *os) if (os->os_sa) sa_tear_down(os); + os->os_evicting = B_TRUE; dmu_objset_evict_dbufs(os); + mutex_enter(&os->os_lock); + spa_evicting_os_register(os->os_spa, os); + if (list_is_empty(&os->os_dnodes)) { + mutex_exit(&os->os_lock); + dmu_objset_evict_done(os); + } else { + mutex_exit(&os->os_lock); + } +} + +void +dmu_objset_evict_done(objset_t *os) +{ + ASSERT3P(list_head(&os->os_dnodes), ==, NULL); + dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { dnode_special_close(&os->os_userused_dnode); @@ -665,8 +697,6 @@ dmu_objset_evict(objset_t *os) } zil_free(os->os_zil); - ASSERT3P(list_head(&os->os_dnodes), ==, NULL); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); /* @@ -681,6 +711,7 @@ dmu_objset_evict(objset_t *os) mutex_destroy(&os->os_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); + spa_evicting_os_deregister(os->os_spa, os); kmem_free(os, sizeof (objset_t)); } @@ -888,7 +919,7 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx) } /* You can only clone snapshots, not the head datasets. */ - if (!dsl_dataset_is_snapshot(origin)) { + if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); return (SET_ERROR(EINVAL)); } @@ -1453,7 +1484,7 @@ int dmu_objset_is_snapshot(objset_t *os) { if (os->os_dsl_dataset != NULL) - return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); + return (os->os_dsl_dataset->ds_is_snapshot); else return (B_FALSE); } |