diff options
author | Matthew Ahrens <mahrens@delphix.com> | 2018-05-31 10:29:12 -0700 |
---|---|---|
committer | Brian Behlendorf <behlendorf1@llnl.gov> | 2018-06-13 11:05:06 -0700 |
commit | 1fac63e56f370f675b23687ee2e634744c54e818 (patch) | |
tree | 1655183ce851b290e08c0d191ac01aeb87e2caf1 /module/zfs | |
parent | 232dd8b956703bcc774524e78110d13de93dd5bd (diff) |
OpenZFS 9577 - remove zfs_dbuf_evict_key tsd
The zfs_dbuf_evict_key TSD (thread-specific data) is not necessary -
we can instead pass a flag down in a few places to prevent recursive
dbuf eviction. Making this change has 3 benefits:
1. The code semantics are easier to understand.
2. On Linux, performance is improved, because creating/removing
TSD values (by setting to NULL vs non-NULL) is expensive, and
we do it very often.
3. According to Nexenta, the current semantics can cause a
deadlock when concurrently calling dmu_objset_evict_dbufs()
(which is rare today, but they are working on a "parallel
unmount" change that triggers this more easily):
Porting Notes:
* Minor conflict with OpenZFS 9337 which has not yet been ported.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://illumos.org/issues/9577
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/645
External-issue: DLPX-58547
Closes #7602
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/dbuf.c | 69 | ||||
-rw-r--r-- | module/zfs/dnode.c | 7 | ||||
-rw-r--r-- | module/zfs/dnode_sync.c | 17 |
3 files changed, 42 insertions, 51 deletions
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 6e2f20e50..2411c8d4f 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -155,8 +155,6 @@ static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, void *tag, dmu_buf_impl_t **dbp, int depth); static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); -uint_t zfs_dbuf_evict_key; - static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); @@ -596,14 +594,6 @@ dbuf_evict_one(void) ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); - /* - * Set the thread's tsd to indicate that it's processing evictions. - * Once a thread stops evicting from the dbuf cache it will - * reset its tsd to NULL. - */ - ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); - (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE); - dmu_buf_impl_t *db = multilist_sublist_tail(mls); while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { db = multilist_sublist_prev(mls, db); @@ -628,7 +618,6 @@ dbuf_evict_one(void) } else { multilist_sublist_unlock(mls); } - (void) tsd_set(zfs_dbuf_evict_key, NULL); } /* @@ -682,29 +671,6 @@ dbuf_evict_thread(void *unused) static void dbuf_evict_notify(void) { - - /* - * We use thread specific data to track when a thread has - * started processing evictions. This allows us to avoid deeply - * nested stacks that would have a call flow similar to this: - * - * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() - * ^ | - * | | - * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ - * - * The dbuf_eviction_thread will always have its tsd set until - * that thread exits. All other threads will only set their tsd - * if they are participating in the eviction process. This only - * happens if the eviction thread is unable to process evictions - * fast enough. To keep the dbuf cache size in check, other threads - * can evict from the dbuf cache directly. Those threads will set - * their tsd values so that we ensure that they only evict one dbuf - * from the dbuf cache. - */ - if (tsd_get(zfs_dbuf_evict_key) != NULL) - return; - /* * We check if we should evict without holding the dbuf_evict_lock, * because it's OK to occasionally make the wrong decision here, @@ -801,7 +767,6 @@ retry: dbuf_cache_multilist_index_func); refcount_create(&dbuf_cache_size); - tsd_create(&zfs_dbuf_evict_key, NULL); dbuf_evict_thread_exit = B_FALSE; mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); @@ -858,7 +823,6 @@ dbuf_fini(void) cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); - tsd_destroy(&zfs_dbuf_evict_key); mutex_destroy(&dbuf_evict_lock); cv_destroy(&dbuf_evict_cv); @@ -1152,7 +1116,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); - dbuf_rele_and_unlock(db, NULL); + dbuf_rele_and_unlock(db, NULL, B_FALSE); } static int @@ -2441,7 +2405,8 @@ dbuf_destroy(dmu_buf_impl_t *db) * value in dnode_move(), since DB_DNODE_EXIT doesn't actually * release any lock. */ - dnode_rele(dn, db); + mutex_enter(&dn->dn_mtx); + dnode_rele_and_unlock(dn, db, B_TRUE); db->db_dnode_handle = NULL; dbuf_hash_remove(db); @@ -2467,8 +2432,10 @@ dbuf_destroy(dmu_buf_impl_t *db) * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ - if (parent && parent != dndb) - dbuf_rele(parent, db); + if (parent && parent != dndb) { + mutex_enter(&parent->db_mtx); + dbuf_rele_and_unlock(parent, db, B_TRUE); + } } /* @@ -3195,7 +3162,7 @@ void dbuf_rele(dmu_buf_impl_t *db, void *tag) { mutex_enter(&db->db_mtx); - dbuf_rele_and_unlock(db, tag); + dbuf_rele_and_unlock(db, tag, B_FALSE); } void @@ -3206,10 +3173,19 @@ dmu_buf_rele(dmu_buf_t *db, void *tag) /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow - * db_dirtycnt and db_holds to be updated atomically. + * db_dirtycnt and db_holds to be updated atomically. The 'evicting' + * argument should be set if we are already in the dbuf-evicting code + * path, in which case we don't want to recursively evict. This allows us to + * avoid deeply nested stacks that would have a call flow similar to this: + * + * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() + * ^ | + * | | + * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ + * */ void -dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) +dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) { int64_t holds; @@ -3310,7 +3286,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) refcount_count(&dbuf_cache_size)); mutex_exit(&db->db_mtx); - dbuf_evict_notify(); + if (!evicting) + dbuf_evict_notify(); } if (do_arc_evict) @@ -3647,7 +3624,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); return; } @@ -4020,7 +3997,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); } static void diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 7620e11f0..ab687f7cc 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1571,11 +1571,11 @@ void dnode_rele(dnode_t *dn, void *tag) { mutex_enter(&dn->dn_mtx); - dnode_rele_and_unlock(dn, tag); + dnode_rele_and_unlock(dn, tag, B_FALSE); } void -dnode_rele_and_unlock(dnode_t *dn, void *tag) +dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) { uint64_t refs; /* Get while the hold prevents the dnode from moving. */ @@ -1606,7 +1606,8 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag) * that the handle has zero references, but that will be * asserted anyway when the handle gets destroyed. */ - dbuf_rele(db, dnh); + mutex_enter(&db->db_mtx); + dbuf_rele_and_unlock(db, dnh, evicting); } } diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 7d3850a5f..96e7bccc9 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -424,6 +424,19 @@ dnode_evict_dbufs(dnode_t *dn) avl_insert_here(&dn->dn_dbufs, db_marker, db, AVL_BEFORE); + /* + * We need to use the "marker" dbuf rather than + * simply getting the next dbuf, because + * dbuf_destroy() may actually remove multiple dbufs. + * It can call itself recursively on the parent dbuf, + * which may also be removed from dn_dbufs. The code + * flow would look like: + * + * dbuf_destroy(): + * dnode_rele_and_unlock(parent_dbuf, evicting=TRUE): + * if (!cacheable || pending_evict) + * dbuf_destroy() + */ dbuf_destroy(db); db_next = AVL_NEXT(&dn->dn_dbufs, db_marker); @@ -484,7 +497,7 @@ dnode_undirty_dbufs(list_t *list) list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); } } |