diff options
Diffstat (limited to 'module/zfs/dbuf.c')
-rw-r--r-- | module/zfs/dbuf.c | 271 |
1 files changed, 211 insertions, 60 deletions
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 49e23e1d7..dad090bf9 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -49,6 +49,7 @@ #include <sys/abd.h> #include <sys/vdev.h> #include <sys/cityhash.h> +#include <sys/spa_impl.h> kstat_t *dbuf_ksp; @@ -94,6 +95,18 @@ typedef struct dbuf_stats { * already created and in the dbuf hash table. */ kstat_named_t hash_insert_race; + /* + * Statistics about the size of the metadata dbuf cache. + */ + kstat_named_t metadata_cache_count; + kstat_named_t metadata_cache_size_bytes; + kstat_named_t metadata_cache_size_bytes_max; + /* + * For diagnostic purposes, this is incremented whenever we can't add + * something to the metadata cache because it's full, and instead put + * the data in the regular dbuf cache. + */ + kstat_named_t metadata_cache_overflow; } dbuf_stats_t; dbuf_stats_t dbuf_stats = { @@ -113,7 +126,11 @@ dbuf_stats_t dbuf_stats = { { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, - { "hash_insert_race", KSTAT_DATA_UINT64 } + { "hash_insert_race", KSTAT_DATA_UINT64 }, + { "metadata_cache_count", KSTAT_DATA_UINT64 }, + { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, + { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, + { "metadata_cache_overflow", KSTAT_DATA_UINT64 } }; #define DBUF_STAT_INCR(stat, val) \ @@ -175,24 +192,51 @@ static kcondvar_t dbuf_evict_cv; static boolean_t dbuf_evict_thread_exit; /* - * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that - * are not currently held but have been recently released. These dbufs - * are not eligible for arc eviction until they are aged out of the cache. - * Dbufs are added to the dbuf cache once the last hold is released. If a - * dbuf is later accessed and still exists in the dbuf cache, then it will - * be removed from the cache and later re-added to the head of the cache. - * Dbufs that are aged out of the cache will be immediately destroyed and - * become eligible for arc eviction. + * There are two dbuf caches; each dbuf can only be in one of them at a time. + * + * 1. Cache of metadata dbufs, to help make read-heavy administrative commands + * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs + * that represent the metadata that describes filesystems/snapshots/ + * bookmarks/properties/etc. We only evict from this cache when we export a + * pool, to short-circuit as much I/O as possible for all administrative + * commands that need the metadata. There is no eviction policy for this + * cache, because we try to only include types in it which would occupy a + * very small amount of space per object but create a large impact on the + * performance of these commands. Instead, after it reaches a maximum size + * (which should only happen on very small memory systems with a very large + * number of filesystem objects), we stop taking new dbufs into the + * metadata cache, instead putting them in the normal dbuf cache. + * + * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that + * are not currently held but have been recently released. These dbufs + * are not eligible for arc eviction until they are aged out of the cache. + * Dbufs that are aged out of the cache will be immediately destroyed and + * become eligible for arc eviction. + * + * Dbufs are added to these caches once the last hold is released. If a dbuf is + * later accessed and still exists in the dbuf cache, then it will be removed + * from the cache and later re-added to the head of the cache. + * + * If a given dbuf meets the requirements for the metadata cache, it will go + * there, otherwise it will be considered for the generic LRU dbuf cache. The + * caches and the refcounts tracking their sizes are stored in an array indexed + * by those caches' matching enum values (from dbuf_cached_state_t). */ -static multilist_t *dbuf_cache; -static refcount_t dbuf_cache_size; -unsigned long dbuf_cache_max_bytes = 0; +typedef struct dbuf_cache { + multilist_t *cache; + refcount_t size; +} dbuf_cache_t; +dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; -/* Set the default size of the dbuf cache to log2 fraction of arc size. */ +/* Size limits for the caches */ +unsigned long dbuf_cache_max_bytes = 0; +unsigned long dbuf_metadata_cache_max_bytes = 0; +/* Set the default sizes of the caches to log2 fraction of arc size */ int dbuf_cache_shift = 5; +int dbuf_metadata_cache_shift = 6; /* - * The dbuf cache uses a three-stage eviction policy: + * The LRU dbuf cache uses a three-stage eviction policy: * - A low water marker designates when the dbuf eviction thread * should stop evicting from the dbuf cache. * - When we reach the maximum size (aka mid water mark), we @@ -382,6 +426,39 @@ dbuf_hash_insert(dmu_buf_impl_t *db) } /* + * This returns whether this dbuf should be stored in the metadata cache, which + * is based on whether it's from one of the dnode types that store data related + * to traversing dataset hierarchies. + */ +static boolean_t +dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) +{ + DB_DNODE_ENTER(db); + dmu_object_type_t type = DB_DNODE(db)->dn_type; + DB_DNODE_EXIT(db); + + /* Check if this dbuf is one of the types we care about */ + if (DMU_OT_IS_METADATA_CACHED(type)) { + /* If we hit this, then we set something up wrong in dmu_ot */ + ASSERT(DMU_OT_IS_METADATA(type)); + + /* + * Sanity check for small-memory systems: don't allocate too + * much memory for this purpose. + */ + if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) > + dbuf_metadata_cache_max_bytes) { + DBUF_STAT_BUMP(metadata_cache_overflow); + return (B_FALSE); + } + + return (B_TRUE); + } + + return (B_FALSE); +} + +/* * Remove an entry from the hash table. It must be in the EVICTING state. */ static void @@ -574,13 +651,15 @@ dbuf_cache_lowater_bytes(void) static inline boolean_t dbuf_cache_above_hiwater(void) { - return (refcount_count(&dbuf_cache_size) > dbuf_cache_hiwater_bytes()); + return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > + dbuf_cache_hiwater_bytes()); } static inline boolean_t dbuf_cache_above_lowater(void) { - return (refcount_count(&dbuf_cache_size) > dbuf_cache_lowater_bytes()); + return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > + dbuf_cache_lowater_bytes()); } /* @@ -589,8 +668,9 @@ dbuf_cache_above_lowater(void) static void dbuf_evict_one(void) { - int idx = multilist_get_random_index(dbuf_cache); - multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx); + int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache); + multilist_sublist_t *mls = multilist_sublist_lock( + dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); @@ -605,15 +685,17 @@ dbuf_evict_one(void) if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); - (void) refcount_remove_many(&dbuf_cache_size, + (void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], db->db.db_size); + ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); + db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); DBUF_STAT_MAX(cache_size_bytes_max, - refcount_count(&dbuf_cache_size)); + refcount_count(&dbuf_caches[DB_DBUF_CACHE].size)); DBUF_STAT_BUMP(cache_total_evicts); } else { multilist_sublist_unlock(mls); @@ -676,7 +758,8 @@ dbuf_evict_notify(void) * because it's OK to occasionally make the wrong decision here, * and grabbing the lock results in massive lock contention. */ - if (refcount_count(&dbuf_cache_size) > dbuf_cache_target_bytes()) { + if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > + dbuf_cache_target_bytes()) { if (dbuf_cache_above_hiwater()) dbuf_evict_one(); cv_signal(&dbuf_evict_cv); @@ -691,8 +774,10 @@ dbuf_kstat_update(kstat_t *ksp, int rw) if (rw == KSTAT_WRITE) { return (SET_ERROR(EACCES)); } else { + ds->metadata_cache_size_bytes.value.ui64 = + refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size); ds->cache_size_bytes.value.ui64 = - refcount_count(&dbuf_cache_size); + refcount_count(&dbuf_caches[DB_DBUF_CACHE].size); ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes(); ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes(); ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes(); @@ -746,15 +831,21 @@ retry: dbuf_stats_init(h); /* - * Setup the parameters for the dbuf cache. We set the size of the - * dbuf cache to 1/32nd (default) of the target size of the ARC. If - * the value has been specified as a module option and it's not - * greater than the target size of the ARC, then we honor that value. + * Setup the parameters for the dbuf caches. We set the sizes of the + * dbuf cache and the metadata cache to 1/32nd and 1/16th (default) + * of the target size of the ARC. If the values has been specified as + * a module option and they're not greater than the target size of the + * ARC, then we honor that value. */ if (dbuf_cache_max_bytes == 0 || dbuf_cache_max_bytes >= arc_target_bytes()) { dbuf_cache_max_bytes = arc_target_bytes() >> dbuf_cache_shift; } + if (dbuf_metadata_cache_max_bytes == 0 || + dbuf_metadata_cache_max_bytes >= arc_target_bytes()) { + dbuf_metadata_cache_max_bytes = + arc_target_bytes() >> dbuf_metadata_cache_shift; + } /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc @@ -762,10 +853,13 @@ retry: */ dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); - dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_cache_link), - dbuf_cache_multilist_index_func); - refcount_create(&dbuf_cache_size); + for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { + dbuf_caches[dcs].cache = + multilist_create(sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_cache_link), + dbuf_cache_multilist_index_func); + refcount_create(&dbuf_caches[dcs].size); + } dbuf_evict_thread_exit = B_FALSE; mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); @@ -827,8 +921,10 @@ dbuf_fini(void) mutex_destroy(&dbuf_evict_lock); cv_destroy(&dbuf_evict_cv); - refcount_destroy(&dbuf_cache_size); - multilist_destroy(dbuf_cache); + for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { + refcount_destroy(&dbuf_caches[dcs].size); + multilist_destroy(dbuf_caches[dcs].cache); + } if (dbuf_ksp != NULL) { kstat_delete(dbuf_ksp); @@ -1116,7 +1212,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); - dbuf_rele_and_unlock(db, NULL, B_FALSE); + dbuf_rele_and_unlock(db, NULL); } @@ -2430,13 +2526,23 @@ dbuf_destroy(dmu_buf_impl_t *db) dbuf_clear_data(db); if (multilist_link_active(&db->db_cache_link)) { - multilist_remove(dbuf_cache, db); - (void) refcount_remove_many(&dbuf_cache_size, + ASSERT(db->db_caching_status == DB_DBUF_CACHE || + db->db_caching_status == DB_DBUF_METADATA_CACHE); + + multilist_remove(dbuf_caches[db->db_caching_status].cache, db); + (void) refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); - DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); - DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); + + if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { + DBUF_STAT_BUMPDOWN(metadata_cache_count); + } else { + DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); + DBUF_STAT_BUMPDOWN(cache_count); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], + db->db.db_size); + } + db->db_caching_status = DB_NO_CACHE; } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); @@ -2474,7 +2580,7 @@ dbuf_destroy(dmu_buf_impl_t *db) * release any lock. */ mutex_enter(&dn->dn_mtx); - dnode_rele_and_unlock(dn, db, B_TRUE); + dnode_rele_and_unlock(dn, db); db->db_dnode_handle = NULL; dbuf_hash_remove(db); @@ -2491,6 +2597,7 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT(!multilist_link_active(&db->db_cache_link)); kmem_cache_free(dbuf_kmem_cache, db); @@ -2502,7 +2609,7 @@ dbuf_destroy(dmu_buf_impl_t *db) */ if (parent && parent != dndb) { mutex_enter(&parent->db_mtx); - dbuf_rele_and_unlock(parent, db, B_TRUE); + dbuf_rele_and_unlock(parent, db); } } @@ -2640,6 +2747,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; + db->db_caching_status = DB_NO_CACHE; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); return (db); @@ -2673,6 +2781,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, avl_add(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; + db->db_caching_status = DB_NO_CACHE; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); @@ -3059,13 +3168,25 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh) if (multilist_link_active(&dh->dh_db->db_cache_link)) { ASSERT(refcount_is_zero(&dh->dh_db->db_holds)); - multilist_remove(dbuf_cache, dh->dh_db); - (void) refcount_remove_many(&dbuf_cache_size, + ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE || + dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE); + + multilist_remove( + dbuf_caches[dh->dh_db->db_caching_status].cache, + dh->dh_db); + (void) refcount_remove_many( + &dbuf_caches[dh->dh_db->db_caching_status].size, dh->dh_db->db.db_size, dh->dh_db); - DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]); - DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level], - dh->dh_db->db.db_size); + + if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) { + DBUF_STAT_BUMPDOWN(metadata_cache_count); + } else { + DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]); + DBUF_STAT_BUMPDOWN(cache_count); + DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level], + dh->dh_db->db.db_size); + } + dh->dh_db->db_caching_status = DB_NO_CACHE; } (void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag); DBUF_VERIFY(dh->dh_db); @@ -3230,7 +3351,7 @@ void dbuf_rele(dmu_buf_impl_t *db, void *tag) { mutex_enter(&db->db_mtx); - dbuf_rele_and_unlock(db, tag, B_FALSE); + dbuf_rele_and_unlock(db, tag); } void @@ -3253,7 +3374,7 @@ dmu_buf_rele(dmu_buf_t *db, void *tag) * */ void -dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) +dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) { int64_t holds; @@ -3343,19 +3464,40 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) db->db_pending_evict) { dbuf_destroy(db); } else if (!multilist_link_active(&db->db_cache_link)) { - multilist_insert(dbuf_cache, db); - (void) refcount_add_many(&dbuf_cache_size, + ASSERT3U(db->db_caching_status, ==, + DB_NO_CACHE); + + dbuf_cached_state_t dcs = + dbuf_include_in_metadata_cache(db) ? + DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; + db->db_caching_status = dcs; + + multilist_insert(dbuf_caches[dcs].cache, db); + (void) refcount_add_many(&dbuf_caches[dcs].size, db->db.db_size, db); - DBUF_STAT_BUMP(cache_levels[db->db_level]); - DBUF_STAT_BUMP(cache_count); - DBUF_STAT_INCR(cache_levels_bytes[db->db_level], - db->db.db_size); - DBUF_STAT_MAX(cache_size_bytes_max, - refcount_count(&dbuf_cache_size)); + + if (dcs == DB_DBUF_METADATA_CACHE) { + DBUF_STAT_BUMP(metadata_cache_count); + DBUF_STAT_MAX( + metadata_cache_size_bytes_max, + refcount_count( + &dbuf_caches[dcs].size)); + } else { + DBUF_STAT_BUMP( + cache_levels[db->db_level]); + DBUF_STAT_BUMP(cache_count); + DBUF_STAT_INCR( + cache_levels_bytes[db->db_level], + db->db.db_size); + DBUF_STAT_MAX(cache_size_bytes_max, + refcount_count( + &dbuf_caches[dcs].size)); + } mutex_exit(&db->db_mtx); - if (!evicting) + if (db->db_caching_status == DB_DBUF_CACHE) { dbuf_evict_notify(); + } } if (do_arc_evict) @@ -3706,7 +3848,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); return; } @@ -4081,7 +4223,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); } static void @@ -4445,8 +4587,17 @@ MODULE_PARM_DESC(dbuf_cache_lowater_pct, "Percentage below dbuf_cache_max_bytes when the evict thread stops " "evicting dbufs."); +module_param(dbuf_metadata_cache_max_bytes, ulong, 0644); +MODULE_PARM_DESC(dbuf_metadata_cache_max_bytes, + "Maximum size in bytes of the dbuf metadata cache."); + module_param(dbuf_cache_shift, int, 0644); MODULE_PARM_DESC(dbuf_cache_shift, "Set the size of the dbuf cache to a log2 fraction of arc size."); + +module_param(dbuf_metadata_cache_shift, int, 0644); +MODULE_PARM_DESC(dbuf_cache_shift, + "Set the size of the dbuf metadata cache to a log2 fraction of " + "arc size."); /* END CSTYLED */ #endif |