OpenZFS 9337 - zfs get all is slow due to uncached metadata

This project's goal is to make read-heavy channel programs and zfs(1m)
administrative commands faster by caching all the metadata that they will
need in the dbuf layer. This will prevent the data from being evicted, so
that any future call to, e.g., zfs get all won't have to go to disk (very
much). There are two parts:

1. The dbuf_metadata_cache. We identify what to put into the cache based on
   the object type of each dbuf.

2. Caching objset properties os_{version,normalization,utf8only,casesensitivity}
   in the objset_t. The reason these needed to be cached is that although
   they are queried frequently, they aren't stored in a dbuf type which we
   can easily recognize and cache in the dbuf layer; instead, we have to
   explicitly store them. There's already existing infrastructure for
   maintaining cached properties in the objset setup code, so I simply used
   that.

Performance Testing:

 - Disabled kmem_flags
 - Tuned dbuf_cache_max_bytes very low (128K)
 - Tuned zfs_arc_max very low (64M)

Created test pool with 400 filesystems, and 100 snapshots per filesystem.
Later on in testing, added 600 more filesystems (with no snapshots) to make
sure scaling didn't look different between snapshots and filesystems.

Results:

    | Test                   | Time (trunk / diff) | I/Os (trunk / diff) |
    +------------------------+---------------------+---------------------+
    | zpool import           | 0:05 / 0:06         | 12.9k / 12.9k       |
    | zfs get all (uncached) | 1:36 / 0:53         | 16.7k / 5.7k        |
    | zfs get all (cached)   | 1:36 / 0:51         | 16.0k / 6.0k        |

Authored by: Matthew Ahrens <[email protected]>
Reviewed by: Prakash Surya <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Thomas Caputi <[email protected]>
Reviewed by: Brian Behlendorf <[email protected]>
Approved by: Richard Lowe <[email protected]>
Ported-by: Alek Pinchuk <[email protected]>
Signed-off-by: Alek Pinchuk <[email protected]>

OpenZFS-issue: https://illumos.org/issues/9337
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/7dec52f
Closes #7668
author: Matthew Ahrens <[email protected]> 2018-07-10 13:49:50 -0400
committer: Brian Behlendorf <[email protected]> 2018-07-12 10:49:27 -0700
commit: 2e5dc449c1a65e0b0bf730fd69c9b5804bd57ee8 (patch)
tree: 8bbb44d8e88afa71ae1071c7a29369721711b0dd /module
parent: e4e94ca3154a9e58ac20e5409c003895ec859964 (diff)
9 files changed, 343 insertions, 156 deletions
diff --git a/module/zcommon/zfs_comutil.c b/module/zcommon/zfs_comutil.c
index 1f74095cc..5daa6907c 100644
--- a/module/zcommon/zfs_comutil.c
+++ b/module/zcommon/zfs_comutil.c
@@ -204,10 +204,28 @@ const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = {
 	"pool split",
 };
 
+boolean_t
+zfs_dataset_name_hidden(const char *name)
+{
+	/*
+	 * Report B_TRUE for names that dataset listings should skip:
+	 * internal datasets ('$' in the name), temporary datasets ('%' in
+	 * the name), and datasets not visible from a non-global zone.
+	 */
+	if (strchr(name, '$') != NULL)
+		return (B_TRUE);
+	if (strchr(name, '%') != NULL)
+		return (B_TRUE);
+	if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL))
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
 #if defined(_KERNEL)
 EXPORT_SYMBOL(zfs_allocatable_devs);
 EXPORT_SYMBOL(zpool_get_load_policy);
 EXPORT_SYMBOL(zfs_zpl_version_map);
 EXPORT_SYMBOL(zfs_spa_version_map);
 EXPORT_SYMBOL(zfs_history_event_names);
+EXPORT_SYMBOL(zfs_dataset_name_hidden);
 #endif
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 49e23e1d7..dad090bf9 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -49,6 +49,7 @@
 #include <sys/abd.h>
 #include <sys/vdev.h>
 #include <sys/cityhash.h>
+#include <sys/spa_impl.h>
 
 kstat_t *dbuf_ksp;
 
@@ -94,6 +95,18 @@ typedef struct dbuf_stats {
 	 * already created and in the dbuf hash table.
 	 */
 	kstat_named_t hash_insert_race;
+	/*
+	 * Statistics about the size of the metadata dbuf cache.
+	 */
+	kstat_named_t metadata_cache_count;
+	kstat_named_t metadata_cache_size_bytes;
+	kstat_named_t metadata_cache_size_bytes_max;
+	/*
+	 * For diagnostic purposes, this is incremented whenever we can't add
+	 * something to the metadata cache because it's full, and instead put
+	 * the data in the regular dbuf cache.
+	 */
+	kstat_named_t metadata_cache_overflow;
 } dbuf_stats_t;
 
 dbuf_stats_t dbuf_stats = {
@@ -113,7 +126,11 @@ dbuf_stats_t dbuf_stats = {
 	{ "hash_elements_max",			KSTAT_DATA_UINT64 },
 	{ "hash_chains",			KSTAT_DATA_UINT64 },
 	{ "hash_chain_max",			KSTAT_DATA_UINT64 },
-	{ "hash_insert_race",			KSTAT_DATA_UINT64 }
+	{ "hash_insert_race",			KSTAT_DATA_UINT64 },
+	{ "metadata_cache_count",		KSTAT_DATA_UINT64 },
+	{ "metadata_cache_size_bytes",		KSTAT_DATA_UINT64 },
+	{ "metadata_cache_size_bytes_max",	KSTAT_DATA_UINT64 },
+	{ "metadata_cache_overflow",		KSTAT_DATA_UINT64 }
 };
 
 #define	DBUF_STAT_INCR(stat, val)	\
@@ -175,24 +192,51 @@ static kcondvar_t dbuf_evict_cv;
 static boolean_t dbuf_evict_thread_exit;
 
 /*
- * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
- * are not currently held but have been recently released. These dbufs
- * are not eligible for arc eviction until they are aged out of the cache.
- * Dbufs are added to the dbuf cache once the last hold is released. If a
- * dbuf is later accessed and still exists in the dbuf cache, then it will
- * be removed from the cache and later re-added to the head of the cache.
- * Dbufs that are aged out of the cache will be immediately destroyed and
- * become eligible for arc eviction.
+ * There are two dbuf caches; each dbuf can only be in one of them at a time.
+ *
+ * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
+ *    from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
+ *    that represent the metadata that describes filesystems/snapshots/
+ *    bookmarks/properties/etc. We only evict from this cache when we export a
+ *    pool, to short-circuit as much I/O as possible for all administrative
+ *    commands that need the metadata. There is no eviction policy for this
+ *    cache, because we try to only include types in it which would occupy a
+ *    very small amount of space per object but create a large impact on the
+ *    performance of these commands. Instead, after it reaches a maximum size
+ *    (which should only happen on very small memory systems with a very large
+ *    number of filesystem objects), we stop taking new dbufs into the
+ *    metadata cache, instead putting them in the normal dbuf cache.
+ *
+ * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
+ *    are not currently held but have been recently released. These dbufs
+ *    are not eligible for arc eviction until they are aged out of the cache.
+ *    Dbufs that are aged out of the cache will be immediately destroyed and
+ *    become eligible for arc eviction.
+ *
+ * Dbufs are added to these caches once the last hold is released. If a dbuf is
+ * later accessed and still exists in the dbuf cache, then it will be removed
+ * from the cache and later re-added to the head of the cache.
+ *
+ * If a given dbuf meets the requirements for the metadata cache, it will go
+ * there, otherwise it will be considered for the generic LRU dbuf cache. The
+ * caches and the refcounts tracking their sizes are stored in an array indexed
+ * by those caches' matching enum values (from dbuf_cached_state_t).
  */
-static multilist_t *dbuf_cache;
-static refcount_t dbuf_cache_size;
-unsigned long dbuf_cache_max_bytes = 0;
+typedef struct dbuf_cache {
+	multilist_t *cache;
+	refcount_t size;
+} dbuf_cache_t;
+dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
 
-/* Set the default size of the dbuf cache to log2 fraction of arc size. */
+/* Size limits for the caches */
+unsigned long dbuf_cache_max_bytes = 0;
+unsigned long dbuf_metadata_cache_max_bytes = 0;
+/* Set the default sizes of the caches to log2 fraction of arc size */
 int dbuf_cache_shift = 5;
+int dbuf_metadata_cache_shift = 6;
 
 /*
- * The dbuf cache uses a three-stage eviction policy:
+ * The LRU dbuf cache uses a three-stage eviction policy:
  *	- A low water marker designates when the dbuf eviction thread
  *	should stop evicting from the dbuf cache.
  *	- When we reach the maximum size (aka mid water mark), we
@@ -382,6 +426,39 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
 }
 
 /*
+ * This returns whether this dbuf should be stored in the metadata cache, which
+ * is based on whether it's from one of the dnode types that store data related
+ * to traversing dataset hierarchies.
+ */
+static boolean_t
+dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
+{
+	DB_DNODE_ENTER(db);
+	dmu_object_type_t type = DB_DNODE(db)->dn_type;
+	DB_DNODE_EXIT(db);
+
+	/* Only types passing DMU_OT_IS_METADATA_CACHED() are candidates */
+	if (DMU_OT_IS_METADATA_CACHED(type)) {
+		/* If we hit this, then we set something up wrong in dmu_ot */
+		ASSERT(DMU_OT_IS_METADATA(type));
+
+		/*
+		 * Past dbuf_metadata_cache_max_bytes, count the overflow and
+		 * decline, so the dbuf goes to the regular dbuf cache.
+		 */
+		if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
+		    dbuf_metadata_cache_max_bytes) {
+			DBUF_STAT_BUMP(metadata_cache_overflow);
+			return (B_FALSE);
+		}
+
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
  * Remove an entry from the hash table.  It must be in the EVICTING state.
  */
 static void
@@ -574,13 +651,15 @@ dbuf_cache_lowater_bytes(void)
 static inline boolean_t
 dbuf_cache_above_hiwater(void)
 {
-	return (refcount_count(&dbuf_cache_size) > dbuf_cache_hiwater_bytes());
+	return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
+	    dbuf_cache_hiwater_bytes());
 }
 
 static inline boolean_t
 dbuf_cache_above_lowater(void)
 {
-	return (refcount_count(&dbuf_cache_size) > dbuf_cache_lowater_bytes());
+	return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
+	    dbuf_cache_lowater_bytes());
 }
 
 /*
@@ -589,8 +668,9 @@ dbuf_cache_above_lowater(void)
 static void
 dbuf_evict_one(void)
 {
-	int idx = multilist_get_random_index(dbuf_cache);
-	multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx);
+	int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
+	multilist_sublist_t *mls = multilist_sublist_lock(
+	    dbuf_caches[DB_DBUF_CACHE].cache, idx);
 
 	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
 
@@ -605,15 +685,17 @@ dbuf_evict_one(void)
 	if (db != NULL) {
 		multilist_sublist_remove(mls, db);
 		multilist_sublist_unlock(mls);
-		(void) refcount_remove_many(&dbuf_cache_size,
+		(void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size,
 		    db->db.db_size, db);
 		DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
 		DBUF_STAT_BUMPDOWN(cache_count);
 		DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
 		    db->db.db_size);
+		ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
+		db->db_caching_status = DB_NO_CACHE;
 		dbuf_destroy(db);
 		DBUF_STAT_MAX(cache_size_bytes_max,
-		    refcount_count(&dbuf_cache_size));
+		    refcount_count(&dbuf_caches[DB_DBUF_CACHE].size));
 		DBUF_STAT_BUMP(cache_total_evicts);
 	} else {
 		multilist_sublist_unlock(mls);
@@ -676,7 +758,8 @@ dbuf_evict_notify(void)
 	 * because it's OK to occasionally make the wrong decision here,
 	 * and grabbing the lock results in massive lock contention.
 	 */
-	if (refcount_count(&dbuf_cache_size) > dbuf_cache_target_bytes()) {
+	if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
+	    dbuf_cache_target_bytes()) {
 		if (dbuf_cache_above_hiwater())
 			dbuf_evict_one();
 		cv_signal(&dbuf_evict_cv);
@@ -691,8 +774,10 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
 	if (rw == KSTAT_WRITE) {
 		return (SET_ERROR(EACCES));
 	} else {
+		ds->metadata_cache_size_bytes.value.ui64 =
+		    refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size);
 		ds->cache_size_bytes.value.ui64 =
-		    refcount_count(&dbuf_cache_size);
+		    refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
 		ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
 		ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
 		ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
@@ -746,15 +831,21 @@ retry:
 	dbuf_stats_init(h);
 
 	/*
-	 * Setup the parameters for the dbuf cache. We set the size of the
-	 * dbuf cache to 1/32nd (default) of the target size of the ARC. If
-	 * the value has been specified as a module option and it's not
-	 * greater than the target size of the ARC, then we honor that value.
+	 * Setup the parameters for the dbuf caches. We set the sizes of the
+	 * dbuf cache and the metadata cache to 1/32nd and 1/64th (default)
+	 * of the target size of the ARC. If the values have been specified as
+	 * module options and they're smaller than the target size of the
+	 * ARC, then we honor those values.
 	 */
 	if (dbuf_cache_max_bytes == 0 ||
 	    dbuf_cache_max_bytes >= arc_target_bytes()) {
 		dbuf_cache_max_bytes = arc_target_bytes() >> dbuf_cache_shift;
 	}
+	if (dbuf_metadata_cache_max_bytes == 0 ||
+	    dbuf_metadata_cache_max_bytes >= arc_target_bytes()) {
+		dbuf_metadata_cache_max_bytes =
+		    arc_target_bytes() >> dbuf_metadata_cache_shift;
+	}
 
 	/*
 	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
@@ -762,10 +853,13 @@ retry:
 	 */
 	dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
 
-	dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t),
-	    offsetof(dmu_buf_impl_t, db_cache_link),
-	    dbuf_cache_multilist_index_func);
-	refcount_create(&dbuf_cache_size);
+	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+		dbuf_caches[dcs].cache =
+		    multilist_create(sizeof (dmu_buf_impl_t),
+		    offsetof(dmu_buf_impl_t, db_cache_link),
+		    dbuf_cache_multilist_index_func);
+		refcount_create(&dbuf_caches[dcs].size);
+	}
 
 	dbuf_evict_thread_exit = B_FALSE;
 	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -827,8 +921,10 @@ dbuf_fini(void)
 	mutex_destroy(&dbuf_evict_lock);
 	cv_destroy(&dbuf_evict_cv);
 
-	refcount_destroy(&dbuf_cache_size);
-	multilist_destroy(dbuf_cache);
+	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+		refcount_destroy(&dbuf_caches[dcs].size);
+		multilist_destroy(dbuf_caches[dcs].cache);
+	}
 
 	if (dbuf_ksp != NULL) {
 		kstat_delete(dbuf_ksp);
@@ -1116,7 +1212,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
 		db->db_state = DB_UNCACHED;
 	}
 	cv_broadcast(&db->db_changed);
-	dbuf_rele_and_unlock(db, NULL, B_FALSE);
+	dbuf_rele_and_unlock(db, NULL);
 }
 
 
@@ -2430,13 +2526,23 @@ dbuf_destroy(dmu_buf_impl_t *db)
 	dbuf_clear_data(db);
 
 	if (multilist_link_active(&db->db_cache_link)) {
-		multilist_remove(dbuf_cache, db);
-		(void) refcount_remove_many(&dbuf_cache_size,
+		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
+		    db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+		multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+		(void) refcount_remove_many(
+		    &dbuf_caches[db->db_caching_status].size,
 		    db->db.db_size, db);
-		DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
-		DBUF_STAT_BUMPDOWN(cache_count);
-		DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
-		    db->db.db_size);
+
+		if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
+			DBUF_STAT_BUMPDOWN(metadata_cache_count);
+		} else {
+			DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
+			DBUF_STAT_BUMPDOWN(cache_count);
+			DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
+			    db->db.db_size);
+		}
+		db->db_caching_status = DB_NO_CACHE;
 	}
 
 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
@@ -2474,7 +2580,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
 		 * release any lock.
 		 */
 		mutex_enter(&dn->dn_mtx);
-		dnode_rele_and_unlock(dn, db, B_TRUE);
+		dnode_rele_and_unlock(dn, db);
 		db->db_dnode_handle = NULL;
 
 		dbuf_hash_remove(db);
@@ -2491,6 +2597,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
 	ASSERT(db->db_hash_next == NULL);
 	ASSERT(db->db_blkptr == NULL);
 	ASSERT(db->db_data_pending == NULL);
+	ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
 	ASSERT(!multilist_link_active(&db->db_cache_link));
 
 	kmem_cache_free(dbuf_kmem_cache, db);
@@ -2502,7 +2609,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
 	 */
 	if (parent && parent != dndb) {
 		mutex_enter(&parent->db_mtx);
-		dbuf_rele_and_unlock(parent, db, B_TRUE);
+		dbuf_rele_and_unlock(parent, db);
 	}
 }
 
@@ -2640,6 +2747,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		db->db.db_offset = DMU_BONUS_BLKID;
 		db->db_state = DB_UNCACHED;
+		db->db_caching_status = DB_NO_CACHE;
 		/* the bonus dbuf is not placed in the hash table */
 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
 		return (db);
@@ -2673,6 +2781,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
 	avl_add(&dn->dn_dbufs, db);
 
 	db->db_state = DB_UNCACHED;
+	db->db_caching_status = DB_NO_CACHE;
 	mutex_exit(&dn->dn_dbufs_mtx);
 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
 
@@ -3059,13 +3168,25 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh)
 
 	if (multilist_link_active(&dh->dh_db->db_cache_link)) {
 		ASSERT(refcount_is_zero(&dh->dh_db->db_holds));
-		multilist_remove(dbuf_cache, dh->dh_db);
-		(void) refcount_remove_many(&dbuf_cache_size,
+		ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE ||
+		    dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+		multilist_remove(
+		    dbuf_caches[dh->dh_db->db_caching_status].cache,
+		    dh->dh_db);
+		(void) refcount_remove_many(
+		    &dbuf_caches[dh->dh_db->db_caching_status].size,
 		    dh->dh_db->db.db_size, dh->dh_db);
-		DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]);
-		DBUF_STAT_BUMPDOWN(cache_count);
-		DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level],
-		    dh->dh_db->db.db_size);
+
+		if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) {
+			DBUF_STAT_BUMPDOWN(metadata_cache_count);
+		} else {
+			DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]);
+			DBUF_STAT_BUMPDOWN(cache_count);
+			DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level],
+			    dh->dh_db->db.db_size);
+		}
+		dh->dh_db->db_caching_status = DB_NO_CACHE;
 	}
 	(void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag);
 	DBUF_VERIFY(dh->dh_db);
@@ -3230,7 +3351,7 @@ void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
 	mutex_enter(&db->db_mtx);
-	dbuf_rele_and_unlock(db, tag, B_FALSE);
+	dbuf_rele_and_unlock(db, tag);
 }
 
 void
@@ -3253,7 +3374,7 @@ dmu_buf_rele(dmu_buf_t *db, void *tag)
  *
  */
 void
-dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
 {
 	int64_t holds;
 
@@ -3343,19 +3464,40 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
 			    db->db_pending_evict) {
 				dbuf_destroy(db);
 			} else if (!multilist_link_active(&db->db_cache_link)) {
-				multilist_insert(dbuf_cache, db);
-				(void) refcount_add_many(&dbuf_cache_size,
+				ASSERT3U(db->db_caching_status, ==,
+				    DB_NO_CACHE);
+
+				dbuf_cached_state_t dcs =
+				    dbuf_include_in_metadata_cache(db) ?
+				    DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+				db->db_caching_status = dcs;
+
+				multilist_insert(dbuf_caches[dcs].cache, db);
+				(void) refcount_add_many(&dbuf_caches[dcs].size,
 				    db->db.db_size, db);
-				DBUF_STAT_BUMP(cache_levels[db->db_level]);
-				DBUF_STAT_BUMP(cache_count);
-				DBUF_STAT_INCR(cache_levels_bytes[db->db_level],
-				    db->db.db_size);
-				DBUF_STAT_MAX(cache_size_bytes_max,
-				    refcount_count(&dbuf_cache_size));
+
+				if (dcs == DB_DBUF_METADATA_CACHE) {
+					DBUF_STAT_BUMP(metadata_cache_count);
+					DBUF_STAT_MAX(
+					    metadata_cache_size_bytes_max,
+					    refcount_count(
+					    &dbuf_caches[dcs].size));
+				} else {
+					DBUF_STAT_BUMP(
+					    cache_levels[db->db_level]);
+					DBUF_STAT_BUMP(cache_count);
+					DBUF_STAT_INCR(
+					    cache_levels_bytes[db->db_level],
+					    db->db.db_size);
+					DBUF_STAT_MAX(cache_size_bytes_max,
+					    refcount_count(
+					    &dbuf_caches[dcs].size));
+				}
 				mutex_exit(&db->db_mtx);
 
-				if (!evicting)
+				/* db may be freed after unlock; use dcs */
+				if (dcs == DB_DBUF_CACHE)
+					dbuf_evict_notify();
 			}
 
 			if (do_arc_evict)
@@ -3706,7 +3848,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
-		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
+		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 		return;
 	}
 
@@ -4081,7 +4223,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 	db->db_data_pending = NULL;
-	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
+	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
 }
 
 static void
@@ -4445,8 +4587,17 @@ MODULE_PARM_DESC(dbuf_cache_lowater_pct,
 	"Percentage below dbuf_cache_max_bytes when the evict thread stops "
 	"evicting dbufs.");
 
+module_param(dbuf_metadata_cache_max_bytes, ulong, 0644);
+MODULE_PARM_DESC(dbuf_metadata_cache_max_bytes,
+	"Maximum size in bytes of the dbuf metadata cache.");
+
 module_param(dbuf_cache_shift, int, 0644);
 MODULE_PARM_DESC(dbuf_cache_shift,
 	"Set the size of the dbuf cache to a log2 fraction of arc size.");
+
+module_param(dbuf_metadata_cache_shift, int, 0644);
+MODULE_PARM_DESC(dbuf_metadata_cache_shift,
+	"Set the size of the dbuf metadata cache to a log2 fraction of "
+	"arc size.");
 /* END CSTYLED */
 #endif
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 1cb967641..0d2f03e22 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -81,60 +81,60 @@ int zfs_dmu_offset_next_sync = 0;
 int zfs_object_remap_one_indirect_delay_ticks = 0;
 
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
-	{ DMU_BSWAP_UINT8,	TRUE,	FALSE,	"unallocated"		},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"object directory"	},
-	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"object array"		},
-	{ DMU_BSWAP_UINT8,	TRUE,	FALSE,	"packed nvlist"		},
-	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"packed nvlist size"	},
-	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"bpobj"			},
-	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"bpobj header"		},
-	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"SPA space map header"	},
-	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"SPA space map"		},
-	{ DMU_BSWAP_UINT64,	TRUE,	TRUE,	"ZIL intent log"	},
-	{ DMU_BSWAP_DNODE,	TRUE,	TRUE,	"DMU dnode"		},
-	{ DMU_BSWAP_OBJSET,	TRUE,	FALSE,	"DMU objset"		},
-	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"DSL directory"		},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DSL directory child map"},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DSL dataset snap map"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DSL props"		},
-	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"DSL dataset"		},
-	{ DMU_BSWAP_ZNODE,	TRUE,	FALSE,	"ZFS znode"		},
-	{ DMU_BSWAP_OLDACL,	TRUE,	TRUE,	"ZFS V0 ACL"		},
-	{ DMU_BSWAP_UINT8,	FALSE,	TRUE,	"ZFS plain file"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	TRUE,	"ZFS directory"		},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"ZFS master node"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	TRUE,	"ZFS delete queue"	},
-	{ DMU_BSWAP_UINT8,	FALSE,	TRUE,	"zvol object"		},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"zvol prop"		},
-	{ DMU_BSWAP_UINT8,	FALSE,	TRUE,	"other uint8[]"		},
-	{ DMU_BSWAP_UINT64,	FALSE,	TRUE,	"other uint64[]"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"other ZAP"		},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"persistent error log"	},
-	{ DMU_BSWAP_UINT8,	TRUE,	FALSE,	"SPA history"		},
-	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"SPA history offsets"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"Pool properties"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DSL permissions"	},
-	{ DMU_BSWAP_ACL,	TRUE,	TRUE,	"ZFS ACL"		},
-	{ DMU_BSWAP_UINT8,	TRUE,	TRUE,	"ZFS SYSACL"		},
-	{ DMU_BSWAP_UINT8,	TRUE,	TRUE,	"FUID table"		},
-	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"FUID table size"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DSL dataset next clones"},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"scan work queue"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	TRUE,	"ZFS user/group/project used" },
-	{ DMU_BSWAP_ZAP,	TRUE,	TRUE,	"ZFS user/group/project quota"},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"snapshot refcount tags"},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DDT ZAP algorithm"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DDT statistics"	},
-	{ DMU_BSWAP_UINT8,	TRUE,	TRUE,	"System attributes"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	TRUE,	"SA master node"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	TRUE,	"SA attr registration"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	TRUE,	"SA attr layouts"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"scan translations"	},
-	{ DMU_BSWAP_UINT8,	FALSE,	TRUE,	"deduplicated block"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DSL deadlist map"	},
-	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"DSL deadlist map hdr"	},
-	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DSL dir clones"	},
-	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"bpobj subobj"		}
+	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "unallocated"		},
+	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "object directory"	},
+	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "object array"		},
+	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "packed nvlist"		},
+	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "packed nvlist size"	},
+	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj"			},
+	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj header"		},
+	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA space map header"	},
+	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA space map"		},
+	{DMU_BSWAP_UINT64, TRUE,  FALSE, TRUE,  "ZIL intent log"	},
+	{DMU_BSWAP_DNODE,  TRUE,  FALSE, TRUE,  "DMU dnode"		},
+	{DMU_BSWAP_OBJSET, TRUE,  TRUE,  FALSE, "DMU objset"		},
+	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL directory"		},
+	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL directory child map"},
+	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dataset snap map"	},
+	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL props"		},
+	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL dataset"		},
+	{DMU_BSWAP_ZNODE,  TRUE,  FALSE, FALSE, "ZFS znode"		},
+	{DMU_BSWAP_OLDACL, TRUE,  FALSE, TRUE,  "ZFS V0 ACL"		},
+	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "ZFS plain file"	},
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS directory"		},
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "ZFS master node"	},
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS delete queue"	},
+	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "zvol object"		},
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "zvol prop"		},
+	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "other uint8[]"		},
+	{DMU_BSWAP_UINT64, FALSE, FALSE, TRUE,  "other uint64[]"	},
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "other ZAP"		},
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "persistent error log"	},
+	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "SPA history"		},
+	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA history offsets"	},
+	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "Pool properties"	},
+	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL permissions"	},
+	{DMU_BSWAP_ACL,    TRUE,  FALSE, TRUE,  "ZFS ACL"		},
+	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,  "ZFS SYSACL"		},
+	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,  "FUID table"		},
+	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "FUID table size"	},
+	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dataset next clones"},
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "scan work queue"	},
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS user/group/project used" },
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS user/group/project quota"},
+	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "snapshot refcount tags"},
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "DDT ZAP algorithm"	},
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "DDT statistics"	},
+	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,	"System attributes"	},
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA master node"	},
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA attr registration"	},
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA attr layouts"	},
+	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "scan translations"	},
+	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "deduplicated block"	},
+	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL deadlist map"	},
+	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL deadlist map hdr"	},
+	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dir clones"	},
+	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj subobj"		}
 };
 
 const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 07b00ffdf..5b18ed5cc 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -471,6 +471,14 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
 		os->os_phys = os->os_phys_buf->b_data;
 		bzero(os->os_phys, size);
 	}
+	/*
+	 * These properties will be filled in by the logic in zfs_get_zplprop()
+	 * when they are queried for the first time.
+	 */
+	os->os_version = OBJSET_PROP_UNINITIALIZED;
+	os->os_normalization = OBJSET_PROP_UNINITIALIZED;
+	os->os_utf8only = OBJSET_PROP_UNINITIALIZED;
+	os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;
 
 	/*
 	 * Note: the changed_cb will be called once before the register
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index fddad607d..7672a62fa 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1574,11 +1574,11 @@ void
 dnode_rele(dnode_t *dn, void *tag)
 {
 	mutex_enter(&dn->dn_mtx);
-	dnode_rele_and_unlock(dn, tag, B_FALSE);
+	dnode_rele_and_unlock(dn, tag);
 }
 
 void
-dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
+dnode_rele_and_unlock(dnode_t *dn, void *tag)
 {
 	uint64_t refs;
 	/* Get while the hold prevents the dnode from moving. */
@@ -1610,7 +1610,7 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
 		 * asserted anyway when the handle gets destroyed.
 		 */
 		mutex_enter(&db->db_mtx);
-		dbuf_rele_and_unlock(db, dnh, evicting);
+		dbuf_rele_and_unlock(db, dnh);
 	}
 }
 
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 830da26f8..22b401ab5 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -438,7 +438,7 @@ dnode_evict_dbufs(dnode_t *dn)
 			 * flow would look like:
 			 *
 			 * dbuf_destroy():
-			 *   dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
+			 *   dnode_rele_and_unlock(parent_dbuf):
 			 *	if (!cacheable || pending_evict)
 			 *	  dbuf_destroy()
 			 */
@@ -502,7 +502,7 @@ dnode_undirty_dbufs(list_t *list)
 			list_destroy(&dr->dt.di.dr_children);
 		}
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
-		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
+		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 	}
 }
 
diff --git a/module/zfs/zcp_iter.c b/module/zfs/zcp_iter.c
index d37172c88..f26445520 100644
--- a/module/zfs/zcp_iter.c
+++ b/module/zfs/zcp_iter.c
@@ -33,6 +33,8 @@
 
 #include <sys/zcp.h>
 
+#include "zfs_comutil.h"
+
 typedef int (zcp_list_func_t)(lua_State *);
 typedef struct zcp_list_info {
 	const char *name;
@@ -232,20 +234,6 @@ zcp_snapshots_list(lua_State *state)
 	return (1);
 }
 
-/*
- * Note: channel programs only run in the global zone, so all datasets
- * are visible to this zone.
- */
-static boolean_t
-dataset_name_hidden(const char *name)
-{
-	if (strchr(name, '$') != NULL)
-		return (B_TRUE);
-	if (strchr(name, '%') != NULL)
-		return (B_TRUE);
-	return (B_FALSE);
-}
-
 static int
 zcp_children_iter(lua_State *state)
 {
@@ -275,7 +263,7 @@ zcp_children_iter(lua_State *state)
 	do {
 		err = dmu_dir_list_next(os,
 		    sizeof (childname) - (p - childname), p, NULL, &cursor);
-	} while (err == 0 && dataset_name_hidden(childname));
+	} while (err == 0 && zfs_dataset_name_hidden(childname));
 	dsl_dataset_rele(ds, FTAG);
 
 	if (err == ENOENT) {
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index e70207aa5..911bf884a 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -2252,23 +2252,6 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
 	return (err);
 }
 
-boolean_t
-dataset_name_hidden(const char *name)
-{
-	/*
-	 * Skip over datasets that are not visible in this zone,
-	 * internal datasets (which have a $ in their name), and
-	 * temporary datasets (which have a % in their name).
-	 */
-	if (strchr(name, '$') != NULL)
-		return (B_TRUE);
-	if (strchr(name, '%') != NULL)
-		return (B_TRUE);
-	if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL))
-		return (B_TRUE);
-	return (B_FALSE);
-}
-
 /*
  * inputs:
  * zc_name		name of filesystem
@@ -2308,7 +2291,7 @@ top:
 		    NULL, &zc->zc_cookie);
 		if (error == ENOENT)
 			error = SET_ERROR(ESRCH);
-	} while (error == 0 && dataset_name_hidden(zc->zc_name));
+	} while (error == 0 && zfs_dataset_name_hidden(zc->zc_name));
 	dmu_objset_rele(os, FTAG);
 
 	/*
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index b890bbaf9..a477c8669 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -2234,6 +2234,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 	dmu_tx_commit(tx);
 
 	zfsvfs->z_version = newvers;
+	os->os_version = newvers;
 
 	zfs_set_fuid_feature(zfsvfs);
 
@@ -2246,13 +2247,42 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 int
 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
 {
-	const char *pname;
-	int error = SET_ERROR(ENOENT);
+	uint64_t *cached_copy = NULL;
+
+	/*
+	 * Figure out where in the objset_t the cached copy would live, if it
+	 * is available for the requested property.
+	 */
+	if (os != NULL) {
+		switch (prop) {
+		case ZFS_PROP_VERSION:
+			cached_copy = &os->os_version;
+			break;
+		case ZFS_PROP_NORMALIZE:
+			cached_copy = &os->os_normalization;
+			break;
+		case ZFS_PROP_UTF8ONLY:
+			cached_copy = &os->os_utf8only;
+			break;
+		case ZFS_PROP_CASE:
+			cached_copy = &os->os_casesensitivity;
+			break;
+		default:
+			break;
+		}
+	}
+	if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+		*value = *cached_copy;
+		return (0);
+	}
 
 	/*
-	 * Look up the file system's value for the property.  For the
-	 * version property, we look up a slightly different string.
+	 * If the property wasn't cached, look up the file system's value for
+	 * the property. For the version property, we look up a slightly
+	 * different string.
 	 */
+	const char *pname;
+	int error = SET_ERROR(ENOENT);
 	if (prop == ZFS_PROP_VERSION)
 		pname = ZPL_VERSION_STR;
 	else
@@ -2284,6 +2314,15 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
 		}
 		error = 0;
 	}
+
+	/*
+	 * If one of the methods for getting the property value above worked,
+	 * copy it into the objset_t's cache.
+	 */
+	if (error == 0 && cached_copy != NULL) {
+		*cached_copy = *value;
+	}
+
 	return (error);
 }
author	Matthew Ahrens <[email protected]>	2018-07-10 13:49:50 -0400
committer	Brian Behlendorf <[email protected]>	2018-07-12 10:49:27 -0700
commit	2e5dc449c1a65e0b0bf730fd69c9b5804bd57ee8 (patch)
tree	8bbb44d8e88afa71ae1071c7a29369721711b0dd /module
parent	e4e94ca3154a9e58ac20e5409c003895ec859964 (diff)