aboutsummaryrefslogtreecommitdiffstats
path: root/module
diff options
context:
space:
mode:
author Rob N <[email protected]> 2023-11-18 08:25:53 +1100
committer GitHub <[email protected]> 2023-11-17 13:25:53 -0800
commit 92dc4ad83d95b88c87b03682f50bbbde20513244 (patch)
tree 310d947c00672d5a4348516bec25b6e838110cf9 /module
parent 6c6fae6fae5547482a9b6160125be4ce66ee8216 (diff)
Consider `dnode_t` allocations in dbuf cache size accounting
Entries in the dbuf cache contribute only the size of the dbuf data to the cache size. Attached "user" data is not counted. This can lead to the data currently "owned" by the cache consuming more memory than accounting appears to show. In some cases (eg a metadnode data block with all child dnode_t slots allocated), the actual size can be as much as 3x what the cache believes it to be. This is arguably correct behaviour, as the cache is only tracking the size of the dbuf data, not even the overhead of the dbuf_t. On the other hand, in the above case of dnodes, evicting cached metadnode dbufs is the only current way to reclaim the dnode objects, and can lead to the situation where the dbuf cache appears to be comfortably within its target memory window and yet is holding enormous amounts of slab memory that cannot be reclaimed. This commit adds a facility for a dbuf user to artificially inflate the apparent size of the dbuf for caching purposes. This at least allows for cache tuning to be adjusted to match something closer to the real memory overhead. metadnode dbufs carry a >1KiB allocation per dnode in their user data. This informs the dbuf cache machinery of that fact, allowing it to make better decisions when evicting dbufs. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Rob Norris <[email protected]> Closes #15511
Diffstat (limited to 'module')
-rw-r--r--module/zfs/dbuf.c63
-rw-r--r--module/zfs/dbuf_stats.c13
-rw-r--r--module/zfs/dnode.c19
3 files changed, 78 insertions, 17 deletions
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 0a179fffb..5cd97b9fa 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -569,6 +569,21 @@ dbuf_evict_user(dmu_buf_impl_t *db)
*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif
+ if (db->db_caching_status != DB_NO_CACHE) {
+ /*
+ * This is a cached dbuf, so the size of the user data is
+ * included in its cached amount. We adjust it here because the
+ * user data has already been detached from the dbuf, and the
+ * sync functions are not supposed to touch it (the dbuf might
+ * not exist anymore by the time the sync functions run).
+ */
+ uint64_t size = dbu->dbu_size;
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[db->db_caching_status].size, size, db);
+ if (db->db_caching_status == DB_DBUF_CACHE)
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
+ }
+
/*
* There are two eviction callbacks - one that we call synchronously
* and one that we invoke via a taskq. The async one is useful for
@@ -770,12 +785,12 @@ dbuf_evict_one(void)
if (db != NULL) {
multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls);
+ uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
(void) zfs_refcount_remove_many(
- &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
+ &dbuf_caches[DB_DBUF_CACHE].size, size, db);
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
- DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
- db->db.db_size);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
db->db_caching_status = DB_NO_CACHE;
dbuf_destroy(db);
@@ -3002,6 +3017,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
db->db_caching_status == DB_DBUF_METADATA_CACHE);
multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
+
+ ASSERT0(dmu_buf_user_size(&db->db));
(void) zfs_refcount_remove_many(
&dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);
@@ -3749,17 +3766,17 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_caching_status == DB_DBUF_METADATA_CACHE);
multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
+
+ uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
(void) zfs_refcount_remove_many(
- &dbuf_caches[db->db_caching_status].size,
- db->db.db_size, db);
+ &dbuf_caches[db->db_caching_status].size, size, db);
if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
DBUF_STAT_BUMPDOWN(metadata_cache_count);
} else {
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
- DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
- db->db.db_size);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
}
db->db_caching_status = DB_NO_CACHE;
}
@@ -3978,7 +3995,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
db->db_caching_status = dcs;
multilist_insert(&dbuf_caches[dcs].cache, db);
- uint64_t db_size = db->db.db_size;
+ uint64_t db_size = db->db.db_size +
+ dmu_buf_user_size(&db->db);
size = zfs_refcount_add_many(
&dbuf_caches[dcs].size, db_size, db);
uint8_t db_level = db->db_level;
@@ -4074,6 +4092,35 @@ dmu_buf_get_user(dmu_buf_t *db_fake)
return (db->db_user);
}
+uint64_t
+dmu_buf_user_size(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ if (db->db_user == NULL)
+ return (0);
+ return (atomic_load_64(&db->db_user->dbu_size));
+}
+
+void
+dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+ ASSERT3P(db->db_user, !=, NULL);
+ ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);
+ atomic_add_64(&db->db_user->dbu_size, nadd);
+}
+
+void
+dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+ ASSERT3P(db->db_user, !=, NULL);
+ ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);
+ atomic_sub_64(&db->db_user->dbu_size, nsub);
+}
+
void
dmu_buf_user_evict_wait(void)
{
diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c
index e5dc2df30..ccee8997e 100644
--- a/module/zfs/dbuf_stats.c
+++ b/module/zfs/dbuf_stats.c
@@ -46,14 +46,14 @@ static int
dbuf_stats_hash_table_headers(char *buf, size_t size)
{
(void) snprintf(buf, size,
- "%-96s | %-119s | %s\n"
- "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | "
+ "%-105s | %-119s | %s\n"
+ "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-8s %-5s %-5s %-7s %3s | "
"%-5s %-5s %-9s %-6s %-8s %-12s "
"%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | "
"%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n",
"dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
- "blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc",
- "list", "atype", "flags", "count", "asize", "access",
+ "blkid", "offset", "dbsize", "usize", "meta", "state", "dbholds",
+ "dbc", "list", "atype", "flags", "count", "asize", "access",
"mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
"l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs",
"bsize", "lvls", "dholds", "blocks", "dsize");
@@ -75,8 +75,8 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
__dmu_object_info_from_dnode(dn, &doi);
nwritten = snprintf(buf, size,
- "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d "
- "%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
+ "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-8llu "
+ "%-5d %-5d %-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
"%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | "
"%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n",
/* dmu_buf_impl_t */
@@ -87,6 +87,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
(longlong_t)db->db_blkid,
(u_longlong_t)db->db.db_offset,
(u_longlong_t)db->db.db_size,
+ (u_longlong_t)dmu_buf_user_size(&db->db),
!!dbuf_is_metadata(db),
db->db_state,
(ulong_t)zfs_refcount_count(&db->db_holds),
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 79fd02dcb..029d9df8a 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1237,9 +1237,11 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
return (B_TRUE);
}
-static void
+static uint_t
dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
{
+ uint_t reclaimed = 0;
+
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
for (int i = idx; i < idx + slots; i++) {
@@ -1251,8 +1253,11 @@ dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
dnode_destroy(dnh->dnh_dnode);
dnh->dnh_dnode = DN_SLOT_FREE;
+ reclaimed++;
}
}
+
+ return (reclaimed);
}
void
@@ -1565,6 +1570,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
} else {
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
+ dmu_buf_add_user_size(&db->db,
+ sizeof (dnode_t));
}
}
@@ -1622,8 +1629,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
* to be freed. Single slot dnodes can be safely
* re-purposed as a performance optimization.
*/
- if (slots > 1)
- dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+ if (slots > 1) {
+ uint_t reclaimed =
+ dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+ if (reclaimed > 0)
+ dmu_buf_sub_user_size(&db->db,
+ reclaimed * sizeof (dnode_t));
+ }
dnh = &dnc->dnc_children[idx];
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
@@ -1631,6 +1643,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
} else {
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
+ dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
}
mutex_enter(&dn->dn_mtx);