author     Alexander Motin <[email protected]>        2024-11-10 17:29:25 -0500
committer  Brian Behlendorf <[email protected]>     2024-11-15 15:04:11 -0800
commit     fd6e8c1d2a5c8d95d509b691b81c66597e354981 (patch)
tree       c0e097909e2d2bc381ee2c6ab0ab824c9248ce14 /module
parent     309ce6303fa7e455e9adee0674a29685bdbbcc8d (diff)
BRT: Rework structures and locks to be per-vdev
While the block cloning operation has been per-vdev from the beginning, before this change most of its data was protected by two pool-wide locks, which created a lot of lock contention in many workloads.

This change makes most of the block cloning data structures per-vdev, which allows them to be locked separately. The only pool-wide lock now is spa_brt_lock, which protects the array of per-vdev pointers and is in most cases taken as reader. The per-vdev locks are also split into three: bv_pending_lock protects the AVL tree of pending operations in open context, bv_mos_entries_lock protects the BRT ZAP object while it is being prefetched, and bv_lock protects the rest of the per-vdev context during the TXG commit process.

There should be no functional difference aside from some optimizations.

Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Pawel Jakub Dawidek <[email protected]>
Reviewed-by: Brian Atkinson <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Sponsored by: iXsystems, Inc.
Closes #16740
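For orientation, the sketch below condenses the locking layout the message describes. It is illustrative only, not the literal header change (the structure definitions live in headers outside this module-only diff); the field names are taken from the brt.c hunks below, and the kernel types (krwlock_t, kmutex_t, avl_tree_t, dnode_t, TXG_SIZE) are the usual OpenZFS ones.

	/* Illustrative sketch; not the literal definitions from the patch. */
	typedef struct brt_vdev {
		krwlock_t	bv_lock;		/* rest of per-vdev state; TXG commit */
		krwlock_t	bv_mos_entries_lock;	/* guards bv_mos_entries vs. prefetch */
		uint64_t	bv_mos_entries;		/* BRT ZAP object number */
		dnode_t		*bv_mos_entries_dnode;
		kmutex_t	bv_pending_lock;	/* pending ops in open context */
		avl_tree_t	bv_pending_tree[TXG_SIZE];
		avl_tree_t	bv_tree;		/* in-memory entries, under bv_lock */
		/* ... counters, entcount array, dirty bitmap, etc. ... */
	} brt_vdev_t;

	/* In spa_t -- the only remaining pool-wide BRT state: */
	krwlock_t	spa_brt_lock;		/* mostly taken as reader */
	brt_vdev_t	**spa_brt_vdevs;	/* array of per-vdev pointers */
	uint64_t	spa_brt_nvdevs;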
Diffstat (limited to 'module')
-rw-r--r--   module/zfs/brt.c        783
-rw-r--r--   module/zfs/spa_misc.c    17
2 files changed, 338 insertions, 462 deletions
diff --git a/module/zfs/brt.c b/module/zfs/brt.c
index ea8c0735c..b1894e3fe 100644
--- a/module/zfs/brt.c
+++ b/module/zfs/brt.c
@@ -317,23 +317,24 @@ struct {
static int brt_entry_compare(const void *x1, const void *x2);
static int brt_pending_entry_compare(const void *x1, const void *x2);
+static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs);
static void
-brt_rlock(brt_t *brt)
+brt_rlock(spa_t *spa)
{
- rw_enter(&brt->brt_lock, RW_READER);
+ rw_enter(&spa->spa_brt_lock, RW_READER);
}
static void
-brt_wlock(brt_t *brt)
+brt_wlock(spa_t *spa)
{
- rw_enter(&brt->brt_lock, RW_WRITER);
+ rw_enter(&spa->spa_brt_lock, RW_WRITER);
}
static void
-brt_unlock(brt_t *brt)
+brt_unlock(spa_t *spa)
{
- rw_exit(&brt->brt_lock);
+ rw_exit(&spa->spa_brt_lock);
}
static uint16_t
@@ -394,14 +395,15 @@ brt_vdev_dump(brt_vdev_t *brtvd)
{
uint64_t idx;
+ uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
- "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
+ "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",
(u_longlong_t)brtvd->bv_vdevid,
brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
(u_longlong_t)brtvd->bv_size,
(u_longlong_t)brtvd->bv_totalcount,
- (u_longlong_t)brtvd->bv_nblocks,
- (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
+ (u_longlong_t)nblocks,
+ (size_t)BT_SIZEOFMAP(nblocks));
if (brtvd->bv_totalcount > 0) {
zfs_dbgmsg(" entcounts:");
for (idx = 0; idx < brtvd->bv_size; idx++) {
@@ -415,51 +417,59 @@ brt_vdev_dump(brt_vdev_t *brtvd)
if (brtvd->bv_entcount_dirty) {
char *bitmap;
- bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
- for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
+ bitmap = kmem_alloc(nblocks + 1, KM_SLEEP);
+ for (idx = 0; idx < nblocks; idx++) {
bitmap[idx] =
BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
}
bitmap[idx] = '\0';
zfs_dbgmsg(" dirty: %s", bitmap);
- kmem_free(bitmap, brtvd->bv_nblocks + 1);
+ kmem_free(bitmap, nblocks + 1);
}
}
#endif
static brt_vdev_t *
-brt_vdev(brt_t *brt, uint64_t vdevid)
+brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc)
{
- brt_vdev_t *brtvd;
+ brt_vdev_t *brtvd = NULL;
- ASSERT(RW_LOCK_HELD(&brt->brt_lock));
-
- if (vdevid < brt->brt_nvdevs) {
- brtvd = &brt->brt_vdevs[vdevid];
- } else {
- brtvd = NULL;
+ brt_rlock(spa);
+ if (vdevid < spa->spa_brt_nvdevs) {
+ brtvd = spa->spa_brt_vdevs[vdevid];
+ } else if (alloc) {
+ /* New VDEV was added. */
+ brt_unlock(spa);
+ brt_wlock(spa);
+ if (vdevid >= spa->spa_brt_nvdevs)
+ brt_vdevs_expand(spa, vdevid + 1);
+ brtvd = spa->spa_brt_vdevs[vdevid];
}
-
+ brt_unlock(spa);
return (brtvd);
}
static void
-brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
char name[64];
- ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT(brtvd->bv_initiated);
ASSERT0(brtvd->bv_mos_brtvdev);
ASSERT0(brtvd->bv_mos_entries);
ASSERT(brtvd->bv_entcount != NULL);
ASSERT(brtvd->bv_size > 0);
ASSERT(brtvd->bv_bitmap != NULL);
- ASSERT(brtvd->bv_nblocks > 0);
- brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
+ uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0,
ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
- VERIFY(brtvd->bv_mos_entries != 0);
+ VERIFY(mos_entries != 0);
+ VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd,
+ &brtvd->bv_mos_entries_dnode));
+ rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
+ brtvd->bv_mos_entries = mos_entries;
+ rw_exit(&brtvd->bv_mos_entries_lock);
BRT_DEBUG("MOS entries created, object=%llu",
(u_longlong_t)brtvd->bv_mos_entries);
@@ -468,7 +478,7 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
* We will keep array size (bv_size) and cummulative count for all
* bv_entcount[]s (bv_totalcount) in the bonus buffer.
*/
- brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
+ brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset,
DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
VERIFY(brtvd->bv_mos_brtvdev != 0);
@@ -477,66 +487,62 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
(u_longlong_t)brtvd->bv_vdevid);
- VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
+ VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name,
sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
BRT_DEBUG("Pool directory object created, object=%s", name);
- spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
+ spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
}
static void
-brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
+brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd)
{
vdev_t *vd;
uint16_t *entcount;
ulong_t *bitmap;
- uint64_t nblocks, size;
+ uint64_t nblocks, onblocks, size;
- ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
- spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
- vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
- size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
- spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, brtvd->bv_vdevid);
+ size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1;
+ spa_config_exit(spa, SCL_VDEV, FTAG);
entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
if (!brtvd->bv_initiated) {
+ ASSERT0(avl_numnodes(&brtvd->bv_tree));
ASSERT0(brtvd->bv_size);
ASSERT(brtvd->bv_entcount == NULL);
ASSERT(brtvd->bv_bitmap == NULL);
- ASSERT0(brtvd->bv_nblocks);
-
- avl_create(&brtvd->bv_tree, brt_entry_compare,
- sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
} else {
ASSERT(brtvd->bv_size > 0);
ASSERT(brtvd->bv_entcount != NULL);
ASSERT(brtvd->bv_bitmap != NULL);
- ASSERT(brtvd->bv_nblocks > 0);
/*
* TODO: Allow vdev shrinking. We only need to implement
* shrinking the on-disk BRT VDEV object.
- * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
- * size, tx);
+ * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
+ * offset, size, tx);
*/
ASSERT3U(brtvd->bv_size, <=, size);
memcpy(entcount, brtvd->bv_entcount,
sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
- memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
- BT_SIZEOFMAP(brtvd->bv_nblocks)));
vmem_free(brtvd->bv_entcount,
sizeof (entcount[0]) * brtvd->bv_size);
- kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
+ onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
+ memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
+ BT_SIZEOFMAP(onblocks)));
+ kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks));
}
brtvd->bv_size = size;
brtvd->bv_entcount = entcount;
brtvd->bv_bitmap = bitmap;
- brtvd->bv_nblocks = nblocks;
if (!brtvd->bv_initiated) {
brtvd->bv_need_byteswap = FALSE;
brtvd->bv_initiated = TRUE;
@@ -546,7 +552,7 @@ brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
}
static void
-brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
+brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd)
{
char name[64];
dmu_buf_t *db;
@@ -555,26 +561,27 @@ brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
(u_longlong_t)brtvd->bv_vdevid);
- error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
- sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ name, sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
if (error != 0)
return;
ASSERT(brtvd->bv_mos_brtvdev != 0);
- error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
+ error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
+ FTAG, &db);
ASSERT0(error);
if (error != 0)
return;
bvphys = db->db_data;
- if (brt->brt_rangesize == 0) {
- brt->brt_rangesize = bvphys->bvp_rangesize;
+ if (spa->spa_brt_rangesize == 0) {
+ spa->spa_brt_rangesize = bvphys->bvp_rangesize;
} else {
- ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
+ ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize);
}
ASSERT(!brtvd->bv_initiated);
- brt_vdev_realloc(brt, brtvd);
+ brt_vdev_realloc(spa, brtvd);
/* TODO: We don't support VDEV shrinking. */
ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
@@ -582,20 +589,22 @@ brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
/*
* If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
*/
- error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
+ error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
ASSERT0(error);
+ ASSERT(bvphys->bvp_mos_entries != 0);
+ VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd,
+ &brtvd->bv_mos_entries_dnode));
+ rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
- ASSERT(brtvd->bv_mos_entries != 0);
+ rw_exit(&brtvd->bv_mos_entries_lock);
brtvd->bv_need_byteswap =
(bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
brtvd->bv_totalcount = bvphys->bvp_totalcount;
brtvd->bv_usedspace = bvphys->bvp_usedspace;
brtvd->bv_savedspace = bvphys->bvp_savedspace;
- brt->brt_usedspace += brtvd->bv_usedspace;
- brt->brt_savedspace += brtvd->bv_savedspace;
dmu_buf_rele(db, FTAG);
@@ -605,107 +614,120 @@ brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
}
static void
-brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
+brt_vdev_dealloc(brt_vdev_t *brtvd)
{
-
- ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
ASSERT(brtvd->bv_initiated);
+ ASSERT0(avl_numnodes(&brtvd->bv_tree));
vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
brtvd->bv_entcount = NULL;
- kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
+ uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
+ kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks));
brtvd->bv_bitmap = NULL;
- ASSERT0(avl_numnodes(&brtvd->bv_tree));
- avl_destroy(&brtvd->bv_tree);
brtvd->bv_size = 0;
- brtvd->bv_nblocks = 0;
brtvd->bv_initiated = FALSE;
BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
}
static void
-brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
char name[64];
uint64_t count;
- dmu_buf_t *db;
- brt_vdev_phys_t *bvphys;
- ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT(brtvd->bv_initiated);
ASSERT(brtvd->bv_mos_brtvdev != 0);
ASSERT(brtvd->bv_mos_entries != 0);
+ ASSERT0(brtvd->bv_totalcount);
+ ASSERT0(brtvd->bv_usedspace);
+ ASSERT0(brtvd->bv_savedspace);
- VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
- VERIFY0(count);
- VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
- BRT_DEBUG("MOS entries destroyed, object=%llu",
- (u_longlong_t)brtvd->bv_mos_entries);
+ uint64_t mos_entries = brtvd->bv_mos_entries;
+ rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
brtvd->bv_mos_entries = 0;
+ rw_exit(&brtvd->bv_mos_entries_lock);
+ dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
+ brtvd->bv_mos_entries_dnode = NULL;
+ ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count));
+ ASSERT0(count);
+ VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx));
+ BRT_DEBUG("MOS entries destroyed, object=%llu",
+ (u_longlong_t)mos_entries);
- VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
- bvphys = db->db_data;
- ASSERT0(bvphys->bvp_totalcount);
- ASSERT0(bvphys->bvp_usedspace);
- ASSERT0(bvphys->bvp_savedspace);
- dmu_buf_rele(db, FTAG);
-
- VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
+ VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
+ tx));
BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
(u_longlong_t)brtvd->bv_mos_brtvdev);
brtvd->bv_mos_brtvdev = 0;
snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
(u_longlong_t)brtvd->bv_vdevid);
- VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
+ VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ name, tx));
BRT_DEBUG("Pool directory object removed, object=%s", name);
- brt_vdev_dealloc(brt, brtvd);
+ brtvd->bv_meta_dirty = FALSE;
+
+ rw_enter(&brtvd->bv_lock, RW_WRITER);
+ brt_vdev_dealloc(brtvd);
+ rw_exit(&brtvd->bv_lock);
- spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
+ spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
}
static void
-brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
+brt_vdevs_expand(spa_t *spa, uint64_t nvdevs)
{
- brt_vdev_t *brtvd, *vdevs;
- uint64_t vdevid;
+ brt_vdev_t **vdevs;
- ASSERT(RW_WRITE_HELD(&brt->brt_lock));
- ASSERT3U(nvdevs, >, brt->brt_nvdevs);
+ ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock));
+ ASSERT3U(nvdevs, >, spa->spa_brt_nvdevs);
- vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
- if (brt->brt_nvdevs > 0) {
- ASSERT(brt->brt_vdevs != NULL);
+ vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP);
+ if (spa->spa_brt_nvdevs > 0) {
+ ASSERT(spa->spa_brt_vdevs != NULL);
- memcpy(vdevs, brt->brt_vdevs,
- sizeof (brt_vdev_t) * brt->brt_nvdevs);
- kmem_free(brt->brt_vdevs,
- sizeof (brt_vdev_t) * brt->brt_nvdevs);
+ memcpy(vdevs, spa->spa_brt_vdevs,
+ sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
+ kmem_free(spa->spa_brt_vdevs,
+ sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
}
- for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
- brtvd = &vdevs[vdevid];
+ spa->spa_brt_vdevs = vdevs;
+ for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) {
+ brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP);
+ rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL);
brtvd->bv_vdevid = vdevid;
brtvd->bv_initiated = FALSE;
+ rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL);
+ avl_create(&brtvd->bv_tree, brt_entry_compare,
+ sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
+ for (int i = 0; i < TXG_SIZE; i++) {
+ avl_create(&brtvd->bv_pending_tree[i],
+ brt_pending_entry_compare,
+ sizeof (brt_pending_entry_t),
+ offsetof(brt_pending_entry_t, bpe_node));
+ }
+ mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL);
+ spa->spa_brt_vdevs[vdevid] = brtvd;
}
BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
- (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);
-
- brt->brt_vdevs = vdevs;
- brt->brt_nvdevs = nvdevs;
+ (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs);
+ spa->spa_brt_nvdevs = nvdevs;
}
static boolean_t
-brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
+brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre)
{
uint64_t idx;
- ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+ ASSERT(RW_LOCK_HELD(&brtvd->bv_lock));
- idx = bre->bre_offset / brt->brt_rangesize;
+ idx = bre->bre_offset / spa->spa_brt_rangesize;
if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
/* VDEV wasn't expanded. */
return (brt_vdev_entcount_get(brtvd, idx) > 0);
@@ -715,30 +737,27 @@ brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
}
static void
-brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
- uint64_t dsize)
+brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
+ uint64_t dsize, uint64_t count)
{
uint64_t idx;
- ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+ ASSERT(RW_LOCK_HELD(&brtvd->bv_lock));
ASSERT(brtvd != NULL);
ASSERT(brtvd->bv_entcount != NULL);
- brt->brt_savedspace += dsize;
- brtvd->bv_savedspace += dsize;
+ brtvd->bv_savedspace += dsize * count;
brtvd->bv_meta_dirty = TRUE;
- if (bre->bre_refcount > 1) {
+ if (bre->bre_refcount > 0)
return;
- }
- brt->brt_usedspace += dsize;
brtvd->bv_usedspace += dsize;
- idx = bre->bre_offset / brt->brt_rangesize;
+ idx = bre->bre_offset / spa->spa_brt_rangesize;
if (idx >= brtvd->bv_size) {
/* VDEV has been expanded. */
- brt_vdev_realloc(brt, brtvd);
+ brt_vdev_realloc(spa, brtvd);
}
ASSERT3U(idx, <, brtvd->bv_size);
@@ -748,35 +767,27 @@ brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
brtvd->bv_entcount_dirty = TRUE;
idx = idx / BRT_BLOCKSIZE / 8;
BT_SET(brtvd->bv_bitmap, idx);
-
-#ifdef ZFS_DEBUG
- if (zfs_flags & ZFS_DEBUG_BRT)
- brt_vdev_dump(brtvd);
-#endif
}
static void
-brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
+brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
uint64_t dsize)
{
uint64_t idx;
- ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
ASSERT(brtvd != NULL);
ASSERT(brtvd->bv_entcount != NULL);
- brt->brt_savedspace -= dsize;
brtvd->bv_savedspace -= dsize;
brtvd->bv_meta_dirty = TRUE;
- if (bre->bre_refcount > 0) {
+ if (bre->bre_refcount > 0)
return;
- }
- brt->brt_usedspace -= dsize;
brtvd->bv_usedspace -= dsize;
- idx = bre->bre_offset / brt->brt_rangesize;
+ idx = bre->bre_offset / spa->spa_brt_rangesize;
ASSERT3U(idx, <, brtvd->bv_size);
ASSERT(brtvd->bv_totalcount > 0);
@@ -785,15 +796,10 @@ brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
brtvd->bv_entcount_dirty = TRUE;
idx = idx / BRT_BLOCKSIZE / 8;
BT_SET(brtvd->bv_bitmap, idx);
-
-#ifdef ZFS_DEBUG
- if (zfs_flags & ZFS_DEBUG_BRT)
- brt_vdev_dump(brtvd);
-#endif
}
static void
-brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
dmu_buf_t *db;
brt_vdev_phys_t *bvphys;
@@ -802,16 +808,18 @@ brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
ASSERT(brtvd->bv_mos_brtvdev != 0);
ASSERT(dmu_tx_is_syncing(tx));
- VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
+ VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
+ FTAG, &db));
if (brtvd->bv_entcount_dirty) {
/*
* TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
*/
- dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
+ dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
brtvd->bv_entcount, tx);
- memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
+ uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
+ memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks));
brtvd->bv_entcount_dirty = FALSE;
}
@@ -825,7 +833,7 @@ brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
}
bvphys->bvp_totalcount = brtvd->bv_totalcount;
- bvphys->bvp_rangesize = brt->brt_rangesize;
+ bvphys->bvp_rangesize = spa->spa_brt_rangesize;
bvphys->bvp_usedspace = brtvd->bv_usedspace;
bvphys->bvp_savedspace = brtvd->bv_savedspace;
dmu_buf_rele(db, FTAG);
@@ -834,47 +842,49 @@ brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
}
static void
-brt_vdevs_alloc(brt_t *brt, boolean_t load)
+brt_vdevs_alloc(spa_t *spa, boolean_t load)
{
- brt_vdev_t *brtvd;
- uint64_t vdevid;
-
- brt_wlock(brt);
-
- brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);
+ brt_wlock(spa);
+ brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children);
if (load) {
- for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
- brtvd = &brt->brt_vdevs[vdevid];
+ for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs;
+ vdevid++) {
+ brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
+ rw_enter(&brtvd->bv_lock, RW_WRITER);
ASSERT(brtvd->bv_entcount == NULL);
-
- brt_vdev_load(brt, brtvd);
+ brt_vdev_load(spa, brtvd);
+ rw_exit(&brtvd->bv_lock);
}
}
- if (brt->brt_rangesize == 0) {
- brt->brt_rangesize = BRT_RANGESIZE;
+ if (spa->spa_brt_rangesize == 0) {
+ spa->spa_brt_rangesize = BRT_RANGESIZE;
}
-
- brt_unlock(brt);
+ brt_unlock(spa);
}
static void
-brt_vdevs_free(brt_t *brt)
+brt_vdevs_free(spa_t *spa)
{
- brt_vdev_t *brtvd;
- uint64_t vdevid;
-
- brt_wlock(brt);
-
- for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
- brtvd = &brt->brt_vdevs[vdevid];
+ for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
+ brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
+ rw_enter(&brtvd->bv_lock, RW_WRITER);
if (brtvd->bv_initiated)
- brt_vdev_dealloc(brt, brtvd);
+ brt_vdev_dealloc(brtvd);
+ rw_exit(&brtvd->bv_lock);
+ rw_destroy(&brtvd->bv_lock);
+ if (brtvd->bv_mos_entries != 0)
+ dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
+ rw_destroy(&brtvd->bv_mos_entries_lock);
+ avl_destroy(&brtvd->bv_tree);
+ for (int i = 0; i < TXG_SIZE; i++)
+ avl_destroy(&brtvd->bv_pending_tree[i]);
+ mutex_destroy(&brtvd->bv_pending_lock);
+ kmem_free(brtvd, sizeof (*brtvd));
}
- kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);
-
- brt_unlock(brt);
+ kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) *
+ spa->spa_brt_nvdevs);
}
static void
@@ -897,53 +907,27 @@ brt_entry_compare(const void *x1, const void *x2)
}
static int
-brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
+brt_entry_lookup(spa_t *spa, brt_vdev_t *brtvd, brt_entry_t *bre, krw_t rw)
{
- uint64_t mos_entries;
- int error;
+ ASSERT(RW_LOCK_HELD(&brtvd->bv_lock));
- ASSERT(RW_LOCK_HELD(&brt->brt_lock));
-
- if (!brt_vdev_lookup(brt, brtvd, bre))
+ if (!brt_vdev_lookup(spa, brtvd, bre))
return (SET_ERROR(ENOENT));
- /*
- * Remember mos_entries object number. After we reacquire the BRT lock,
- * the brtvd pointer may be invalid.
- */
- mos_entries = brtvd->bv_mos_entries;
- if (mos_entries == 0)
+ if (brtvd->bv_mos_entries == 0)
return (SET_ERROR(ENOENT));
- brt_unlock(brt);
+ rw_exit(&brtvd->bv_lock);
- error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
- BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount);
+ int error = zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
+ &bre->bre_offset, BRT_KEY_WORDS, 1,
+ sizeof (bre->bre_refcount), &bre->bre_refcount);
- brt_wlock(brt);
+ rw_enter(&brtvd->bv_lock, rw);
return (error);
}
-static void
-brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
-{
- brt_vdev_t *brtvd;
- uint64_t mos_entries = 0;
-
- brt_rlock(brt);
- brtvd = brt_vdev(brt, vdevid);
- if (brtvd != NULL)
- mos_entries = brtvd->bv_mos_entries;
- brt_unlock(brt);
-
- if (mos_entries == 0)
- return;
-
- (void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
- (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
-}
-
/*
* Return TRUE if we _can_ have BRT entry for this bp. It might be false
* positive, but gives us quick answer if we should look into BRT, which
@@ -952,25 +936,24 @@ brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
boolean_t
brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
{
- brt_t *brt = spa->spa_brt;
- brt_vdev_t *brtvd;
brt_entry_t bre_search;
boolean_t mayexists = FALSE;
uint64_t vdevid;
- brt_entry_fill(bp, &bre_search, &vdevid);
+ if (spa->spa_brt_nvdevs == 0)
+ return (B_FALSE);
- brt_rlock(brt);
+ brt_entry_fill(bp, &bre_search, &vdevid);
- brtvd = brt_vdev(brt, vdevid);
- if (brtvd != NULL && brtvd->bv_initiated) {
- if (!avl_is_empty(&brtvd->bv_tree) ||
- brt_vdev_lookup(brt, brtvd, &bre_search)) {
- mayexists = TRUE;
- }
- }
+ brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
+ if (brtvd == NULL)
+ return (FALSE);
- brt_unlock(brt);
+ rw_enter(&brtvd->bv_lock, RW_READER);
+ if (brtvd->bv_initiated && (!avl_is_empty(&brtvd->bv_tree) ||
+ brt_vdev_lookup(spa, brtvd, &bre_search)))
+ mayexists = TRUE;
+ rw_exit(&brtvd->bv_lock);
return (mayexists);
}
@@ -978,46 +961,44 @@ brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
uint64_t
brt_get_dspace(spa_t *spa)
{
- brt_t *brt = spa->spa_brt;
-
- if (brt == NULL)
+ if (spa->spa_brt_nvdevs == 0)
return (0);
- return (brt->brt_savedspace);
+ brt_rlock(spa);
+ uint64_t s = 0;
+ for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
+ s += spa->spa_brt_vdevs[vdevid]->bv_savedspace;
+ brt_unlock(spa);
+ return (s);
}
uint64_t
brt_get_used(spa_t *spa)
{
- brt_t *brt = spa->spa_brt;
-
- if (brt == NULL)
+ if (spa->spa_brt_nvdevs == 0)
return (0);
- return (brt->brt_usedspace);
+ brt_rlock(spa);
+ uint64_t s = 0;
+ for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
+ s += spa->spa_brt_vdevs[vdevid]->bv_usedspace;
+ brt_unlock(spa);
+ return (s);
}
uint64_t
brt_get_saved(spa_t *spa)
{
- brt_t *brt = spa->spa_brt;
-
- if (brt == NULL)
- return (0);
-
- return (brt->brt_savedspace);
+ return (brt_get_dspace(spa));
}
uint64_t
brt_get_ratio(spa_t *spa)
{
- brt_t *brt = spa->spa_brt;
-
- if (brt->brt_usedspace == 0)
+ uint64_t used = brt_get_used(spa);
+ if (used == 0)
return (100);
-
- return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
- brt->brt_usedspace);
+ return ((used + brt_get_saved(spa)) * 100 / used);
}
static int
@@ -1148,82 +1129,58 @@ brt_entry_free(brt_entry_t *bre)
}
static void
-brt_entry_addref(brt_t *brt, const blkptr_t *bp)
+brt_entry_addref(spa_t *spa, brt_vdev_t *brtvd, const blkptr_t *bp,
+ uint64_t count)
{
- brt_vdev_t *brtvd;
brt_entry_t *bre, *racebre;
brt_entry_t bre_search;
avl_index_t where;
uint64_t vdevid;
int error;
- ASSERT(!RW_WRITE_HELD(&brt->brt_lock));
-
brt_entry_fill(bp, &bre_search, &vdevid);
+ ASSERT3U(brtvd->bv_vdevid, ==, vdevid);
- brt_wlock(brt);
-
- brtvd = brt_vdev(brt, vdevid);
- if (brtvd == NULL) {
- ASSERT3U(vdevid, >=, brt->brt_nvdevs);
-
- /* New VDEV was added. */
- brt_vdevs_expand(brt, vdevid + 1);
- brtvd = brt_vdev(brt, vdevid);
- }
- ASSERT(brtvd != NULL);
+ rw_enter(&brtvd->bv_lock, RW_WRITER);
if (!brtvd->bv_initiated)
- brt_vdev_realloc(brt, brtvd);
+ brt_vdev_realloc(spa, brtvd);
bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
if (bre != NULL) {
BRTSTAT_BUMP(brt_addref_entry_in_memory);
} else {
- /*
- * brt_entry_lookup() may drop the BRT (read) lock and
- * reacquire it (write).
- */
- error = brt_entry_lookup(brt, brtvd, &bre_search);
+ /* brt_entry_lookup() may drop the lock */
+ error = brt_entry_lookup(spa, brtvd, &bre_search, RW_WRITER);
/* bre_search now contains correct bre_refcount */
ASSERT(error == 0 || error == ENOENT);
if (error == 0)
BRTSTAT_BUMP(brt_addref_entry_on_disk);
else
BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
- /*
- * When the BRT lock was dropped, brt_vdevs[] may have been
- * expanded and reallocated, we need to update brtvd's pointer.
- */
- brtvd = brt_vdev(brt, vdevid);
- ASSERT(brtvd != NULL);
racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
if (racebre == NULL) {
bre = brt_entry_alloc(&bre_search);
- ASSERT(RW_WRITE_HELD(&brt->brt_lock));
avl_insert(&brtvd->bv_tree, bre, where);
- brt->brt_nentries++;
} else {
/*
- * The entry was added when the BRT lock was dropped in
+ * The entry was added when the lock was dropped in
* brt_entry_lookup().
*/
BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
bre = racebre;
}
}
- bre->bre_refcount++;
- brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
+ brt_vdev_addref(spa, brtvd, bre, bp_get_dsize(spa, bp), count);
+ bre->bre_refcount += count;
- brt_unlock(brt);
+ rw_exit(&brtvd->bv_lock);
}
/* Return TRUE if block should be freed immediately. */
boolean_t
brt_entry_decref(spa_t *spa, const blkptr_t *bp)
{
- brt_t *brt = spa->spa_brt;
- brt_vdev_t *brtvd;
brt_entry_t *bre, *racebre;
brt_entry_t bre_search;
avl_index_t where;
@@ -1232,11 +1189,11 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp)
brt_entry_fill(bp, &bre_search, &vdevid);
- brt_wlock(brt);
-
- brtvd = brt_vdev(brt, vdevid);
+ brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
ASSERT(brtvd != NULL);
+ rw_enter(&brtvd->bv_lock, RW_WRITER);
+ ASSERT(brtvd->bv_initiated);
bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
if (bre != NULL) {
BRTSTAT_BUMP(brt_decref_entry_in_memory);
@@ -1245,18 +1202,10 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp)
BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
}
- /*
- * brt_entry_lookup() may drop the BRT lock and reacquire it.
- */
- error = brt_entry_lookup(brt, brtvd, &bre_search);
+ /* brt_entry_lookup() may drop the lock. */
+ error = brt_entry_lookup(spa, brtvd, &bre_search, RW_WRITER);
/* bre_search now contains correct bre_refcount */
ASSERT(error == 0 || error == ENOENT);
- /*
- * When the BRT lock was dropped, brt_vdevs[] may have been expanded
- * and reallocated, we need to update brtvd's pointer.
- */
- brtvd = brt_vdev(brt, vdevid);
- ASSERT(brtvd != NULL);
if (error == ENOENT) {
BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
@@ -1267,7 +1216,7 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp)
racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
if (racebre != NULL) {
/*
- * The entry was added when the BRT lock was dropped in
+ * The entry was added when the lock was dropped in
* brt_entry_lookup().
*/
BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
@@ -1277,21 +1226,19 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp)
BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
bre = brt_entry_alloc(&bre_search);
- ASSERT(RW_WRITE_HELD(&brt->brt_lock));
avl_insert(&brtvd->bv_tree, bre, where);
- brt->brt_nentries++;
out:
if (bre == NULL) {
/*
* This is a free of a regular (not cloned) block.
*/
- brt_unlock(brt);
+ rw_exit(&brtvd->bv_lock);
BRTSTAT_BUMP(brt_decref_no_entry);
return (B_TRUE);
}
if (bre->bre_refcount == 0) {
- brt_unlock(brt);
+ rw_exit(&brtvd->bv_lock);
BRTSTAT_BUMP(brt_decref_free_data_now);
return (B_TRUE);
}
@@ -1302,9 +1249,9 @@ out:
BRTSTAT_BUMP(brt_decref_free_data_later);
else
BRTSTAT_BUMP(brt_decref_entry_still_referenced);
- brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
+ brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp));
- brt_unlock(brt);
+ rw_exit(&brtvd->bv_lock);
return (B_FALSE);
}
@@ -1312,22 +1259,20 @@ out:
uint64_t
brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
{
- brt_t *brt = spa->spa_brt;
- brt_vdev_t *brtvd;
brt_entry_t bre_search, *bre;
uint64_t vdevid, refcnt;
int error;
brt_entry_fill(bp, &bre_search, &vdevid);
- brt_rlock(brt);
-
- brtvd = brt_vdev(brt, vdevid);
+ brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
ASSERT(brtvd != NULL);
+ rw_enter(&brtvd->bv_lock, RW_READER);
+ ASSERT(brtvd->bv_initiated);
bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
if (bre == NULL) {
- error = brt_entry_lookup(brt, brtvd, &bre_search);
+ error = brt_entry_lookup(spa, brtvd, &bre_search, RW_READER);
ASSERT(error == 0 || error == ENOENT);
if (error == ENOENT)
refcnt = 0;
@@ -1335,25 +1280,24 @@ brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
refcnt = bre_search.bre_refcount;
} else
refcnt = bre->bre_refcount;
+ rw_exit(&brtvd->bv_lock);
- brt_unlock(brt);
return (refcnt);
}
static void
-brt_prefetch(brt_t *brt, const blkptr_t *bp)
+brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp)
{
- brt_entry_t bre;
- uint64_t vdevid;
-
- ASSERT(bp != NULL);
-
- if (!brt_zap_prefetch)
+ if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0)
return;
- brt_entry_fill(bp, &bre, &vdevid);
-
- brt_entry_prefetch(brt, vdevid, &bre);
+ rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
+ if (brtvd->bv_mos_entries != 0) {
+ uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
+ (void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
+ &offset, BRT_KEY_WORDS);
+ }
+ rw_exit(&brtvd->bv_mos_entries_lock);
}
static int
@@ -1363,15 +1307,10 @@ brt_pending_entry_compare(const void *x1, const void *x2)
const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
int cmp;
- cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
- DVA_GET_VDEV(&bp2->blk_dva[0]));
- if (cmp == 0) {
- cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
- DVA_GET_OFFSET(&bp2->blk_dva[0]));
- if (unlikely(cmp == 0)) {
- cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2));
- }
- }
+ cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
+ DVA_GET_OFFSET(&bp2->blk_dva[0]));
+ if (unlikely(cmp == 0))
+ cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2));
return (cmp);
}
@@ -1379,25 +1318,22 @@ brt_pending_entry_compare(const void *x1, const void *x2)
void
brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
- brt_t *brt;
- avl_tree_t *pending_tree;
- kmutex_t *pending_lock;
brt_pending_entry_t *bpe, *newbpe;
avl_index_t where;
uint64_t txg;
- brt = spa->spa_brt;
txg = dmu_tx_get_txg(tx);
ASSERT3U(txg, !=, 0);
- pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
- pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
+
+ uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
+ brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE);
+ avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
newbpe->bpe_bp = *bp;
newbpe->bpe_count = 1;
- mutex_enter(pending_lock);
-
+ mutex_enter(&brtvd->bv_pending_lock);
bpe = avl_find(pending_tree, newbpe, &where);
if (bpe == NULL) {
avl_insert(pending_tree, newbpe, where);
@@ -1405,8 +1341,7 @@ brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
} else {
bpe->bpe_count++;
}
-
- mutex_exit(pending_lock);
+ mutex_exit(&brtvd->bv_pending_lock);
if (newbpe != NULL) {
ASSERT(bpe != NULL);
@@ -1416,82 +1351,86 @@ brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
ASSERT(bpe == NULL);
/* Prefetch BRT entry for the syncing context. */
- brt_prefetch(brt, bp);
+ brt_prefetch(brtvd, bp);
}
}
void
brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
- brt_t *brt;
- avl_tree_t *pending_tree;
- kmutex_t *pending_lock;
brt_pending_entry_t *bpe, bpe_search;
uint64_t txg;
- brt = spa->spa_brt;
txg = dmu_tx_get_txg(tx);
ASSERT3U(txg, !=, 0);
- pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
- pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
- bpe_search.bpe_bp = *bp;
+ uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
+ brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE);
+ avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
- mutex_enter(pending_lock);
+ bpe_search.bpe_bp = *bp;
+ mutex_enter(&brtvd->bv_pending_lock);
bpe = avl_find(pending_tree, &bpe_search, NULL);
/* I believe we should always find bpe when this function is called. */
if (bpe != NULL) {
ASSERT(bpe->bpe_count > 0);
-
bpe->bpe_count--;
- if (bpe->bpe_count == 0) {
+ if (bpe->bpe_count == 0)
avl_remove(pending_tree, bpe);
- kmem_cache_free(brt_pending_entry_cache, bpe);
- }
+ else
+ bpe = NULL;
}
+ mutex_exit(&brtvd->bv_pending_lock);
- mutex_exit(pending_lock);
+ if (bpe)
+ kmem_cache_free(brt_pending_entry_cache, bpe);
}
void
brt_pending_apply(spa_t *spa, uint64_t txg)
{
- brt_t *brt = spa->spa_brt;
brt_pending_entry_t *bpe;
avl_tree_t *pending_tree;
- void *c;
ASSERT3U(txg, !=, 0);
- /*
- * We are in syncing context, so no other brt_pending_tree accesses
- * are possible for the TXG. Don't need to acquire brt_pending_lock.
- */
- pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
+ brt_rlock(spa);
+ for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
+ brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
+ brt_unlock(spa);
- c = NULL;
- while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
- boolean_t added_to_ddt;
+ /*
+ * We are in syncing context, so no other bv_pending_tree
+ * accesses are possible for the TXG. So we don't need to
+ * acquire bv_pending_lock.
+ */
+ pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
- for (int i = 0; i < bpe->bpe_count; i++) {
+ void *c = NULL;
+ while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
/*
* If the block has DEDUP bit set, it means that it
* already exists in the DEDUP table, so we can just
- * use that instead of creating new entry in
- * the BRT table.
+ * use that instead of creating new entry in the BRT.
*/
if (BP_GET_DEDUP(&bpe->bpe_bp)) {
- added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
+ for (uint64_t c = bpe->bpe_count; c > 0; c--) {
+ if (ddt_addref(spa, &bpe->bpe_bp))
+ continue;
+ brt_entry_addref(spa, brtvd,
+ &bpe->bpe_bp, c);
+ break;
+ }
} else {
- added_to_ddt = B_FALSE;
+ brt_entry_addref(spa, brtvd, &bpe->bpe_bp,
+ bpe->bpe_count);
}
- if (!added_to_ddt)
- brt_entry_addref(brt, &bpe->bpe_bp);
+ kmem_cache_free(brt_pending_entry_cache, bpe);
}
-
- kmem_cache_free(brt_pending_entry_cache, bpe);
+ brt_rlock(spa);
}
+ brt_unlock(spa);
}
static void
@@ -1509,25 +1448,19 @@ brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
}
static void
-brt_sync_table(brt_t *brt, dmu_tx_t *tx)
+brt_sync_table(spa_t *spa, dmu_tx_t *tx)
{
- brt_vdev_t *brtvd;
brt_entry_t *bre;
- dnode_t *dn;
- uint64_t vdevid;
- void *c;
-
- brt_wlock(brt);
- for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
- brtvd = &brt->brt_vdevs[vdevid];
-
- if (!brtvd->bv_initiated)
- continue;
+ brt_rlock(spa);
+ for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
+ brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
+ brt_unlock(spa);
if (!brtvd->bv_meta_dirty) {
ASSERT(!brtvd->bv_entcount_dirty);
ASSERT0(avl_numnodes(&brtvd->bv_tree));
+ brt_rlock(spa);
continue;
}
@@ -1535,132 +1468,80 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx)
avl_numnodes(&brtvd->bv_tree) != 0);
if (brtvd->bv_mos_brtvdev == 0)
- brt_vdev_create(brt, brtvd, tx);
-
- VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries,
- FTAG, &dn));
+ brt_vdev_create(spa, brtvd, tx);
- c = NULL;
+ void *c = NULL;
while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
- brt_sync_entry(dn, bre, tx);
+ brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx);
brt_entry_free(bre);
- ASSERT(brt->brt_nentries > 0);
- brt->brt_nentries--;
}
- dnode_rele(dn, FTAG);
-
- brt_vdev_sync(brt, brtvd, tx);
-
+#ifdef ZFS_DEBUG
+ if (zfs_flags & ZFS_DEBUG_BRT)
+ brt_vdev_dump(brtvd);
+#endif
if (brtvd->bv_totalcount == 0)
- brt_vdev_destroy(brt, brtvd, tx);
+ brt_vdev_destroy(spa, brtvd, tx);
+ else
+ brt_vdev_sync(spa, brtvd, tx);
+ brt_rlock(spa);
}
-
- ASSERT0(brt->brt_nentries);
-
- brt_unlock(brt);
+ brt_unlock(spa);
}
void
brt_sync(spa_t *spa, uint64_t txg)
{
dmu_tx_t *tx;
- brt_t *brt;
+ uint64_t vdevid;
ASSERT(spa_syncing_txg(spa) == txg);
- brt = spa->spa_brt;
- brt_rlock(brt);
- if (brt->brt_nentries == 0) {
- /* No changes. */
- brt_unlock(brt);
+ brt_rlock(spa);
+ for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
+ if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty)
+ break;
+ }
+ if (vdevid >= spa->spa_brt_nvdevs) {
+ brt_unlock(spa);
return;
}
- brt_unlock(brt);
+ brt_unlock(spa);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
-
- brt_sync_table(brt, tx);
-
+ brt_sync_table(spa, tx);
dmu_tx_commit(tx);
}
static void
-brt_table_alloc(brt_t *brt)
-{
-
- for (int i = 0; i < TXG_SIZE; i++) {
- avl_create(&brt->brt_pending_tree[i],
- brt_pending_entry_compare,
- sizeof (brt_pending_entry_t),
- offsetof(brt_pending_entry_t, bpe_node));
- mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
- NULL);
- }
-}
-
-static void
-brt_table_free(brt_t *brt)
-{
-
- for (int i = 0; i < TXG_SIZE; i++) {
- ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));
-
- avl_destroy(&brt->brt_pending_tree[i]);
- mutex_destroy(&brt->brt_pending_lock[i]);
- }
-}
-
-static void
brt_alloc(spa_t *spa)
{
- brt_t *brt;
-
- ASSERT(spa->spa_brt == NULL);
-
- brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
- rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
- brt->brt_spa = spa;
- brt->brt_rangesize = 0;
- brt->brt_nentries = 0;
- brt->brt_vdevs = NULL;
- brt->brt_nvdevs = 0;
- brt_table_alloc(brt);
-
- spa->spa_brt = brt;
+ rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL);
+ spa->spa_brt_vdevs = NULL;
+ spa->spa_brt_nvdevs = 0;
+ spa->spa_brt_rangesize = 0;
}
void
brt_create(spa_t *spa)
{
-
brt_alloc(spa);
- brt_vdevs_alloc(spa->spa_brt, B_FALSE);
+ brt_vdevs_alloc(spa, B_FALSE);
}
int
brt_load(spa_t *spa)
{
-
brt_alloc(spa);
- brt_vdevs_alloc(spa->spa_brt, B_TRUE);
-
+ brt_vdevs_alloc(spa, B_TRUE);
return (0);
}
void
brt_unload(spa_t *spa)
{
- brt_t *brt = spa->spa_brt;
-
- if (brt == NULL)
- return;
-
- brt_vdevs_free(brt);
- brt_table_free(brt);
- rw_destroy(&brt->brt_lock);
- kmem_free(brt, sizeof (*brt));
- spa->spa_brt = NULL;
+ brt_vdevs_free(spa);
+ rw_destroy(&spa->spa_brt_lock);
}
/* BEGIN CSTYLED */
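One pattern worth noting in the brt.c hunk above: brt_vdev() now starts with spa_brt_lock as reader and, when a new vdev must be added, drops it and retakes it as writer, re-checking spa_brt_nvdevs afterward, since another thread may have expanded the array in the unlocked window. Below is a minimal, runnable userland sketch of that upgrade-and-recheck idiom, using POSIX rwlocks rather than the kernel's krwlock_t; all names here are hypothetical and error handling is omitted.

	#include <pthread.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
	static int **table;		/* stands in for spa_brt_vdevs */
	static uint64_t table_len;	/* stands in for spa_brt_nvdevs */

	static int *
	table_get(uint64_t id)
	{
		int *slot;

		pthread_rwlock_rdlock(&table_lock);
		if (id >= table_len) {
			/* rwlocks cannot be upgraded in place: drop the read lock... */
			pthread_rwlock_unlock(&table_lock);
			pthread_rwlock_wrlock(&table_lock);
			/* ...and re-check: another writer may have expanded already. */
			if (id >= table_len) {
				table = realloc(table, (id + 1) * sizeof (*table));
				for (uint64_t i = table_len; i <= id; i++)
					table[i] = calloc(1, sizeof (int));
				table_len = id + 1;
			}
		}
		slot = table[id];	/* valid under either read or write lock */
		pthread_rwlock_unlock(&table_lock);
		return (slot);
	}

	int
	main(void)
	{
		*table_get(3) = 42;
		printf("%d\n", *table_get(3));
		return (0);
	}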
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index f486513fc..32542e7ce 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1870,13 +1870,7 @@ spa_get_slop_space(spa_t *spa)
if (spa->spa_dedup_dspace == ~0ULL)
spa_update_dspace(spa);
- /*
- * spa_get_dspace() includes the space only logically "used" by
- * deduplicated data, so since it's not useful to reserve more
- * space with more deduplicated data, we subtract that out here.
- */
- space =
- spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa);
+ space = spa->spa_rdspace;
slop = MIN(space >> spa_slop_shift, spa_max_slop);
/*
@@ -1912,8 +1906,7 @@ spa_get_checkpoint_space(spa_t *spa)
void
spa_update_dspace(spa_t *spa)
{
- spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
- ddt_get_dedup_dspace(spa) + brt_get_dspace(spa);
+ spa->spa_rdspace = metaslab_class_get_dspace(spa_normal_class(spa));
if (spa->spa_nonallocating_dspace > 0) {
/*
* Subtract the space provided by all non-allocating vdevs that
@@ -1933,9 +1926,11 @@ spa_update_dspace(spa_t *spa)
* doesn't matter that the data we are moving may be
* allocated twice (on the old device and the new device).
*/
- ASSERT3U(spa->spa_dspace, >=, spa->spa_nonallocating_dspace);
- spa->spa_dspace -= spa->spa_nonallocating_dspace;
+ ASSERT3U(spa->spa_rdspace, >=, spa->spa_nonallocating_dspace);
+ spa->spa_rdspace -= spa->spa_nonallocating_dspace;
}
+ spa->spa_dspace = spa->spa_rdspace + ddt_get_dedup_dspace(spa) +
+ brt_get_dspace(spa);
}
/*
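Taken together, the spa_misc.c hunks change the space accounting roughly as follows (a paraphrase of the new code; spa_rdspace is a field defined in a header outside this module-only diff):

	spa_rdspace = metaslab_class_get_dspace(spa_normal_class(spa))
	              - spa_nonallocating_dspace
	spa_dspace  = spa_rdspace + ddt_get_dedup_dspace(spa)
	              + brt_get_dspace(spa)
	slop        = MIN(spa_rdspace >> spa_slop_shift, spa_max_slop)

so the slop reservation is now derived from raw allocatable space, instead of subtracting the logical space "saved" by dedup and block cloning back out of spa_dspace as the removed comment described.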