diff options
author | Alex Reece <[email protected]> | 2015-04-03 14:14:28 +1100 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2015-04-28 16:24:03 -0700 |
commit | 8951cb8dfb8dcf410a237656c1f9c9767e4a9e6c (patch) | |
tree | e057d73086a975a0e22eae721ddb332ec5c81e6b /module/zfs | |
parent | 58c4aa00c65e09f254de0b939b2c1aa720c204a1 (diff) |
Illumos 4873 - zvol unmap calls can take a very long time for larger datasets
4873 zvol unmap calls can take a very long time for larger datasets
Author: Alex Reece <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: Paul Dagnelie <[email protected]>
Reviewed by: Basil Crow <[email protected]>
Reviewed by: Dan McDonald <[email protected]>
Approved by: Robert Mustacchi <[email protected]>
References:
https://www.illumos.org/issues/4873
https://github.com/illumos/illumos-gate/commit/0f6d88a
Porting Notes:
dbuf_free_range():
- reduce stack usage using kmem_alloc()
- the sorted AVL tree will handle the spill block case correctly
without all the special handling in the for() loop
Ported-by: Chris Dunlop <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/dbuf.c | 68 | ||||
-rw-r--r-- | module/zfs/dnode.c | 59 | ||||
-rw-r--r-- | module/zfs/dnode_sync.c | 12 |
3 files changed, 93 insertions, 46 deletions
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index f5327a34a..e9c8580fc 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -93,7 +93,9 @@ dbuf_cons(void *vdb, void *unused, int kmflag) mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); refcount_create(&db->db_holds); - list_link_init(&db->db_link); + + db->db_creation = gethrtime(); + return (0); } @@ -386,7 +388,7 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || - !list_is_empty(&dn->dn_dbufs)); + !avl_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); @@ -866,23 +868,34 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) * receive; see comment below for details. */ void -dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) +dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, + dmu_tx_t *tx) { - dmu_buf_impl_t *db, *db_next; + dmu_buf_impl_t *db, *db_next, *db_search; uint64_t txg = tx->tx_txg; + avl_index_t where; boolean_t freespill = - (start == DMU_SPILL_BLKID || end == DMU_SPILL_BLKID); + (start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID); + + if (end_blkid > dn->dn_maxblkid && !freespill) + end_blkid = dn->dn_maxblkid; + dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); - if (end > dn->dn_maxblkid && !freespill) - end = dn->dn_maxblkid; - dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); + db_seach = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP); + db_search->db_level = 0; + db_search->db_blkid = start_blkid; + db_search->db_creation = 0; mutex_enter(&dn->dn_dbufs_mtx); - if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz && - !freespill) { + if (start_blkid >= dn->dn_unlisted_l0_blkid && !freespill) { /* There can't be any dbufs in this range; no need to search. */ - mutex_exit(&dn->dn_dbufs_mtx); - return; +#ifdef DEBUG + db = avl_find(&dn->dn_dbufs, db_search, &where); + ASSERT3P(db, ==, NULL); + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + ASSERT(db == NULL || db->db_level > 0); +#endif + goto out; } else if (dmu_objset_is_receiving(dn->dn_objset)) { /* * If we are receiving, we expect there to be no dbufs in @@ -894,19 +907,18 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) atomic_inc_64(&zfs_free_range_recv_miss); } - for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) { - db_next = list_next(&dn->dn_dbufs, db); + db = avl_find(&dn->dn_dbufs, db_search, &where); + ASSERT3P(db, ==, NULL); + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + + for (; db != NULL; db = db_next) { + db_next = AVL_NEXT(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); - /* Skip indirect blocks. */ - if (db->db_level != 0) - continue; - /* Skip direct blocks outside the range. */ - if (!freespill && (db->db_blkid < start || db->db_blkid > end)) - continue; - /* Skip all direct blocks, only free spill blocks. */ - if (freespill && (db->db_blkid != DMU_SPILL_BLKID)) - continue; + if (db->db_level != 0 || db->db_blkid > end_blkid) { + break; + } + ASSERT3U(db->db_blkid, >=, start_blkid); /* found a level 0 buffer in the range */ mutex_enter(&db->db_mtx); @@ -968,6 +980,9 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) mutex_exit(&db->db_mtx); } + +out: + kmem_free(db_search, sizeof (dmu_buf_impl_t)); mutex_exit(&dn->dn_dbufs_mtx); } @@ -1657,7 +1672,7 @@ dbuf_clear(dmu_buf_impl_t *db) dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { - list_remove(&dn->dn_dbufs, db); + avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); @@ -1829,7 +1844,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, mutex_exit(&dn->dn_dbufs_mtx); return (odb); } - list_insert_head(&dn->dn_dbufs, db); + avl_add(&dn->dn_dbufs, db); if (db->db_level == 0 && db->db_blkid >= dn->dn_unlisted_l0_blkid) dn->dn_unlisted_l0_blkid = db->db_blkid + 1; @@ -1888,7 +1903,7 @@ dbuf_destroy(dmu_buf_impl_t *db) DB_DNODE_ENTER(db); dn = DB_DNODE(db); mutex_enter(&dn->dn_dbufs_mtx); - list_remove(&dn->dn_dbufs, db); + avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); mutex_exit(&dn->dn_dbufs_mtx); DB_DNODE_EXIT(db); @@ -1906,7 +1921,6 @@ dbuf_destroy(dmu_buf_impl_t *db) db->db_parent = NULL; db->db_buf = NULL; - ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 815696f70..41722f25c 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -62,6 +62,43 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT; static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); #endif /* _KERNEL */ +static int +dbuf_compare(const void *x1, const void *x2) +{ + const dmu_buf_impl_t *d1 = x1; + const dmu_buf_impl_t *d2 = x2; + + if (d1->db_level < d2->db_level) { + return (-1); + } else if (d1->db_level > d2->db_level) { + return (1); + } + + if (d1->db_blkid < d2->db_blkid) { + return (-1); + } else if (d1->db_blkid > d2->db_blkid) { + return (1); + } + + /* + * If a dbuf is being evicted while dn_dbufs_mutex is not held, we set + * the db_state to DB_EVICTING but do not remove it from dn_dbufs. If + * another thread creates a dbuf of the same blkid before the dbuf is + * removed from dn_dbufs, we can reach a state where there are two + * dbufs of the same blkid and level in db_dbufs. To maintain the avl + * invariant that there cannot be duplicate items, we distinguish + * between these two dbufs based on the time they were created. + */ + if (d1->db_creation < d2->db_creation) { + return (-1); + } else if (d1->db_creation > d2->db_creation) { + return (1); + } else { + ASSERT3P(d1, ==, d2); + return (0); + } +} + /* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) @@ -116,7 +153,7 @@ dnode_cons(void *arg, void *unused, int kmflag) dn->dn_dbufs_count = 0; dn->dn_unlisted_l0_blkid = 0; - list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), + avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); dn->dn_moved = 0; @@ -169,7 +206,7 @@ dnode_dest(void *arg, void *unused) ASSERT0(dn->dn_dbufs_count); ASSERT0(dn->dn_unlisted_l0_blkid); - list_destroy(&dn->dn_dbufs); + avl_destroy(&dn->dn_dbufs); } void @@ -503,7 +540,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT0(dn->dn_assigned_txg); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + ASSERT(avl_is_empty(&dn->dn_dbufs)); for (i = 0; i < TXG_SIZE; i++) { ASSERT0(dn->dn_next_nblkptr[i]); @@ -689,8 +726,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; ASSERT(refcount_count(&odn->dn_tx_holds) == 0); refcount_transfer(&ndn->dn_holds, &odn->dn_holds); - ASSERT(list_is_empty(&ndn->dn_dbufs)); - list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs); + ASSERT(avl_is_empty(&ndn->dn_dbufs)); + avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); ndn->dn_dbufs_count = odn->dn_dbufs_count; ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid; ndn->dn_bonus = odn->dn_bonus; @@ -724,7 +761,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) */ odn->dn_dbuf = NULL; odn->dn_handle = NULL; - list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t), + avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); odn->dn_dbufs_count = 0; odn->dn_unlisted_l0_blkid = 0; @@ -1238,7 +1275,8 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) return; } - ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); + ASSERT(!refcount_is_zero(&dn->dn_holds) || + !avl_is_empty(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]); ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]); @@ -1311,7 +1349,7 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx) int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { - dmu_buf_impl_t *db, *db_next; + dmu_buf_impl_t *db; int err; if (size == 0) @@ -1334,9 +1372,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) goto fail; mutex_enter(&dn->dn_dbufs_mtx); - for (db = list_head(&dn->dn_dbufs); db; db = db_next) { - db_next = list_next(&dn->dn_dbufs, db); - + for (db = avl_first(&dn->dn_dbufs); db != NULL; + db = AVL_NEXT(&dn->dn_dbufs, db)) { if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 9779c51ef..4e613dd76 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -406,16 +406,13 @@ dnode_evict_dbufs(dnode_t *dn) int pass = 0; do { - dmu_buf_impl_t *db, marker; + dmu_buf_impl_t *db, *db_next; int evicting = FALSE; progress = FALSE; mutex_enter(&dn->dn_dbufs_mtx); - list_insert_tail(&dn->dn_dbufs, &marker); - db = list_head(&dn->dn_dbufs); - for (; db != ▮ db = list_head(&dn->dn_dbufs)) { - list_remove(&dn->dn_dbufs, db); - list_insert_tail(&dn->dn_dbufs, db); + for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { + db_next = AVL_NEXT(&dn->dn_dbufs, db); #ifdef DEBUG DB_DNODE_ENTER(db); ASSERT3P(DB_DNODE(db), ==, dn); @@ -435,7 +432,6 @@ dnode_evict_dbufs(dnode_t *dn) } } - list_remove(&dn->dn_dbufs, &marker); /* * NB: we need to drop dn_dbufs_mtx between passes so * that any DB_EVICTING dbufs can make progress. @@ -516,7 +512,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + ASSERT(avl_is_empty(&dn->dn_dbufs)); ASSERT3P(dn->dn_bonus, ==, NULL); /* |