Diffstat (limited to 'module/zfs/zio.c')
-rw-r--r-- | module/zfs/zio.c | 88
1 file changed, 56 insertions, 32 deletions
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 9a98d4fc0..565a78c89 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -20,7 +20,7 @@
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  */
@@ -45,6 +45,7 @@
 #include <sys/trace_zio.h>
 #include <sys/abd.h>
 #include <sys/dsl_crypt.h>
+#include <sys/cityhash.h>
 
 /*
  * ==========================================================================
@@ -2611,7 +2612,8 @@ zio_write_gang_block(zio_t *pio)
 		ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
 
 		flags |= METASLAB_ASYNC_ALLOC;
-		VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
+		VERIFY(refcount_held(&mc->mc_alloc_slots[pio->io_allocator],
+		    pio));
 
 		/*
 		 * The logical zio has already placed a reservation for
@@ -2622,12 +2624,12 @@ zio_write_gang_block(zio_t *pio)
 		 * additional reservations for gang blocks.
 		 */
 		VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
-		    pio, flags));
+		    pio->io_allocator, pio, flags));
 	}
 
 	error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies,
 	    txg, pio == gio ? NULL : gio->io_bp, flags,
-	    &pio->io_alloc_list, pio);
+	    &pio->io_alloc_list, pio, pio->io_allocator);
 	if (error) {
 		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
@@ -2641,7 +2643,7 @@ zio_write_gang_block(zio_t *pio)
 			 * stage.
 			 */
 			metaslab_class_throttle_unreserve(mc,
-			    gbh_copies - copies, pio);
+			    gbh_copies - copies, pio->io_allocator, pio);
 		}
 
 		pio->io_error = error;
@@ -2705,7 +2707,7 @@ zio_write_gang_block(zio_t *pio)
 			 * slot for them here.
 			 */
 			VERIFY(metaslab_class_throttle_reserve(mc,
-			    zp.zp_copies, cio, flags));
+			    zp.zp_copies, cio->io_allocator, cio, flags));
 		}
 		zio_nowait(cio);
 	}
@@ -3223,13 +3225,13 @@ zio_ddt_free(zio_t *zio)
  */
 
 static zio_t *
-zio_io_to_allocate(spa_t *spa)
+zio_io_to_allocate(spa_t *spa, int allocator)
 {
 	zio_t *zio;
 
-	ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
+	ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
 
-	zio = avl_first(&spa->spa_alloc_tree);
+	zio = avl_first(&spa->spa_alloc_trees[allocator]);
 	if (zio == NULL)
 		return (NULL);
 
@@ -3239,12 +3241,13 @@ zio_io_to_allocate(spa_t *spa)
 	 * Try to place a reservation for this zio. If we're unable to
 	 * reserve then we throttle.
 	 */
+	ASSERT3U(zio->io_allocator, ==, allocator);
 	if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
-	    zio->io_prop.zp_copies, zio, 0)) {
+	    zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
 		return (NULL);
 	}
 
-	avl_remove(&spa->spa_alloc_tree, zio);
+	avl_remove(&spa->spa_alloc_trees[allocator], zio);
 	ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
 
 	return (zio);
@@ -3268,13 +3271,23 @@ zio_dva_throttle(zio_t *zio)
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
 
-	mutex_enter(&spa->spa_alloc_lock);
+	zbookmark_phys_t *bm = &zio->io_bookmark;
+	/*
+	 * We want to try to use as many allocators as possible to help improve
+	 * performance, but we also want logically adjacent IOs to be physically
+	 * adjacent to improve sequential read performance. We chunk each object
+	 * into 2^20 block regions, and then hash based on the objset, object,
+	 * level, and region to accomplish both of these goals.
+	 */
+	zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
+	    bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
+	mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
-	avl_add(&spa->spa_alloc_tree, zio);
+	avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
 
-	nio = zio_io_to_allocate(zio->io_spa);
-	mutex_exit(&spa->spa_alloc_lock);
+	nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator);
+	mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
 
 	if (nio == zio)
 		return (ZIO_PIPELINE_CONTINUE);
@@ -3295,13 +3308,13 @@
 }
 
 void
-zio_allocate_dispatch(spa_t *spa)
+zio_allocate_dispatch(spa_t *spa, int allocator)
 {
 	zio_t *zio;
 
-	mutex_enter(&spa->spa_alloc_lock);
-	zio = zio_io_to_allocate(spa);
-	mutex_exit(&spa->spa_alloc_lock);
+	mutex_enter(&spa->spa_alloc_locks[allocator]);
+	zio = zio_io_to_allocate(spa, allocator);
+	mutex_exit(&spa->spa_alloc_locks[allocator]);
 	if (zio == NULL)
 		return;
 
@@ -3340,7 +3353,7 @@ zio_dva_allocate(zio_t *zio)
 
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
 	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
-	    &zio->io_alloc_list, zio);
+	    &zio->io_alloc_list, zio, zio->io_allocator);
 
 	if (error != 0) {
 		zfs_dbgmsg("%s: metaslab allocation failure: zio %p, "
@@ -3409,14 +3422,23 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 	ASSERT(txg > spa_syncing_txg(spa));
 
 	metaslab_trace_init(&io_alloc_list);
+	/*
+	 * When allocating a zil block, we don't have information about
+	 * the final destination of the block except the objset it's part
+	 * of, so we just hash the objset ID to pick the allocator to get
+	 * some parallelism.
+	 */
 	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
-	    txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL);
+	    txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL,
+	    cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
+	    spa->spa_alloc_count);
 	if (error == 0) {
 		*slog = TRUE;
 	} else {
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
 		    new_bp, 1, txg, NULL, METASLAB_FASTWRITE,
-		    &io_alloc_list, NULL);
+		    &io_alloc_list, NULL, cityhash4(0, 0, 0,
+		    os->os_dsl_dataset->ds_object) % spa->spa_alloc_count);
 		if (error == 0)
 			*slog = FALSE;
 	}
@@ -4119,8 +4141,8 @@ zio_ready(zio_t *zio)
 			 */
 			metaslab_class_throttle_unreserve(
 			    spa_normal_class(zio->io_spa),
-			    zio->io_prop.zp_copies, zio);
-			zio_allocate_dispatch(zio->io_spa);
+			    zio->io_prop.zp_copies, zio->io_allocator, zio);
+			zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
 		}
 	}
 
@@ -4204,18 +4226,19 @@ zio_dva_throttle_done(zio_t *zio)
 	ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
 
 	mutex_enter(&pio->io_lock);
-	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
+	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
+	    pio->io_allocator, B_TRUE);
 	mutex_exit(&pio->io_lock);
 
 	metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
-	    1, pio);
+	    1, pio->io_allocator, pio);
 
 	/*
 	 * Call into the pipeline to see if there is more work that
 	 * needs to be done. If there is work to be done it will be
 	 * dispatched to another taskq thread.
 	 */
-	zio_allocate_dispatch(zio->io_spa);
+	zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
 }
 
 static int
@@ -4227,6 +4250,7 @@ zio_done(zio_t *zio)
 	 */
 	const uint64_t psize = zio->io_size;
 	zio_t *pio, *pio_next;
+	ASSERTV(metaslab_class_t *mc = spa_normal_class(zio->io_spa));
 	zio_link_t *zl = NULL;
 
 	/*
@@ -4245,8 +4269,7 @@
 	 */
 	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
 	    zio->io_child_type == ZIO_CHILD_VDEV) {
-		ASSERT(spa_normal_class(
-		    zio->io_spa)->mc_alloc_throttle_enabled);
+		ASSERT(mc->mc_alloc_throttle_enabled);
 		zio_dva_throttle_done(zio);
 	}
 
@@ -4258,9 +4281,10 @@
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(zio->io_bp != NULL);
-		metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio);
-		VERIFY(refcount_not_held(
-		    &(spa_normal_class(zio->io_spa)->mc_alloc_slots), zio));
+		metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
+		    zio->io_allocator);
+		VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator],
+		    zio));
 	}
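
The allocator-selection scheme used by the hunks above can be illustrated with a small standalone sketch. This is not ZFS code: a generic 64-bit mixing function stands in for the in-kernel cityhash4(), and the allocator count and object numbers are made-up examples, so the indices it prints will not match what ZFS computes. It only shows the shape of the calculation: ordinary writes hash (objset, object, level, blkid >> 20), ZIL blocks hash only the objset ID, and both reduce the result modulo the number of allocators (spa_alloc_count).

/*
 * Standalone sketch of the per-allocator selection scheme.  mix_hash4()
 * is a stand-in for the kernel's cityhash4(); names and numbers below
 * are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

/* Stand-in for cityhash4(): any reasonable 64-bit mix of four words. */
static uint64_t
mix_hash4(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
{
	uint64_t v[4] = { a, b, c, d };
	uint64_t h = 0x9e3779b97f4a7c15ULL;

	for (int i = 0; i < 4; i++) {
		h ^= v[i] + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
		h *= 0xff51afd7ed558ccdULL;
		h ^= h >> 33;
	}
	return (h);
}

/*
 * Ordinary write: hash the objset, object, level, and the 2^20-block
 * region the block falls in, then reduce modulo the allocator count.
 */
static int
pick_allocator(uint64_t objset, uint64_t object, uint64_t level,
    uint64_t blkid, unsigned int alloc_count)
{
	return ((int)(mix_hash4(objset, object, level, blkid >> 20) %
	    alloc_count));
}

/* ZIL block: only the objset's dataset object number is known, so hash it. */
static int
pick_zil_allocator(uint64_t ds_object, unsigned int alloc_count)
{
	return ((int)(mix_hash4(0, 0, 0, ds_object) % alloc_count));
}

int
main(void)
{
	unsigned int alloc_count = 4;	/* example value of spa_alloc_count */

	/* Blocks 0 and 1000 share one 2^20-block region: same allocator. */
	printf("blk 0:       allocator %d\n",
	    pick_allocator(54, 7, 0, 0, alloc_count));
	printf("blk 1000:    allocator %d\n",
	    pick_allocator(54, 7, 0, 1000, alloc_count));
	/* Block 3000000 is in a different region and may map elsewhere. */
	printf("blk 3000000: allocator %d\n",
	    pick_allocator(54, 7, 0, 3000000, alloc_count));
	printf("zil:         allocator %d\n",
	    pick_zil_allocator(54, alloc_count));
	return (0);
}

Shifting zb_blkid right by 20 bits means every run of 2^20 consecutive blocks in an object maps to the same allocator, which keeps logically adjacent I/Os physically adjacent for sequential reads, while different objects and distant regions of the same object still spread across all allocators.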