Diffstat (limited to 'module/zfs/zio.c')
-rw-r--r--	module/zfs/zio.c	88
1 file changed, 56 insertions(+), 32 deletions(-)
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 9a98d4fc0..565a78c89 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
*/
@@ -45,6 +45,7 @@
#include <sys/trace_zio.h>
#include <sys/abd.h>
#include <sys/dsl_crypt.h>
+#include <sys/cityhash.h>
/*
* ==========================================================================
@@ -2611,7 +2612,8 @@ zio_write_gang_block(zio_t *pio)
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
flags |= METASLAB_ASYNC_ALLOC;
- VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
+ VERIFY(refcount_held(&mc->mc_alloc_slots[pio->io_allocator],
+ pio));
/*
* The logical zio has already placed a reservation for
@@ -2622,12 +2624,12 @@ zio_write_gang_block(zio_t *pio)
* additional reservations for gang blocks.
*/
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
- pio, flags));
+ pio->io_allocator, pio, flags));
}
error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
- &pio->io_alloc_list, pio);
+ &pio->io_alloc_list, pio, pio->io_allocator);
if (error) {
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
@@ -2641,7 +2643,7 @@ zio_write_gang_block(zio_t *pio)
* stage.
*/
metaslab_class_throttle_unreserve(mc,
- gbh_copies - copies, pio);
+ gbh_copies - copies, pio->io_allocator, pio);
}
pio->io_error = error;
@@ -2705,7 +2707,7 @@ zio_write_gang_block(zio_t *pio)
* slot for them here.
*/
VERIFY(metaslab_class_throttle_reserve(mc,
- zp.zp_copies, cio, flags));
+ zp.zp_copies, cio->io_allocator, cio, flags));
}
zio_nowait(cio);
}
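
The gang-block hunks above thread an allocator index through the existing
throttle calls. The declarations are not part of this diff, but the call
sites imply the following per-allocator interface; this is an inferred
sketch, not the actual metaslab.h contents:

/*
 * Inferred from the call sites above: each metaslab class now accounts
 * for throttle reservations per allocator rather than pool-wide, so
 * every reserve/unreserve names the allocator it charges.
 */
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *mc,
    int slots, int allocator, zio_t *zio, int flags);
void metaslab_class_throttle_unreserve(metaslab_class_t *mc,
    int slots, int allocator, zio_t *zio);

Note that every reserve in zio_write_gang_block() is paired with an
unreserve on the same allocator (pio->io_allocator or cio->io_allocator),
so a reservation cannot leak across allocators on the error path.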
@@ -3223,13 +3225,13 @@ zio_ddt_free(zio_t *zio)
*/
static zio_t *
-zio_io_to_allocate(spa_t *spa)
+zio_io_to_allocate(spa_t *spa, int allocator)
{
zio_t *zio;
- ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
+ ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
- zio = avl_first(&spa->spa_alloc_tree);
+ zio = avl_first(&spa->spa_alloc_trees[allocator]);
if (zio == NULL)
return (NULL);
@@ -3239,12 +3241,13 @@ zio_io_to_allocate(spa_t *spa)
* Try to place a reservation for this zio. If we're unable to
* reserve then we throttle.
*/
+ ASSERT3U(zio->io_allocator, ==, allocator);
if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
- zio->io_prop.zp_copies, zio, 0)) {
+ zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
return (NULL);
}
- avl_remove(&spa->spa_alloc_tree, zio);
+ avl_remove(&spa->spa_alloc_trees[allocator], zio);
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
return (zio);
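
zio_io_to_allocate() now pops work from one of several queues. The spa_t
fields it touches are defined elsewhere (spa_impl.h is not part of this
diff); a hedged sketch of the per-allocator state the function assumes,
with field names taken from the usage above:

/*
 * Assumed shape of the per-allocator state, replacing the old single
 * spa_alloc_lock/spa_alloc_tree pair (one array slot per allocator):
 */
kmutex_t	*spa_alloc_locks;	/* array of spa_alloc_count locks */
avl_tree_t	*spa_alloc_trees;	/* array of spa_alloc_count queues */
int		spa_alloc_count;	/* number of allocators */

Because each allocator has its own lock and AVL tree, contention on the
old global spa_alloc_lock is split spa_alloc_count ways.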
@@ -3268,13 +3271,23 @@ zio_dva_throttle(zio_t *zio)
ASSERT3U(zio->io_queued_timestamp, >, 0);
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
- mutex_enter(&spa->spa_alloc_lock);
+ zbookmark_phys_t *bm = &zio->io_bookmark;
+ /*
+ * We want to try to use as many allocators as possible to help improve
+ * performance, but we also want logically adjacent IOs to be physically
+ * adjacent to improve sequential read performance. We chunk each object
+ * into 2^20 block regions, and then hash based on the objset, object,
+ * level, and region to accomplish both of these goals.
+ */
+ zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
+ bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
+ mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
- avl_add(&spa->spa_alloc_tree, zio);
+ avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
- nio = zio_io_to_allocate(zio->io_spa);
- mutex_exit(&spa->spa_alloc_lock);
+ nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator);
+ mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
if (nio == zio)
return (ZIO_PIPELINE_CONTINUE);
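
To make the bucketing in the new comment concrete, here is a small
illustrative fragment (the objset/object/level values and allocator count
are made up; cityhash4() is the four-word hash that the new
<sys/cityhash.h> include provides):

/*
 * Blocks in the same 2^20-block region of the same object hash to the
 * same allocator, preserving physical adjacency for sequential reads;
 * a block in the next region may land on a different allocator.
 */
uint64_t objset = 54, object = 7, level = 0;
uint64_t count = 4;				/* spa->spa_alloc_count */
uint64_t a1 = cityhash4(objset, object, level, 100 >> 20) % count;
uint64_t a2 = cityhash4(objset, object, level, 200 >> 20) % count;
/* 100 >> 20 == 0 == 200 >> 20, so a1 == a2 by construction. */
uint64_t a3 = cityhash4(objset, object, level,
    (1ULL << 20) >> 20) % count;		/* region 1: may differ */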
@@ -3295,13 +3308,13 @@ zio_dva_throttle(zio_t *zio)
}
void
-zio_allocate_dispatch(spa_t *spa)
+zio_allocate_dispatch(spa_t *spa, int allocator)
{
zio_t *zio;
- mutex_enter(&spa->spa_alloc_lock);
- zio = zio_io_to_allocate(spa);
- mutex_exit(&spa->spa_alloc_lock);
+ mutex_enter(&spa->spa_alloc_locks[allocator]);
+ zio = zio_io_to_allocate(spa, allocator);
+ mutex_exit(&spa->spa_alloc_locks[allocator]);
if (zio == NULL)
return;
@@ -3340,7 +3353,7 @@ zio_dva_allocate(zio_t *zio)
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
- &zio->io_alloc_list, zio);
+ &zio->io_alloc_list, zio, zio->io_allocator);
if (error != 0) {
zfs_dbgmsg("%s: metaslab allocation failure: zio %p, "
@@ -3409,14 +3422,23 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
ASSERT(txg > spa_syncing_txg(spa));
metaslab_trace_init(&io_alloc_list);
+ /*
+ * When allocating a zil block, we don't have information about
+ * the final destination of the block except the objset it's part
+ * of, so we just hash the objset ID to pick the allocator to get
+ * some parallelism.
+ */
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
- txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL);
+ txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL,
+ cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
+ spa->spa_alloc_count);
if (error == 0) {
*slog = TRUE;
} else {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, NULL, METASLAB_FASTWRITE,
- &io_alloc_list, NULL);
+ &io_alloc_list, NULL, cityhash4(0, 0, 0,
+ os->os_dsl_dataset->ds_object) % spa->spa_alloc_count);
if (error == 0)
*slog = FALSE;
}
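
Both metaslab_alloc() calls in zio_alloc_zil() evaluate the same hash
expression. A purely illustrative cleanup (not part of this diff) would
hoist it into a local so the slog path and the normal-class fallback
cannot drift apart:

/* Hypothetical refactor: compute the ZIL allocator once. */
int zil_allocator = (int)(cityhash4(0, 0, 0,
    os->os_dsl_dataset->ds_object) % spa->spa_alloc_count);

and pass zil_allocator to both calls.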
@@ -4119,8 +4141,8 @@ zio_ready(zio_t *zio)
*/
metaslab_class_throttle_unreserve(
spa_normal_class(zio->io_spa),
- zio->io_prop.zp_copies, zio);
- zio_allocate_dispatch(zio->io_spa);
+ zio->io_prop.zp_copies, zio->io_allocator, zio);
+ zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
}
}
@@ -4204,18 +4226,19 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
mutex_enter(&pio->io_lock);
- metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
+ metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
+ pio->io_allocator, B_TRUE);
mutex_exit(&pio->io_lock);
metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
- 1, pio);
+ 1, pio->io_allocator, pio);
/*
* Call into the pipeline to see if there is more work that
* needs to be done. If there is work to be done it will be
* dispatched to another taskq thread.
*/
- zio_allocate_dispatch(zio->io_spa);
+ zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
}
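
As with the reserve side, metaslab_group_alloc_decrement() now takes the
allocator index. Its declaration is not shown in this diff; inferred from
this call site, with the trailing boolean presumably signalling that the
I/O has completed:

/* Inferred signature; the final argument is assumed to mean "I/O done". */
void metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag,
    int flags, int allocator, boolean_t io_complete);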
static int
@@ -4227,6 +4250,7 @@ zio_done(zio_t *zio)
*/
const uint64_t psize = zio->io_size;
zio_t *pio, *pio_next;
+ ASSERTV(metaslab_class_t *mc = spa_normal_class(zio->io_spa));
zio_link_t *zl = NULL;
/*
@@ -4245,8 +4269,7 @@ zio_done(zio_t *zio)
*/
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
zio->io_child_type == ZIO_CHILD_VDEV) {
- ASSERT(spa_normal_class(
- zio->io_spa)->mc_alloc_throttle_enabled);
+ ASSERT(mc->mc_alloc_throttle_enabled);
zio_dva_throttle_done(zio);
}
@@ -4258,9 +4281,10 @@ zio_done(zio_t *zio)
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(zio->io_bp != NULL);
- metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio);
- VERIFY(refcount_not_held(
- &(spa_normal_class(zio->io_spa)->mc_alloc_slots), zio));
+ metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
+ zio->io_allocator);
+ VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator],
+ zio));
}
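
A note on the ASSERTV() in the zio_done() hunk: mc is referenced only
inside ASSERT()/VERIFY() checks, so ASSERTV() compiles the declaration out
of non-debug builds. This works because refcount_not_held() is expected to
collapse to a constant macro when debugging is disabled, so mc is never
evaluated there. A paraphrased sketch of the pattern (not the verbatim
headers):

#ifdef ZFS_DEBUG
#define	ASSERTV(x)	x			/* declaration kept */
#else
#define	ASSERTV(x)				/* declaration dropped */
#define	refcount_not_held(rc, holder)	(B_TRUE)	/* args unused */
#endif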