-rw-r--r--   cmd/ztest/ztest.c             2
-rw-r--r--   include/sys/metaslab.h        3
-rw-r--r--   include/sys/metaslab_impl.h   2
-rw-r--r--   include/sys/spa.h             8
-rw-r--r--   include/sys/spa_impl.h        2
-rw-r--r--   module/zfs/metaslab.c       105
-rw-r--r--   module/zfs/spa_misc.c         7
-rw-r--r--   module/zfs/zio.c             22
8 files changed, 123 insertions, 28 deletions
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 6acba5290..235bf56ef 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
/*
@@ -5300,6 +5301,7 @@ ztest_run(ztest_shared_t *zs)
*/
kernel_init(FREAD | FWRITE);
VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0);
+ spa->spa_debug = B_TRUE;
zs->zs_spa = spa;
spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 583d6303b..2cf4d2b48 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@@ -47,6 +48,8 @@ extern void metaslab_sync_reassess(metaslab_group_t *mg);
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_GANG_HEADER 0x2
+#define METASLAB_GANG_CHILD 0x4
+#define METASLAB_GANG_AVOID 0x8
extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 07988dd51..6c670a162 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
@@ -52,6 +53,7 @@ struct metaslab_group {
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
uint64_t mg_bonus_area;
+ uint64_t mg_alloc_failures;
int64_t mg_bias;
int64_t mg_activation_count;
metaslab_class_t *mg_class;
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 52737ebc2..c9028fb09 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPA_H
@@ -698,6 +699,13 @@ _NOTE(CONSTCOND) } while (0)
#define dprintf_bp(bp, fmt, ...)
#endif
+extern boolean_t spa_debug_enabled(spa_t *spa);
+#define spa_dbgmsg(spa, ...) \
+{ \
+ if (spa_debug_enabled(spa)) \
+ zfs_dbgmsg(__VA_ARGS__); \
+}
+
extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */
#ifdef __cplusplus
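
Aside: spa_dbgmsg() above only forwards to zfs_dbgmsg() when the pool's spa_debug flag is set (ztest enables it right after spa_open() in the first hunk). A minimal sketch of the gating pattern, with spa_t, spa_debug_enabled() and zfs_dbgmsg() mocked as stand-ins so it builds outside the ZFS tree:

#include <stdarg.h>
#include <stdio.h>

typedef int boolean_t;
typedef struct spa { boolean_t spa_debug; } spa_t;	/* stand-in for the real spa_t */

static boolean_t
spa_debug_enabled(spa_t *spa)
{
	return (spa->spa_debug);
}

/* Mock: the real zfs_dbgmsg() logs to the in-kernel dbgmsg buffer. */
static void
zfs_dbgmsg(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	fputc('\n', stderr);
	va_end(ap);
}

#define spa_dbgmsg(spa, ...) \
{ \
	if (spa_debug_enabled(spa)) \
		zfs_dbgmsg(__VA_ARGS__); \
}

int
main(void)
{
	spa_t pool = { 0 };

	spa_dbgmsg(&pool, "dropped: spa_debug is off");
	pool.spa_debug = 1;
	spa_dbgmsg(&pool, "logged: txg %llu", 123ULL);
	return (0);
}
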
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 1c34873b6..3f5cd9a73 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPA_IMPL_H
@@ -196,6 +197,7 @@ struct spa {
kcondvar_t spa_suspend_cv; /* notification of resume */
uint8_t spa_suspended; /* pool is suspended */
uint8_t spa_claiming; /* pool is doing zil_claim() */
+ boolean_t spa_debug; /* debug enabled? */
boolean_t spa_is_root; /* pool is root */
int spa_minref; /* num refs when first opened */
int spa_mode; /* FREAD | FWRITE */
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 56c46100d..b089f1eac 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -30,12 +31,31 @@
#include <sys/vdev_impl.h>
#include <sys/zio.h>
-#define WITH_NDF_BLOCK_ALLOCATOR
+#define WITH_DF_BLOCK_ALLOCATOR
+
+/*
+ * Allow allocations to switch to gang blocks quickly. We do this to
+ * avoid having to load lots of space_maps in a given txg. There are,
+ * however, some cases where we want to avoid "fast" ganging and instead
+ * we want to do an exhaustive search of all metaslabs on this device.
+ * Currently we don't allow any gang or dump device related allocations
+ * to "fast" gang.
+ */
+#define CAN_FASTGANG(flags) \
+ (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
+ METASLAB_GANG_AVOID)))
uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
+ * This value defines the number of allowed allocation failures per vdev.
+ * If a device reaches this threshold in a given txg then we consider skipping
+ * allocations on that device.
+ */
+int zfs_mg_alloc_failures;
+
+/*
* Metaslab debugging: when set, keeps all space maps in core to verify frees.
*/
static int metaslab_debug = 0;
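
For illustration, a self-contained sketch of the CAN_FASTGANG() predicate using the flag values this change adds to include/sys/metaslab.h: only an allocation carrying none of the gang-related flags may take the fast-gang shortcut.

#include <stdio.h>

#define METASLAB_GANG_HEADER	0x2
#define METASLAB_GANG_CHILD	0x4
#define METASLAB_GANG_AVOID	0x8

#define CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))

int
main(void)
{
	/* Ordinary data write: fast ganging is permitted. */
	printf("%d\n", CAN_FASTGANG(0));			/* 1 */
	/* Gang child or dump-device (GANG_AVOID) write: it is not. */
	printf("%d\n", CAN_FASTGANG(METASLAB_GANG_CHILD));	/* 0 */
	printf("%d\n", CAN_FASTGANG(METASLAB_GANG_AVOID));	/* 0 */
	return (0);
}
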
@@ -865,7 +885,7 @@ metaslab_prefetch(metaslab_group_t *mg)
}
static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
metaslab_group_t *mg = msp->ms_group;
space_map_t *sm = &msp->ms_map;
@@ -899,13 +919,6 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
mutex_exit(&mg->mg_lock);
}
- /*
- * If we were able to load the map then make sure
- * that this map is still able to satisfy our request.
- */
- if (msp->ms_weight < size)
- return (ENOSPC);
-
metaslab_group_sort(msp->ms_group, msp,
msp->ms_weight | activation_weight);
}
@@ -1123,6 +1136,7 @@ void
metaslab_sync_reassess(metaslab_group_t *mg)
{
vdev_t *vd = mg->mg_vd;
+ int64_t failures = mg->mg_alloc_failures;
int m;
/*
@@ -1140,6 +1154,8 @@ metaslab_sync_reassess(metaslab_group_t *mg)
mutex_exit(&msp->ms_lock);
}
+ atomic_add_64(&mg->mg_alloc_failures, -failures);
+
/*
* Prefetch the next potential metaslabs
*/
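
Note that the failure-count reset above subtracts a snapshot taken at entry rather than storing zero, so failures recorded by concurrent allocators while reassessment runs carry over instead of being lost. A minimal sketch of the same pattern, using GCC/Clang builtins as a stand-in for atomic_add_64():

#include <stdint.h>

/* Stand-in for atomic_add_64(): atomically add 'delta' to '*c'. */
static void
counter_add(volatile uint64_t *c, int64_t delta)
{
	(void) __atomic_add_fetch(c, (uint64_t)delta, __ATOMIC_SEQ_CST);
}

static void
reassess_reset(volatile uint64_t *alloc_failures)
{
	int64_t failures = (int64_t)*alloc_failures;	/* snapshot at entry */

	/* ... per-metaslab work happens here in the real function ... */

	/*
	 * Subtract the snapshot instead of writing 0: anything added to
	 * the counter after the snapshot was taken is preserved.
	 */
	counter_add(alloc_failures, -failures);
}
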
@@ -1164,9 +1180,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
}
static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
- uint64_t min_distance, dva_t *dva, int d)
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
+ uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
{
+ spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
avl_tree_t *t = &mg->mg_metaslab_tree;
@@ -1187,11 +1204,17 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
mutex_enter(&mg->mg_lock);
for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
- if (msp->ms_weight < size) {
+ if (msp->ms_weight < asize) {
+ spa_dbgmsg(spa, "%s: failed to meet weight "
+ "requirement: vdev %llu, txg %llu, mg %p, "
+ "msp %p, psize %llu, asize %llu, "
+ "failures %llu, weight %llu",
+ spa_name(spa), mg->mg_vd->vdev_id, txg,
+ mg, msp, psize, asize,
+ mg->mg_alloc_failures, msp->ms_weight);
mutex_exit(&mg->mg_lock);
return (-1ULL);
}
-
was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
if (activation_weight == METASLAB_WEIGHT_PRIMARY)
break;
@@ -1210,6 +1233,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
if (msp == NULL)
return (-1ULL);
+ /*
+ * If we've already reached the allowable number of failed
+ * allocation attempts on this metaslab group then we
+ * consider skipping it. We skip it only if we're allowed
+ * to "fast" gang, the physical size is larger than
+ * a gang block, and we're attempting to allocate from
+ * the primary metaslab.
+ */
+ if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
+ CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
+ activation_weight == METASLAB_WEIGHT_PRIMARY) {
+ spa_dbgmsg(spa, "%s: skipping metaslab group: "
+ "vdev %llu, txg %llu, mg %p, psize %llu, "
+ "asize %llu, failures %llu", spa_name(spa),
+ mg->mg_vd->vdev_id, txg, mg, psize, asize,
+ mg->mg_alloc_failures);
+ return (-1ULL);
+ }
+
mutex_enter(&msp->ms_lock);
/*
@@ -1218,7 +1260,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
* another thread may have changed the weight while we
* were blocked on the metaslab lock.
*/
- if (msp->ms_weight < size || (was_active &&
+ if (msp->ms_weight < asize || (was_active &&
!(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
activation_weight == METASLAB_WEIGHT_PRIMARY)) {
mutex_exit(&msp->ms_lock);
@@ -1233,14 +1275,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
continue;
}
- if (metaslab_activate(msp, activation_weight, size) != 0) {
+ if (metaslab_activate(msp, activation_weight) != 0) {
mutex_exit(&msp->ms_lock);
continue;
}
- if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+ if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
break;
+ atomic_inc_64(&mg->mg_alloc_failures);
+
metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
mutex_exit(&msp->ms_lock);
@@ -1249,7 +1293,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
- space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+ space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
mutex_exit(&msp->ms_lock);
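
Taken together, the metaslab_group_alloc() changes record a failure (atomic_inc_64 above) each time space_map_alloc() comes up empty, and a group that has exceeded zfs_mg_alloc_failures is skipped for primary allocations that are larger than a gang block and allowed to fast-gang. A condensed sketch of that skip decision with simplified parameters (the threshold is passed in rather than read from the global, and SPA_GANGBLOCKSIZE uses an illustrative value):

#include <stdint.h>

#define METASLAB_GANG_HEADER	0x2
#define METASLAB_GANG_CHILD	0x4
#define METASLAB_GANG_AVOID	0x8

#define CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))

#define SPA_GANGBLOCKSIZE	512	/* illustrative stand-in */

/*
 * Return 1 when a metaslab group with 'failures' recorded failures should
 * be skipped for a primary allocation of 'psize' bytes, 0 otherwise.
 */
static int
should_skip_group(uint64_t failures, uint64_t threshold, uint64_t psize,
    int flags)
{
	return (failures > threshold && CAN_FASTGANG(flags) &&
	    psize > SPA_GANGBLOCKSIZE);
}
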
@@ -1376,7 +1420,8 @@ top:
asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
- offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+ offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
+ dva, d, flags);
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
@@ -1388,18 +1433,24 @@ top:
vdev_stat_t *vs = &vd->vdev_stat;
int64_t vu, cu;
- /*
- * Determine percent used in units of 0..1024.
- * (This is just to avoid floating point.)
- */
- vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
- cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
+ vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
+ cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
/*
- * Bias by at most +/- 25% of the aliquot.
+ * Calculate how much more or less we should
+ * try to allocate from this device during
+ * this iteration around the rotor.
+ * For example, if a device is 80% full
+ * and the pool is 20% full then we should
+ * reduce allocations by 60% on this device.
+ *
+ * mg_bias = (20 - 80) * 512K / 100 = -307K
+ *
+ * This reduces allocations by 307K for this
+ * iteration.
*/
mg->mg_bias = ((cu - vu) *
- (int64_t)mg->mg_aliquot) / (1024 * 4);
+ (int64_t)mg->mg_aliquot) / 100;
}
if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
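
The worked example in the comment above can be checked directly: with the pool 20% full, the device 80% full, and the default 512K metaslab_aliquot, the integer arithmetic yields roughly -307K. A tiny verification sketch:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int64_t cu = 20;			/* pool (class) percent used */
	int64_t vu = 80;			/* vdev percent used */
	int64_t mg_aliquot = 512LL << 10;	/* default metaslab_aliquot */
	int64_t mg_bias = ((cu - vu) * mg_aliquot) / 100;

	/* Prints: mg_bias = -314572 bytes (-307 K) */
	printf("mg_bias = %lld bytes (%lld K)\n",
	    (long long)mg_bias, (long long)(mg_bias / 1024));
	return (0);
}
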
@@ -1513,7 +1564,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock);
if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
error = ENOENT;
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 487a76d71..e4e0c35f0 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -1680,6 +1681,12 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
return (0);
}
+boolean_t
+spa_debug_enabled(spa_t *spa)
+{
+ return (spa->spa_debug);
+}
+
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Namespace manipulation */
EXPORT_SYMBOL(spa_lookup);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 0fa823687..0022c64cc 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -79,6 +80,7 @@ int zio_delay_max = ZIO_DELAY_MAX;
#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
+extern int zfs_mg_alloc_failures;
/*
* An allocating zio is one that either currently has the DVA allocate
@@ -158,6 +160,12 @@ zio_init(void)
zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
}
+ /*
+ * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
+ * to fail 3 times per txg or 8 failures, whichever is greater.
+ */
+ zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
+
zio_inject_init();
}
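
For a sense of scale, the threshold set above is max(3 * max_ncpus / 2, 8): the floor of 8 on small machines, 12 with 8 CPUs, 48 with 32. A quick sketch of the computation (max_ncpus stands in for the kernel's CPU count):

#include <stdio.h>

#define MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	int cpus[] = { 1, 2, 8, 32 };
	int i;

	for (i = 0; i < 4; i++) {
		int max_ncpus = cpus[i];
		int zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);

		/* 1 -> 8, 2 -> 8, 8 -> 12, 32 -> 48 */
		printf("%2d cpus: %d allowed failures per txg\n",
		    max_ncpus, zfs_mg_alloc_failures);
	}
	return (0);
}
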
@@ -2151,6 +2159,7 @@ zio_dva_allocate(zio_t *zio)
metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = zio->io_bp;
int error;
+ int flags = 0;
if (zio->io_gang_leader == NULL) {
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
@@ -2163,10 +2172,21 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+ /*
+ * The dump device does not support gang blocks so allocation on
+ * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
+ * the "fast" gang feature.
+ */
+ flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
+ flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
+ METASLAB_GANG_CHILD : 0;
error = metaslab_alloc(spa, mc, zio->io_size, bp,
- zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
if (error) {
+ spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
+ "size %llu, error %d", spa_name(spa), zio, zio->io_size,
+ error);
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
return (zio_write_gang_block(zio));
zio->io_error = error;
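
These two flag assignments are the only place the new METASLAB_GANG_AVOID and METASLAB_GANG_CHILD bits get set: NODATA (dump-device) writes must never fast-gang, and gang children are already the product of ganging. A minimal sketch of the mapping, with plain booleans standing in for the ZIO_FLAG_* tests (metaslab_flags_for_zio() is a hypothetical helper, not part of the patch):

#include <stdio.h>

#define METASLAB_GANG_CHILD	0x4
#define METASLAB_GANG_AVOID	0x8

/* Hypothetical helper: derive metaslab flags from two zio properties. */
static int
metaslab_flags_for_zio(int is_nodata, int is_gang_child)
{
	int flags = 0;

	flags |= is_nodata ? METASLAB_GANG_AVOID : 0;
	flags |= is_gang_child ? METASLAB_GANG_CHILD : 0;
	return (flags);
}

int
main(void)
{
	printf("0x%x\n", metaslab_flags_for_zio(0, 0));	/* 0x0 */
	printf("0x%x\n", metaslab_flags_for_zio(1, 0));	/* 0x8 */
	printf("0x%x\n", metaslab_flags_for_zio(0, 1));	/* 0x4 */
	return (0);
}
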