-rw-r--r--   cmd/ztest/ztest.c             2
-rw-r--r--   include/sys/metaslab.h        3
-rw-r--r--   include/sys/metaslab_impl.h   2
-rw-r--r--   include/sys/spa.h             8
-rw-r--r--   include/sys/spa_impl.h        2
-rw-r--r--   module/zfs/metaslab.c       105
-rw-r--r--   module/zfs/spa_misc.c         7
-rw-r--r--   module/zfs/zio.c             22
8 files changed, 123 insertions, 28 deletions
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 6acba5290..235bf56ef 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
/*
@@ -5300,6 +5301,7 @@ ztest_run(ztest_shared_t *zs)
*/
kernel_init(FREAD | FWRITE);
VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0);
+ spa->spa_debug = B_TRUE;
zs->zs_spa = spa;
spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 583d6303b..2cf4d2b48 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@@ -47,6 +48,8 @@ extern void metaslab_sync_reassess(metaslab_group_t *mg);
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_GANG_HEADER 0x2
+#define METASLAB_GANG_CHILD 0x4
+#define METASLAB_GANG_AVOID 0x8
extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 07988dd51..6c670a162 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
@@ -52,6 +53,7 @@ struct metaslab_group {
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
uint64_t mg_bonus_area;
+ uint64_t mg_alloc_failures;
int64_t mg_bias;
int64_t mg_activation_count;
metaslab_class_t *mg_class;
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 52737ebc2..c9028fb09 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPA_H
@@ -698,6 +699,13 @@ _NOTE(CONSTCOND) } while (0)
#define dprintf_bp(bp, fmt, ...)
#endif
+extern boolean_t spa_debug_enabled(spa_t *spa);
+#define spa_dbgmsg(spa, ...) \
+{ \
+ if (spa_debug_enabled(spa)) \
+ zfs_dbgmsg(__VA_ARGS__); \
+}
+
extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */
#ifdef __cplusplus
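
Aside: spa_dbgmsg() above only forwards to zfs_dbgmsg() when the pool's spa_debug flag is set (ztest enables it right after spa_open() in the first hunk). A minimal sketch of the gating pattern, with spa_t, spa_debug_enabled() and zfs_dbgmsg() mocked as stand-ins so it builds outside the ZFS tree:

#include <stdarg.h>
#include <stdio.h>

typedef int boolean_t;
typedef struct spa { boolean_t spa_debug; } spa_t;	/* stand-in for the real spa_t */

static boolean_t
spa_debug_enabled(spa_t *spa)
{
	return (spa->spa_debug);
}

/* Mock: the real zfs_dbgmsg() logs to the in-kernel dbgmsg buffer. */
static void
zfs_dbgmsg(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	fputc('\n', stderr);
	va_end(ap);
}

#define spa_dbgmsg(spa, ...) \
{ \
	if (spa_debug_enabled(spa)) \
		zfs_dbgmsg(__VA_ARGS__); \
}

int
main(void)
{
	spa_t pool = { 0 };

	spa_dbgmsg(&pool, "dropped: spa_debug is off");
	pool.spa_debug = 1;
	spa_dbgmsg(&pool, "logged: txg %llu", 123ULL);
	return (0);
}
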
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 1c34873b6..3f5cd9a73 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPA_IMPL_H
@@ -196,6 +197,7 @@ struct spa {
kcondvar_t spa_suspend_cv; /* notification of resume */
uint8_t spa_suspended; /* pool is suspended */
uint8_t spa_claiming; /* pool is doing zil_claim() */
+ boolean_t spa_debug; /* debug enabled? */
boolean_t spa_is_root; /* pool is root */
int spa_minref; /* num refs when first opened */
int spa_mode; /* FREAD | FWRITE */
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 56c46100d..b089f1eac 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -30,12 +31,31 @@
#include <sys/vdev_impl.h>
#include <sys/zio.h>
-#define WITH_NDF_BLOCK_ALLOCATOR
+#define WITH_DF_BLOCK_ALLOCATOR
+
+/*
+ * Allow allocations to switch to gang blocks quickly. We do this to
+ * avoid having to load lots of space_maps in a given txg. There are,
+ * however, some cases where we want to avoid "fast" ganging and instead
+ * we want to do an exhaustive search of all metaslabs on this device.
+ * Currently we don't allow any gang or dump device related allocations
+ * to "fast" gang.
+ */
+#define CAN_FASTGANG(flags) \
+ (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
+ METASLAB_GANG_AVOID)))
uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
+ * This value defines the number of allowed allocation failures per vdev.
+ * If a device reaches this threshold in a given txg then we consider skipping
+ * allocations on that device.
+ */
+int zfs_mg_alloc_failures;
+
+/*
* Metaslab debugging: when set, keeps all space maps in core to verify frees.
*/
static int metaslab_debug = 0;
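
For illustration, a self-contained sketch of the CAN_FASTGANG() predicate using the flag values this change adds to include/sys/metaslab.h: only an allocation carrying none of the gang-related flags may take the fast-gang shortcut.

#include <stdio.h>

#define METASLAB_GANG_HEADER	0x2
#define METASLAB_GANG_CHILD	0x4
#define METASLAB_GANG_AVOID	0x8

#define CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))

int
main(void)
{
	/* Ordinary data write: fast ganging is permitted. */
	printf("%d\n", CAN_FASTGANG(0));			/* 1 */
	/* Gang child or dump-device (GANG_AVOID) write: it is not. */
	printf("%d\n", CAN_FASTGANG(METASLAB_GANG_CHILD));	/* 0 */
	printf("%d\n", CAN_FASTGANG(METASLAB_GANG_AVOID));	/* 0 */
	return (0);
}
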
@@ -865,7 +885,7 @@ metaslab_prefetch(metaslab_group_t *mg)
}
static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
metaslab_group_t *mg = msp->ms_group;
space_map_t *sm = &msp->ms_map;
@@ -899,13 +919,6 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
mutex_exit(&mg->mg_lock);
}
- /*
- * If we were able to load the map then make sure
- * that this map is still able to satisfy our request.
- */
- if (msp->ms_weight < size)
- return (ENOSPC);
-
metaslab_group_sort(msp->ms_group, msp,
msp->ms_weight | activation_weight);
}
@@ -1123,6 +1136,7 @@ void
metaslab_sync_reassess(metaslab_group_t *mg)
{
vdev_t *vd = mg->mg_vd;
+ int64_t failures = mg->mg_alloc_failures;
int m;
/*
@@ -1140,6 +1154,8 @@ metaslab_sync_reassess(metaslab_group_t *mg)
mutex_exit(&msp->ms_lock);
}
+ atomic_add_64(&mg->mg_alloc_failures, -failures);
+
/*
* Prefetch the next potential metaslabs
*/
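
Note that the failure-count reset above subtracts a snapshot taken at entry rather than storing zero, so failures recorded by concurrent allocators while reassessment runs carry over instead of being lost. A minimal sketch of the same pattern, using GCC/Clang builtins as a stand-in for atomic_add_64():

#include <stdint.h>

/* Stand-in for atomic_add_64(): atomically add 'delta' to '*c'. */
static void
counter_add(volatile uint64_t *c, int64_t delta)
{
	(void) __atomic_add_fetch(c, (uint64_t)delta, __ATOMIC_SEQ_CST);
}

static void
reassess_reset(volatile uint64_t *alloc_failures)
{
	int64_t failures = (int64_t)*alloc_failures;	/* snapshot at entry */

	/* ... per-metaslab work happens here in the real function ... */

	/*
	 * Subtract the snapshot instead of writing 0: anything added to
	 * the counter after the snapshot was taken is preserved.
	 */
	counter_add(alloc_failures, -failures);
}
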
@@ -1164,9 +1180,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
}
static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
- uint64_t min_distance, dva_t *dva, int d)
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
+ uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
{
+ spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
avl_tree_t *t = &mg->mg_metaslab_tree;
@@ -1187,11 +1204,17 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
mutex_enter(&mg->mg_lock);
for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
- if (msp->ms_weight < size) {
+ if (msp->ms_weight < asize) {
+ spa_dbgmsg(spa, "%s: failed to meet weight "
+ "requirement: vdev %llu, txg %llu, mg %p, "
+ "msp %p, psize %llu, asize %llu, "
+ "failures %llu, weight %llu",
+ spa_name(spa), mg->mg_vd->vdev_id, txg,
+ mg, msp, psize, asize,
+ mg->mg_alloc_failures, msp->ms_weight);
mutex_exit(&mg->mg_lock);
return (-1ULL);
}
-
was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
if (activation_weight == METASLAB_WEIGHT_PRIMARY)
break;
@@ -1210,6 +1233,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
if (msp == NULL)
return (-1ULL);
+ /*
+ * If we've already reached the allowable number of failed
+ * allocation attempts on this metaslab group then we
+ * consider skipping it. We skip it only if we're allowed
+ * to "fast" gang, the physical size is larger than
+ * a gang block, and we're attempting to allocate from
+ * the primary metaslab.
+ */
+ if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
+ CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
+ activation_weight == METASLAB_WEIGHT_PRIMARY) {
+ spa_dbgmsg(spa, "%s: skipping metaslab group: "
+ "vdev %llu, txg %llu, mg %p, psize %llu, "
+ "asize %llu, failures %llu", spa_name(spa),
+ mg->mg_vd->vdev_id, txg, mg, psize, asize,
+ mg->mg_alloc_failures);
+ return (-1ULL);
+ }
+
mutex_enter(&msp->ms_lock);
/*
@@ -1218,7 +1260,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
* another thread may have changed the weight while we
* were blocked on the metaslab lock.
*/
- if (msp->ms_weight < size || (was_active &&
+ if (msp->ms_weight < asize || (was_active &&
!(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
activation_weight == METASLAB_WEIGHT_PRIMARY)) {
mutex_exit(&msp->ms_lock);
@@ -1233,14 +1275,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
continue;
}
- if (metaslab_activate(msp, activation_weight, size) != 0) {
+ if (metaslab_activate(msp, activation_weight) != 0) {
mutex_exit(&msp->ms_lock);
continue;
}
- if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+ if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
break;
+ atomic_inc_64(&mg->mg_alloc_failures);
+
metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
mutex_exit(&msp->ms_lock);
@@ -1249,7 +1293,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
- space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+ space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
mutex_exit(&msp->ms_lock);
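
Taken together, the metaslab_group_alloc() changes record a failure (atomic_inc_64 above) each time space_map_alloc() comes up empty, and a group that has exceeded zfs_mg_alloc_failures is skipped for primary allocations that are larger than a gang block and allowed to fast-gang. A condensed sketch of that skip decision with simplified parameters (the threshold is passed in rather than read from the global, and SPA_GANGBLOCKSIZE uses an illustrative value):

#include <stdint.h>

#define METASLAB_GANG_HEADER	0x2
#define METASLAB_GANG_CHILD	0x4
#define METASLAB_GANG_AVOID	0x8

#define CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))

#define SPA_GANGBLOCKSIZE	512	/* illustrative stand-in */

/*
 * Return 1 when a metaslab group with 'failures' recorded failures should
 * be skipped for a primary allocation of 'psize' bytes, 0 otherwise.
 */
static int
should_skip_group(uint64_t failures, uint64_t threshold, uint64_t psize,
    int flags)
{
	return (failures > threshold && CAN_FASTGANG(flags) &&
	    psize > SPA_GANGBLOCKSIZE);
}
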
@@ -1376,7 +1420,8 @@ top:
asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
- offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+ offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
+ dva, d, flags);
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
@@ -1388,18 +1433,24 @@ top:
vdev_stat_t *vs = &vd->vdev_stat;
int64_t vu, cu;
- /*
- * Determine percent used in units of 0..1024.
- * (This is just to avoid floating point.)
- */
- vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
- cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
+ vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
+ cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
/*
- * Bias by at most +/- 25% of the aliquot.
+ * Calculate how much more or less we should
+ * try to allocate from this device during
+ * this iteration around the rotor.
+ * For example, if a device is 80% full
+ * and the pool is 20% full then we should
+ * reduce allocations by 60% on this device.
+ *
+ * mg_bias = (20 - 80) * 512K / 100 = -307K
+ *
+ * This reduces allocations by 307K for this
+ * iteration.
*/
mg->mg_bias = ((cu - vu) *
- (int64_t)mg->mg_aliquot) / (1024 * 4);
+ (int64_t)mg->mg_aliquot) / 100;
}
if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
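
The worked example in the comment above can be checked directly: with the pool 20% full, the device 80% full, and the default 512K metaslab_aliquot, the integer arithmetic yields roughly -307K. A tiny verification sketch:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int64_t cu = 20;			/* pool (class) percent used */
	int64_t vu = 80;			/* vdev percent used */
	int64_t mg_aliquot = 512LL << 10;	/* default metaslab_aliquot */
	int64_t mg_bias = ((cu - vu) * mg_aliquot) / 100;

	/* Prints: mg_bias = -314572 bytes (-307 K) */
	printf("mg_bias = %lld bytes (%lld K)\n",
	    (long long)mg_bias, (long long)(mg_bias / 1024));
	return (0);
}
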
@@ -1513,7 +1564,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock);
if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
error = ENOENT;
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 487a76d71..e4e0c35f0 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -1680,6 +1681,12 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
return (0);
}
+boolean_t
+spa_debug_enabled(spa_t *spa)
+{
+ return (spa->spa_debug);
+}
+
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Namespace manipulation */
EXPORT_SYMBOL(spa_lookup);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 0fa823687..0022c64cc 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -79,6 +80,7 @@ int zio_delay_max = ZIO_DELAY_MAX;
#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
+extern int zfs_mg_alloc_failures;
/*
* An allocating zio is one that either currently has the DVA allocate
@@ -158,6 +160,12 @@ zio_init(void)
zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
}
+ /*
+ * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
+ * to fail 3 times per txg or 8 failures, whichever is greater.
+ */
+ zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
+
zio_inject_init();
}
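
For a sense of scale, the threshold set above is max(3 * max_ncpus / 2, 8): the floor of 8 on small machines, 12 with 8 CPUs, 48 with 32. A quick sketch of the computation (max_ncpus stands in for the kernel's CPU count):

#include <stdio.h>

#define MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	int cpus[] = { 1, 2, 8, 32 };
	int i;

	for (i = 0; i < 4; i++) {
		int max_ncpus = cpus[i];
		int zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);

		/* 1 -> 8, 2 -> 8, 8 -> 12, 32 -> 48 */
		printf("%2d cpus: %d allowed failures per txg\n",
		    max_ncpus, zfs_mg_alloc_failures);
	}
	return (0);
}
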
@@ -2151,6 +2159,7 @@ zio_dva_allocate(zio_t *zio)
metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = zio->io_bp;
int error;
+ int flags = 0;
if (zio->io_gang_leader == NULL) {
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
@@ -2163,10 +2172,21 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+ /*
+ * The dump device does not support gang blocks so allocation on
+ * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
+ * the "fast" gang feature.
+ */
+ flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
+ flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
+ METASLAB_GANG_CHILD : 0;
error = metaslab_alloc(spa, mc, zio->io_size, bp,
- zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
if (error) {
+ spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
+ "size %llu, error %d", spa_name(spa), zio, zio->io_size,
+ error);
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
return (zio_write_gang_block(zio));
zio->io_error = error;
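
These two flag assignments are the only place the new METASLAB_GANG_AVOID and METASLAB_GANG_CHILD bits get set: NODATA (dump-device) writes must never fast-gang, and gang children are already the product of ganging. A minimal sketch of the mapping, with plain booleans standing in for the ZIO_FLAG_* tests (metaslab_flags_for_zio() is a hypothetical helper, not part of the patch):

#include <stdio.h>

#define METASLAB_GANG_CHILD	0x4
#define METASLAB_GANG_AVOID	0x8

/* Hypothetical helper: derive metaslab flags from two zio properties. */
static int
metaslab_flags_for_zio(int is_nodata, int is_gang_child)
{
	int flags = 0;

	flags |= is_nodata ? METASLAB_GANG_AVOID : 0;
	flags |= is_gang_child ? METASLAB_GANG_CHILD : 0;
	return (flags);
}

int
main(void)
{
	printf("0x%x\n", metaslab_flags_for_zio(0, 0));	/* 0x0 */
	printf("0x%x\n", metaslab_flags_for_zio(1, 0));	/* 0x8 */
	printf("0x%x\n", metaslab_flags_for_zio(0, 1));	/* 0x4 */
	return (0);
}
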