about summary refs log tree commit diff stats
diff options: context | space | mode
-rw-r--r--	man/man5/zfs-module-parameters.5	34
-rw-r--r--	module/zfs/metaslab.c	143
2 files changed, 117 insertions(+), 60 deletions(-)
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 604f2f6c9..3ed7bc6e4 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -328,6 +328,40 @@ Use \fB1\fR for yes (default) and \fB0\fR for no.
.sp
.ne 2
.na
+\fBmetaslab_df_max_search\fR (int)
+.ad
+.RS 12n
+Maximum distance to search forward from the last offset. Without this limit,
+fragmented pools can see >100,000 iterations and metaslab_block_picker()
+becomes the performance limiting factor on high-performance storage.
+
+With the default setting of 16MB, we typically see less than 500 iterations,
+even with very fragmented, ashift=9 pools. The maximum number of iterations
+possible is: \fBmetaslab_df_max_search / (2 * (1<<ashift))\fR.
+With the default setting of 16MB this is 16*1024 (with ashift=9) or 2048
+(with ashift=12).
+.sp
+Default value: \fB16,777,216\fR (16MB)
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_df_use_largest_segment\fR (int)
+.ad
+.RS 12n
+If we are not searching forward (due to metaslab_df_max_search,
+metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable controls
+what segment is used. If it is set, we will use the largest free segment.
+If it is not set, we will use a segment of exactly the requested size (or
+larger).
+.sp
+Use \fB1\fR for yes and \fB0\fR for no (default).
+.RE
+
+.sp
+.ne 2
+.na
\fBzfs_vdev_default_ms_count\fR (int)
.ad
.RS 12n
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 41cbaad5f..92310aaf9 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -160,6 +160,30 @@ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
int metaslab_df_free_pct = 4;
/*
+ * Maximum distance to search forward from the last offset. Without this
+ * limit, fragmented pools can see >100,000 iterations and
+ * metaslab_block_picker() becomes the performance limiting factor on
+ * high-performance storage.
+ *
+ * With the default setting of 16MB, we typically see less than 500
+ * iterations, even with very fragmented, ashift=9 pools. The maximum number
+ * of iterations possible is:
+ * metaslab_df_max_search / (2 * (1<<ashift))
+ * With the default setting of 16MB this is 16*1024 (with ashift=9) or
+ * 2048 (with ashift=12).
+ */
+int metaslab_df_max_search = 16 * 1024 * 1024;
+
+/*
+ * If we are not searching forward (due to metaslab_df_max_search,
+ * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
+ * controls what segment is used. If it is set, we will use the largest free
+ * segment. If it is not set, we will use a segment of exactly the requested
+ * size (or larger).
+ */
+int metaslab_df_use_largest_segment = B_FALSE;
+
+/*
* Percentage of all cpus that can be used by the metaslab taskq.
*/
int metaslab_load_pct = 50;
@@ -1200,8 +1224,7 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
return (rs);
}
-#if defined(WITH_FF_BLOCK_ALLOCATOR) || \
- defined(WITH_DF_BLOCK_ALLOCATOR) || \
+#if defined(WITH_DF_BLOCK_ALLOCATOR) || \
defined(WITH_CF_BLOCK_ALLOCATOR)
/*
* This is a helper function that can be used by the allocator to find
@@ -1210,13 +1233,16 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
*/
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
- uint64_t align)
+ uint64_t max_search)
{
range_seg_t *rs = metaslab_block_find(t, *cursor, size);
+ uint64_t first_found;
- while (rs != NULL) {
- uint64_t offset = P2ROUNDUP(rs->rs_start, align);
+ if (rs != NULL)
+ first_found = rs->rs_start;
+ while (rs != NULL && rs->rs_start - first_found <= max_search) {
+ uint64_t offset = rs->rs_start;
if (offset + size <= rs->rs_end) {
*cursor = offset + size;
return (offset);
@@ -1224,55 +1250,30 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
rs = AVL_NEXT(t, rs);
}
- /*
- * If we know we've searched the whole map (*cursor == 0), give up.
- * Otherwise, reset the cursor to the beginning and try again.
- */
- if (*cursor == 0)
- return (-1ULL);
-
*cursor = 0;
- return (metaslab_block_picker(t, cursor, size, align));
-}
-#endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */
-
-#if defined(WITH_FF_BLOCK_ALLOCATOR)
-/*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
- */
-static uint64_t
-metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
-{
- /*
- * Find the largest power of 2 block size that evenly divides the
- * requested size. This is used to try to allocate blocks with similar
- * alignment from the same area of the metaslab (i.e. same cursor
- * bucket) but it does not guarantee that other allocations sizes
- * may exist in the same region.
- */
- uint64_t align = size & -size;
- uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
- avl_tree_t *t = &msp->ms_allocatable->rt_root;
-
- return (metaslab_block_picker(t, cursor, size, align));
+ return (-1ULL);
}
-
-static metaslab_ops_t metaslab_ff_ops = {
- metaslab_ff_alloc
-};
-
-metaslab_ops_t *zfs_metaslab_ops = &metaslab_ff_ops;
-#endif /* WITH_FF_BLOCK_ALLOCATOR */
+#endif /* WITH_DF/CF_BLOCK_ALLOCATOR */
#if defined(WITH_DF_BLOCK_ALLOCATOR)
/*
* ==========================================================================
- * Dynamic block allocator -
- * Uses the first fit allocation scheme until space get low and then
- * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
- * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ * Dynamic Fit (df) block allocator
+ *
+ * Search for a free chunk of at least this size, starting from the last
+ * offset (for this alignment of block) looking for up to
+ * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not
+ * found within 16MB, then return a free chunk of exactly the requested size (or
+ * larger).
+ *
+ * If it seems like searching from the last offset will be unproductive, skip
+ * that and just return a free chunk of exactly the requested size (or larger).
+ * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This
+ * mechanism is probably not very useful and may be removed in the future.
+ *
+ * The behavior when not searching can be changed to return the largest free
+ * chunk, instead of a free chunk of exactly the requested size, by setting
+ * metaslab_df_use_largest_segment.
* ==========================================================================
*/
static uint64_t
@@ -1288,28 +1289,42 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
uint64_t align = size & -size;
uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
range_tree_t *rt = msp->ms_allocatable;
- avl_tree_t *t = &rt->rt_root;
- uint64_t max_size = metaslab_block_maxsize(msp);
int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
+ uint64_t offset;
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(avl_numnodes(t), ==,
+ ASSERT3U(avl_numnodes(&rt->rt_root), ==,
avl_numnodes(&msp->ms_allocatable_by_size));
- if (max_size < size)
- return (-1ULL);
-
/*
- * If we're running low on space switch to using the size
- * sorted AVL tree (best-fit).
+ * If we're running low on space, find a segment based on size,
+ * rather than iterating based on offset.
*/
- if (max_size < metaslab_df_alloc_threshold ||
+ if (metaslab_block_maxsize(msp) < metaslab_df_alloc_threshold ||
free_pct < metaslab_df_free_pct) {
- t = &msp->ms_allocatable_by_size;
- *cursor = 0;
+ offset = -1;
+ } else {
+ offset = metaslab_block_picker(&rt->rt_root,
+ cursor, size, metaslab_df_max_search);
}
- return (metaslab_block_picker(t, cursor, size, 1ULL));
+ if (offset == -1) {
+ range_seg_t *rs;
+ if (metaslab_df_use_largest_segment) {
+ /* use largest free segment */
+ rs = avl_last(&msp->ms_allocatable_by_size);
+ } else {
+ /* use segment of this size, or next largest */
+ rs = metaslab_block_find(&msp->ms_allocatable_by_size,
+ 0, size);
+ }
+ if (rs != NULL && rs->rs_start + size <= rs->rs_end) {
+ offset = rs->rs_start;
+ *cursor = offset + size;
+ }
+ }
+
+ return (offset);
}
static metaslab_ops_t metaslab_df_ops = {
@@ -4823,6 +4838,14 @@ MODULE_PARM_DESC(zfs_metaslab_switch_threshold,
module_param(metaslab_force_ganging, ulong, 0644);
MODULE_PARM_DESC(metaslab_force_ganging,
"blocks larger than this size are forced to be gang blocks");
+
+module_param(metaslab_df_max_search, int, 0644);
+MODULE_PARM_DESC(metaslab_df_max_search,
+ "max distance (bytes) to search forward before using size tree");
+
+module_param(metaslab_df_use_largest_segment, int, 0644);
+MODULE_PARM_DESC(metaslab_df_use_largest_segment,
+ "when looking in size tree, use largest segment instead of exact fit");
/* END CSTYLED */
#endif