diff options
author | Don Brady <[email protected]> | 2017-01-12 12:52:56 -0700 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2017-01-12 11:52:56 -0800 |
commit | 4e21fd060a567a9c8a1bd0e640985412181c1e33 (patch) | |
tree | 622424c5f98ac43cd93d8af221775a77df161856 /include/sys | |
parent | 5727b00e06a4208a7040489d582f13bc3c183384 (diff) |
OpenZFS 7303 - dynamic metaslab selection
This change introduces a new weighting algorithm to improve
metaslab selection. The new weighting algorithm relies on the
SPACEMAP_HISTOGRAM feature. As a result, the metaslab weight
now encodes the type of weighting algorithm used (size-based
vs segment-based).
Porting Notes: The metaslab allocation tracing code is conditionally
removed on linux (dependent on mdb debugger).
Authored by: George Wilson <[email protected]>
Reviewed by: Alex Reece <[email protected]>
Reviewed by: Chris Siden <[email protected]>
Reviewed by: Dan Kimmel <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: Paul Dagnelie <[email protected]>
Reviewed by: Pavel Zakharov [email protected]
Reviewed by: Prakash Surya <[email protected]>
Reviewed by: Don Brady <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Ported-by: Don Brady <[email protected]>
OpenZFS-issue: https://www.illumos.org/issues/7303
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/d5190931bd
Closes #5404
Diffstat (limited to 'include/sys')
-rw-r--r-- | include/sys/metaslab.h | 13 | ||||
-rw-r--r-- | include/sys/metaslab_impl.h | 108 | ||||
-rw-r--r-- | include/sys/zfs_context.h | 1 | ||||
-rw-r--r-- | include/sys/zfs_debug.h | 17 | ||||
-rw-r--r-- | include/sys/zio.h | 6 |
5 files changed, 131 insertions, 14 deletions
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 408f6d333..be271c702 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_H @@ -36,10 +36,12 @@ extern "C" { #endif + typedef struct metaslab_ops { - uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size); + uint64_t (*msop_alloc)(metaslab_t *, uint64_t); } metaslab_ops_t; + extern metaslab_ops_t *zfs_metaslab_ops; int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, @@ -64,13 +66,18 @@ uint64_t metaslab_block_maxsize(metaslab_t *); #define METASLAB_FASTWRITE 0x20 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, - blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *); + blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); void metaslab_fastwrite_mark(spa_t *, const blkptr_t *); void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *); +void metaslab_alloc_trace_init(void); +void metaslab_alloc_trace_fini(void); +void metaslab_trace_init(zio_alloc_list_t *); +void metaslab_trace_fini(zio_alloc_list_t *); + metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *); void metaslab_class_destroy(metaslab_class_t *); int metaslab_class_validate(metaslab_class_t *); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 080a1db9a..7b68b51f6 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H @@ -42,6 +42,94 @@ extern "C" { #endif /* + * Metaslab allocation tracing record. + */ +typedef struct metaslab_alloc_trace { + list_node_t mat_list_node; + metaslab_group_t *mat_mg; + metaslab_t *mat_msp; + uint64_t mat_size; + uint64_t mat_weight; + uint32_t mat_dva_id; + uint64_t mat_offset; +} metaslab_alloc_trace_t; + +/* + * Used by the metaslab allocation tracing facility to indicate + * error conditions. These errors are stored to the offset member + * of the metaslab_alloc_trace_t record and displayed by mdb. + */ +typedef enum trace_alloc_type { + TRACE_ALLOC_FAILURE = -1ULL, + TRACE_TOO_SMALL = -2ULL, + TRACE_FORCE_GANG = -3ULL, + TRACE_NOT_ALLOCATABLE = -4ULL, + TRACE_GROUP_FAILURE = -5ULL, + TRACE_ENOSPC = -6ULL, + TRACE_CONDENSING = -7ULL, + TRACE_VDEV_ERROR = -8ULL +} trace_alloc_type_t; + +#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) +#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) +#define METASLAB_WEIGHT_TYPE (1ULL << 61) +#define METASLAB_ACTIVE_MASK \ + (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) + +/* + * The metaslab weight is used to encode the amount of free space in a + * metaslab, such that the "best" metaslab appears first when sorting the + * metaslabs by weight. The weight (and therefore the "best" metaslab) can + * be determined in two different ways: by computing a weighted sum of all + * the free space in the metaslab (a space based weight) or by counting only + * the free segments of the largest size (a segment based weight). We prefer + * the segment based weight because it reflects how the free space is + * comprised, but we cannot always use it -- legacy pools do not have the + * space map histogram information necessary to determine the largest + * contiguous regions. Pools that have the space map histogram determine + * the segment weight by looking at each bucket in the histogram and + * determining the free space whose size in bytes is in the range: + * [2^i, 2^(i+1)) + * We then encode the largest index, i, that contains regions into the + * segment-weighted value. + * + * Space-based weight: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * |PS1| weighted-free space | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * PS - indicates primary and secondary activation + * space - the fragmentation-weighted space + * + * Segment-based weight: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * |PS0| idx| count of segments in region | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * PS - indicates primary and secondary activation + * idx - index for the highest bucket in the histogram + * count - number of segments in the specified bucket + */ +#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 62, 2) +#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 62, 2, x) + +#define WEIGHT_IS_SPACEBASED(weight) \ + ((weight) == 0 || BF64_GET((weight), 61, 1)) +#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 61, 1, 1) + +/* + * These macros are only applicable to segment-based weighting. + */ +#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 55, 6) +#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 55, 6, x) +#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 55) +#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 55, x) + +/* * A metaslab class encompasses a category of allocatable top-level vdevs. * Each top-level vdev is associated with a metaslab group which defines * the allocatable region for that vdev. Examples of these categories include @@ -220,7 +308,6 @@ struct metaslab { kmutex_t ms_lock; kcondvar_t ms_load_cv; space_map_t *ms_sm; - metaslab_ops_t *ms_ops; uint64_t ms_id; uint64_t ms_start; uint64_t ms_size; @@ -233,12 +320,27 @@ struct metaslab { boolean_t ms_condensing; /* condensing? */ boolean_t ms_condense_wanted; + + /* + * We must hold both ms_lock and ms_group->mg_lock in order to + * modify ms_loaded. + */ boolean_t ms_loaded; boolean_t ms_loading; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ - uint64_t ms_access_txg; + uint64_t ms_activation_weight; /* activation weight */ + + /* + * Track of whenever a metaslab is selected for loading or allocation. + * We use this value to determine how long the metaslab should + * stay cached. + */ + uint64_t ms_selected_txg; + + uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ + uint64_t ms_max_size; /* maximum allocatable size */ /* * The metaslab block allocators can optionally use a size-ordered diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index a1abe20da..6e950716c 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -394,6 +394,7 @@ extern void cv_broadcast(kcondvar_t *cv); */ extern kstat_t *kstat_create(const char *, int, const char *, const char *, uchar_t, ulong_t, uchar_t); +extern void kstat_named_init(kstat_named_t *, const char *, uchar_t); extern void kstat_install(kstat_t *); extern void kstat_delete(kstat_t *); extern void kstat_waitq_enter(kstat_io_t *); diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index 2f0064ee0..a2f61e5b2 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -42,14 +42,15 @@ extern int zfs_flags; extern int zfs_recover; extern int zfs_free_leak_on_eio; -#define ZFS_DEBUG_DPRINTF (1<<0) -#define ZFS_DEBUG_DBUF_VERIFY (1<<1) -#define ZFS_DEBUG_DNODE_VERIFY (1<<2) -#define ZFS_DEBUG_SNAPNAMES (1<<3) -#define ZFS_DEBUG_MODIFY (1<<4) -#define ZFS_DEBUG_SPA (1<<5) -#define ZFS_DEBUG_ZIO_FREE (1<<6) -#define ZFS_DEBUG_HISTOGRAM_VERIFY (1<<7) +#define ZFS_DEBUG_DPRINTF (1 << 0) +#define ZFS_DEBUG_DBUF_VERIFY (1 << 1) +#define ZFS_DEBUG_DNODE_VERIFY (1 << 2) +#define ZFS_DEBUG_SNAPNAMES (1 << 3) +#define ZFS_DEBUG_MODIFY (1 << 4) +#define ZFS_DEBUG_SPA (1 << 5) +#define ZFS_DEBUG_ZIO_FREE (1 << 6) +#define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7) +#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8) extern void __dprintf(const char *file, const char *func, int line, const char *fmt, ...); diff --git a/include/sys/zio.h b/include/sys/zio.h index 6c5153dcf..1f8708e5a 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -358,6 +358,11 @@ typedef int zio_pipe_stage_t(zio_t *zio); #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 +typedef struct zio_alloc_list { + list_t zal_list; + uint64_t zal_size; +} zio_alloc_list_t; + typedef struct zio_link { zio_t *zl_parent; zio_t *zl_child; @@ -417,6 +422,7 @@ struct zio { avl_node_t io_queue_node; avl_node_t io_offset_node; avl_node_t io_alloc_node; + zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ enum zio_flag io_flags; |