diff options
| Field | Value | Date |
|---|---|---|
| author | George Wilson <[email protected]> | 2014-07-19 12:19:24 -0800 |
| committer | Brian Behlendorf <[email protected]> | 2014-08-18 08:40:49 -0700 |
| commit | f3a7f6610f2df0217ba3b99099019417a954b673 (patch) | |
| tree | 720f77d117032a585761dd5bb80e5a5694915111 /include | |
| parent | f67d709080f3d4a247191f0d25cbedc5da103f78 (diff) | |
Illumos 4976-4984 - metaslab improvements
4976 zfs should only avoid writing to a failing non-redundant top-level vdev
4978 ztest fails in get_metaslab_refcount()
4979 extend free space histogram to device and pool
4980 metaslabs should have a fragmentation metric
4981 remove fragmented ops vector from block allocator
4982 space_map object should proactively upgrade when feature is enabled
4983 need to collect metaslab information via mdb
4984 device selection should use fragmentation metric
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: Adam Leventhal <[email protected]>
Reviewed by: Christopher Siden <[email protected]>
Approved by: Garrett D'Amore <[email protected]>
References:
https://www.illumos.org/issues/4976
https://www.illumos.org/issues/4978
https://www.illumos.org/issues/4979
https://www.illumos.org/issues/4980
https://www.illumos.org/issues/4981
https://www.illumos.org/issues/4982
https://www.illumos.org/issues/4983
https://www.illumos.org/issues/4984
https://github.com/illumos/illumos-gate/commit/2e4c998
Notes:
The "zdb -M" option has been re-tasked to display the new metaslab
fragmentation metric and the new "zdb -I" option is used to control
the maximum number of in-flight I/Os.
The new fragmentation metric is derived from the space map histogram
which has been rolled up to the vdev and pool level and is presented
to the user via "zpool list".
Add a number of module parameters related to the new metaslab weighting
logic.
Ported by: Tim Chase <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #2595
Diffstat (limited to 'include')
-rw-r--r-- | include/sys/fs/zfs.h | 9 | ||||
-rw-r--r-- | include/sys/metaslab.h | 71 | ||||
-rw-r--r-- | include/sys/metaslab_impl.h | 36 | ||||
-rw-r--r-- | include/sys/space_map.h | 8 | ||||
-rw-r--r-- | include/sys/zfs_debug.h | 17 |
5 files changed, 92 insertions, 49 deletions
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index d54cd5679..24ad768c0 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -194,6 +194,7 @@ typedef enum { ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, + ZPOOL_PROP_FRAGMENTATION, ZPOOL_PROP_LEAKED, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -600,6 +601,13 @@ typedef struct zpool_rewind_policy { #define SPA_MINDEVSIZE (64ULL << 20) /* + * Set if the fragmentation has not yet been calculated. This can happen + * because the space maps have not been upgraded or the histogram feature + * is not enabled. + */ +#define ZFS_FRAG_INVALID UINT64_MAX + +/* * The location of the pool configuration repository, shared between kernel and * userland. */ @@ -747,6 +755,7 @@ typedef struct vdev_stat { uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ + uint64_t vs_fragmentation; /* device fragmentation */ } vdev_stat_t; /* diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index a3bbc25f7..962dafca4 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 
*/ #ifndef _SYS_METASLAB_H @@ -38,23 +38,22 @@ extern "C" { typedef struct metaslab_ops { uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size); - boolean_t (*msop_fragmented)(metaslab_t *msp); } metaslab_ops_t; extern metaslab_ops_t *zfs_metaslab_ops; -metaslab_t *metaslab_init(metaslab_group_t *mg, uint64_t id, - uint64_t object, uint64_t txg); -void metaslab_fini(metaslab_t *msp); +metaslab_t *metaslab_init(metaslab_group_t *, uint64_t, + uint64_t, uint64_t); +void metaslab_fini(metaslab_t *); -void metaslab_load_wait(metaslab_t *msp); -int metaslab_load(metaslab_t *msp); -void metaslab_unload(metaslab_t *msp); +void metaslab_load_wait(metaslab_t *); +int metaslab_load(metaslab_t *); +void metaslab_unload(metaslab_t *); -void metaslab_sync(metaslab_t *msp, uint64_t txg); -void metaslab_sync_done(metaslab_t *msp, uint64_t txg); -void metaslab_sync_reassess(metaslab_group_t *mg); -uint64_t metaslab_block_maxsize(metaslab_t *msp); +void metaslab_sync(metaslab_t *, uint64_t); +void metaslab_sync_done(metaslab_t *, uint64_t); +void metaslab_sync_reassess(metaslab_group_t *); +uint64_t metaslab_block_maxsize(metaslab_t *); #define METASLAB_HINTBP_FAVOR 0x0 #define METASLAB_HINTBP_AVOID 0x1 @@ -63,30 +62,36 @@ uint64_t metaslab_block_maxsize(metaslab_t *msp); #define METASLAB_GANG_AVOID 0x8 #define METASLAB_FASTWRITE 0x10 -int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, - blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags); -void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now); -int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); -void metaslab_check_free(spa_t *spa, const blkptr_t *bp); -void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp); -void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp); +int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, + blkptr_t *, int, uint64_t, blkptr_t *, int); +void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); +int 
metaslab_claim(spa_t *, const blkptr_t *, uint64_t); +void metaslab_check_free(spa_t *, const blkptr_t *); +void metaslab_fastwrite_mark(spa_t *, const blkptr_t *); +void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *); -metaslab_class_t *metaslab_class_create(spa_t *spa, metaslab_ops_t *ops); -void metaslab_class_destroy(metaslab_class_t *mc); -int metaslab_class_validate(metaslab_class_t *mc); +metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *); +void metaslab_class_destroy(metaslab_class_t *); +int metaslab_class_validate(metaslab_class_t *); +void metaslab_class_histogram_verify(metaslab_class_t *); +uint64_t metaslab_class_fragmentation(metaslab_class_t *); +uint64_t metaslab_class_expandable_space(metaslab_class_t *); -void metaslab_class_space_update(metaslab_class_t *mc, - int64_t alloc_delta, int64_t defer_delta, - int64_t space_delta, int64_t dspace_delta); -uint64_t metaslab_class_get_alloc(metaslab_class_t *mc); -uint64_t metaslab_class_get_space(metaslab_class_t *mc); -uint64_t metaslab_class_get_dspace(metaslab_class_t *mc); -uint64_t metaslab_class_get_deferred(metaslab_class_t *mc); +void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t, + int64_t, int64_t); +uint64_t metaslab_class_get_alloc(metaslab_class_t *); +uint64_t metaslab_class_get_space(metaslab_class_t *); +uint64_t metaslab_class_get_dspace(metaslab_class_t *); +uint64_t metaslab_class_get_deferred(metaslab_class_t *); -metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, vdev_t *vd); -void metaslab_group_destroy(metaslab_group_t *mg); -void metaslab_group_activate(metaslab_group_t *mg); -void metaslab_group_passivate(metaslab_group_t *mg); +metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *); +void metaslab_group_destroy(metaslab_group_t *); +void metaslab_group_activate(metaslab_group_t *); +void metaslab_group_passivate(metaslab_group_t *); +uint64_t metaslab_group_get_space(metaslab_group_t *); +void 
metaslab_group_histogram_verify(metaslab_group_t *); +uint64_t metaslab_group_fragmentation(metaslab_group_t *); +void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); #ifdef __cplusplus } diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 3cd27d75e..88bda071f 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -41,6 +41,23 @@ extern "C" { #endif +/* + * A metaslab class encompasses a category of allocatable top-level vdevs. + * Each top-level vdev is associated with a metaslab group which defines + * the allocatable region for that vdev. Examples of these categories include + * "normal" for data block allocations (i.e. main pool allocations) or "log" + * for allocations designated for intent log devices (i.e. slog devices). + * When a block allocation is requested from the SPA it is associated with a + * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging + * to the class can be used to satisfy that request. Allocations are done + * by traversing the metaslab groups that are linked off of the mc_rotor field. + * This rotor points to the next metaslab group where allocations will be + * attempted. Allocating a block is a 3 step process -- select the metaslab + * group, select the metaslab, and then allocate the block. The metaslab + * class defines the low-level block allocator that will be used as the + * final step in allocation. These allocators are pluggable allowing each class + * to use a block allocator that best suits that class. + */ struct metaslab_class { spa_t *mc_spa; metaslab_group_t *mc_rotor; @@ -51,9 +68,19 @@ struct metaslab_class { uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_dspace; /* total deflated space */ + uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; kmutex_t mc_fastwrite_lock; }; +/* + * Metaslab groups encapsulate all the allocatable regions (i.e. 
metaslabs) + * of a top-level vdev. They are linked together to form a circular linked + * list and can belong to only one metaslab class. Metaslab groups may become + * ineligible for allocations for a number of reasons such as limited free + * space, fragmentation, or going offline. When this happens the allocator will + * simply find the next metaslab group in the linked list and attempt + * to allocate from that group instead. + */ struct metaslab_group { kmutex_t mg_lock; avl_tree_t mg_metaslab_tree; @@ -67,12 +94,14 @@ struct metaslab_group { taskq_t *mg_taskq; metaslab_group_t *mg_prev; metaslab_group_t *mg_next; + uint64_t mg_fragmentation; + uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; }; /* * This value defines the number of elements in the ms_lbas array. The value - * of 64 was chosen as it covers to cover all power of 2 buckets up to - * UINT64_MAX. This is the equivalent of highbit(UINT64_MAX). + * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. + * This is the equivalent of highbit(UINT64_MAX). #define MAX_LBAS 64 @@ -135,6 +164,7 @@ struct metaslab { uint64_t ms_id; uint64_t ms_start; uint64_t ms_size; + uint64_t ms_fragmentation; range_tree_t *ms_alloctree[TXG_SIZE]; range_tree_t *ms_freetree[TXG_SIZE]; @@ -142,12 +172,12 @@ struct metaslab { range_tree_t *ms_tree; boolean_t ms_condensing; /* condensing? */ + boolean_t ms_condense_wanted; boolean_t ms_loaded; boolean_t ms_loading; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ - uint64_t ms_factor; uint64_t ms_access_txg; /* diff --git a/include/sys/space_map.h b/include/sys/space_map.h index 369180330..67fa2767b 100644 --- a/include/sys/space_map.h +++ b/include/sys/space_map.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 
*/ #ifndef _SYS_SPACE_MAP_H @@ -44,9 +44,7 @@ extern "C" { * maintain backward compatibility. */ #define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t)) -#define SPACE_MAP_HISTOGRAM_SIZE(sm) \ - (sizeof ((sm)->sm_phys->smp_histogram) / \ - sizeof ((sm)->sm_phys->smp_histogram[0])) +#define SPACE_MAP_HISTOGRAM_SIZE 32 /* * The space_map_phys is the on-disk representation of the space map. @@ -68,7 +66,7 @@ typedef struct space_map_phys { * whose size is: * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1) */ - uint64_t smp_histogram[32]; /* histogram of free space */ + uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE]; } space_map_phys_t; /* diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index 829b37a46..35ffa0187 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_DEBUG_H @@ -50,13 +50,14 @@ extern int zfs_flags; extern int zfs_recover; extern int zfs_free_leak_on_eio; -#define ZFS_DEBUG_DPRINTF (1<<0) -#define ZFS_DEBUG_DBUF_VERIFY (1<<1) -#define ZFS_DEBUG_DNODE_VERIFY (1<<2) -#define ZFS_DEBUG_SNAPNAMES (1<<3) -#define ZFS_DEBUG_MODIFY (1<<4) -#define ZFS_DEBUG_SPA (1<<5) -#define ZFS_DEBUG_ZIO_FREE (1<<6) +#define ZFS_DEBUG_DPRINTF (1<<0) +#define ZFS_DEBUG_DBUF_VERIFY (1<<1) +#define ZFS_DEBUG_DNODE_VERIFY (1<<2) +#define ZFS_DEBUG_SNAPNAMES (1<<3) +#define ZFS_DEBUG_MODIFY (1<<4) +#define ZFS_DEBUG_SPA (1<<5) +#define ZFS_DEBUG_ZIO_FREE (1<<6) +#define ZFS_DEBUG_HISTOGRAM_VERIFY (1<<7) /* * Always log zfs debug messages to the spl debug subsystem as SS_USER1. |