Diffstat (limited to 'include/sys')
-rw-r--r--   include/sys/metaslab.h        |  18
-rw-r--r--   include/sys/metaslab_impl.h   |  79
-rw-r--r--   include/sys/spa_impl.h        |  12
-rw-r--r--   include/sys/vdev_impl.h       |   3
-rw-r--r--   include/sys/zio.h             |   3
5 files changed, 81 insertions, 34 deletions
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 282ec231c..545bcafa5 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_H
@@ -66,9 +66,10 @@ uint64_t metaslab_block_maxsize(metaslab_t *);
 #define METASLAB_FASTWRITE      0x20
 
 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
-    blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *);
+    blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
+    int);
 int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
-    dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *);
+    dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int);
 void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
 void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
 void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
@@ -91,9 +92,9 @@ int metaslab_class_validate(metaslab_class_t *);
 void metaslab_class_histogram_verify(metaslab_class_t *);
 uint64_t metaslab_class_fragmentation(metaslab_class_t *);
 uint64_t metaslab_class_expandable_space(metaslab_class_t *);
-boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
+boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
     zio_t *, int);
-void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
+void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
 
 void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
     int64_t, int64_t);
@@ -102,7 +103,7 @@ uint64_t metaslab_class_get_space(metaslab_class_t *);
 uint64_t metaslab_class_get_dspace(metaslab_class_t *);
 uint64_t metaslab_class_get_deferred(metaslab_class_t *);
 
-metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
+metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int);
 void metaslab_group_destroy(metaslab_group_t *);
 void metaslab_group_activate(metaslab_group_t *);
 void metaslab_group_passivate(metaslab_group_t *);
@@ -111,8 +112,9 @@ uint64_t metaslab_group_get_space(metaslab_group_t *);
 void metaslab_group_histogram_verify(metaslab_group_t *);
 uint64_t metaslab_group_fragmentation(metaslab_group_t *);
 void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
-void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
-void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);
+void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
+    boolean_t);
+void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
 
 #ifdef __cplusplus
 }
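The metaslab.h prototypes above gain a trailing allocator index: metaslab_alloc(), metaslab_alloc_dva(), metaslab_class_throttle_reserve(), metaslab_class_throttle_unreserve(), metaslab_group_create() and the group alloc-tracking helpers all now identify which of the pool's allocators a request belongs to. As a rough illustration of why the reservation calls need that index (slots are accounted per allocator instead of in one pool-wide counter), here is a minimal stand-alone C sketch; every name in it is invented for the example, and none of the real refcount_t/kmutex_t machinery is modeled.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Toy model only: reservation slots are tracked per allocator, so callers
 * must say which allocator they are reserving against. Not the ZFS code.
 */
typedef struct toy_class {
        int             tc_alloc_count;         /* number of allocators */
        uint64_t        *tc_max_slots;          /* per-allocator limits */
        uint64_t        *tc_used_slots;         /* per-allocator usage */
} toy_class_t;

static bool
toy_throttle_reserve(toy_class_t *tc, int slots, int allocator, bool force)
{
        uint64_t used = tc->tc_used_slots[allocator];

        if (!force && used + slots > tc->tc_max_slots[allocator])
                return (false);
        tc->tc_used_slots[allocator] = used + slots;
        return (true);
}

static void
toy_throttle_unreserve(toy_class_t *tc, int slots, int allocator)
{
        tc->tc_used_slots[allocator] -= slots;
}

int
main(void)
{
        toy_class_t tc = { .tc_alloc_count = 4 };

        tc.tc_max_slots = calloc(tc.tc_alloc_count, sizeof (uint64_t));
        tc.tc_used_slots = calloc(tc.tc_alloc_count, sizeof (uint64_t));
        for (int i = 0; i < tc.tc_alloc_count; i++)
                tc.tc_max_slots[i] = 8;

        /* Allocator 2 fills up independently of the other allocators. */
        for (int i = 0; i < 10; i++)
                (void) toy_throttle_reserve(&tc, 1, 2, false);
        printf("allocator 2 used %llu of %llu slots\n",
            (unsigned long long)tc.tc_used_slots[2],
            (unsigned long long)tc.tc_max_slots[2]);

        toy_throttle_unreserve(&tc, 1, 2);
        free(tc.tc_max_slots);
        free(tc.tc_used_slots);
        return (0);
}

Run stand-alone, the sketch reports allocator 2 filling its eight slots while the other allocators stay untouched, which is the kind of independence the per-allocator arrays in metaslab_impl.h below make possible.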
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index dafd2b231..cc6e8b796 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_IMPL_H
@@ -52,6 +52,7 @@ typedef struct metaslab_alloc_trace {
         uint64_t        mat_weight;
         uint32_t        mat_dva_id;
         uint64_t        mat_offset;
+        int             mat_allocator;
 } metaslab_alloc_trace_t;
 
 /*
@@ -72,9 +73,11 @@ typedef enum trace_alloc_type {
 
 #define METASLAB_WEIGHT_PRIMARY         (1ULL << 63)
 #define METASLAB_WEIGHT_SECONDARY       (1ULL << 62)
-#define METASLAB_WEIGHT_TYPE            (1ULL << 61)
+#define METASLAB_WEIGHT_CLAIM           (1ULL << 61)
+#define METASLAB_WEIGHT_TYPE            (1ULL << 60)
 #define METASLAB_ACTIVE_MASK            \
-        (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+        (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \
+        METASLAB_WEIGHT_CLAIM)
 
 /*
  * The metaslab weight is used to encode the amount of free space in a
@@ -97,37 +100,39 @@ typedef enum trace_alloc_type {
  *
  *      64      56      48      40      32      24      16      8       0
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
- *      |PS1|                   weighted-free space                     |
+ *      |PSC1|                  weighted-free space                     |
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
  *
  *      PS - indicates primary and secondary activation
+ *      C - indicates activation for claimed block zio
  *      space - the fragmentation-weighted space
  *
  * Segment-based weight:
  *
  *      64      56      48      40      32      24      16      8       0
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
- *      |PS0| idx|             count of segments in region              |
+ *      |PSC0| idx|            count of segments in region              |
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
  *
  *      PS - indicates primary and secondary activation
+ *      C - indicates activation for claimed block zio
  *      idx - index for the highest bucket in the histogram
  *      count - number of segments in the specified bucket
  */
-#define WEIGHT_GET_ACTIVE(weight)       BF64_GET((weight), 62, 2)
-#define WEIGHT_SET_ACTIVE(weight, x)    BF64_SET((weight), 62, 2, x)
+#define WEIGHT_GET_ACTIVE(weight)       BF64_GET((weight), 61, 3)
+#define WEIGHT_SET_ACTIVE(weight, x)    BF64_SET((weight), 61, 3, x)
 
 #define WEIGHT_IS_SPACEBASED(weight)    \
-        ((weight) == 0 || BF64_GET((weight), 61, 1))
-#define WEIGHT_SET_SPACEBASED(weight)   BF64_SET((weight), 61, 1, 1)
+        ((weight) == 0 || BF64_GET((weight), 60, 1))
+#define WEIGHT_SET_SPACEBASED(weight)   BF64_SET((weight), 60, 1, 1)
 
 /*
  * These macros are only applicable to segment-based weighting.
  */
-#define WEIGHT_GET_INDEX(weight)        BF64_GET((weight), 55, 6)
-#define WEIGHT_SET_INDEX(weight, x)     BF64_SET((weight), 55, 6, x)
-#define WEIGHT_GET_COUNT(weight)        BF64_GET((weight), 0, 55)
-#define WEIGHT_SET_COUNT(weight, x)     BF64_SET((weight), 0, 55, x)
+#define WEIGHT_GET_INDEX(weight)        BF64_GET((weight), 54, 6)
+#define WEIGHT_SET_INDEX(weight, x)     BF64_SET((weight), 54, 6, x)
+#define WEIGHT_GET_COUNT(weight)        BF64_GET((weight), 0, 54)
+#define WEIGHT_SET_COUNT(weight, x)     BF64_SET((weight), 0, 54, x)
 
 /*
  * A metaslab class encompasses a category of allocatable top-level vdevs.
@@ -178,8 +183,8 @@ struct metaslab_class {
          * allowed to reserve slots even if we've reached the maximum
          * number of allocations allowed.
          */
-        uint64_t        mc_alloc_max_slots;
-        refcount_t      mc_alloc_slots;
+        uint64_t        *mc_alloc_max_slots;
+        refcount_t      *mc_alloc_slots;
 
         uint64_t        mc_alloc_groups; /* # of allocatable groups */
 
@@ -201,9 +206,12 @@ struct metaslab_class {
  */
 struct metaslab_group {
         kmutex_t        mg_lock;
+        metaslab_t      **mg_primaries;
+        metaslab_t      **mg_secondaries;
         avl_tree_t      mg_metaslab_tree;
         uint64_t        mg_aliquot;
         boolean_t       mg_allocatable;         /* can we allocate? */
+        uint64_t        mg_ms_ready;
 
         /*
          * A metaslab group is considered to be initialized only after
@@ -223,15 +231,33 @@ struct metaslab_group {
         metaslab_group_t        *mg_next;
 
         /*
-         * Each metaslab group can handle mg_max_alloc_queue_depth allocations
-         * which are tracked by mg_alloc_queue_depth. It's possible for a
-         * metaslab group to handle more allocations than its max. This
-         * can occur when gang blocks are required or when other groups
-         * are unable to handle their share of allocations.
+         * In order for the allocation throttle to function properly, we cannot
+         * have too many IOs going to each disk by default; the throttle
+         * operates by allocating more work to disks that finish quickly, so
+         * allocating larger chunks to each disk reduces its effectiveness.
+         * However, if the number of IOs going to each allocator is too small,
+         * we will not perform proper aggregation at the vdev_queue layer,
+         * also resulting in decreased performance. Therefore, we will use a
+         * ramp-up strategy.
+         *
+         * Each allocator in each metaslab group has a current queue depth
+         * (mg_alloc_queue_depth[allocator]) and a current max queue depth
+         * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group
+         * has an absolute max queue depth (mg_max_alloc_queue_depth). We
+         * add IOs to an allocator until the mg_alloc_queue_depth for that
+         * allocator hits the cur_max. Every time an IO completes for a given
+         * allocator on a given metaslab group, we increment its cur_max until
+         * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to
+         * help protect against disks that decrease in performance over time.
+         *
+         * It's possible for an allocator to handle more allocations than
+         * its max. This can occur when gang blocks are required or when other
+         * groups are unable to handle their share of allocations.
          */
         uint64_t        mg_max_alloc_queue_depth;
-        refcount_t      mg_alloc_queue_depth;
-
+        uint64_t        *mg_cur_max_alloc_queue_depth;
+        refcount_t      *mg_alloc_queue_depth;
+        int             mg_allocators;
         /*
          * A metalab group that can no longer allocate the minimum block
          * size will set mg_no_free_space. Once a metaslab group is out
@@ -356,6 +382,13 @@ struct metaslab {
         uint64_t        ms_max_size;    /* maximum allocatable size     */
 
         /*
+         * -1 if it's not active in an allocator, otherwise set to the allocator
+         * this metaslab is active for.
+         */
+        int             ms_allocator;
+        boolean_t       ms_primary;     /* Only valid if ms_allocator is not -1 */
+
+        /*
          * The metaslab block allocators can optionally use a size-ordered
          * range tree and/or an array of LBAs. Not all allocators use
          * this functionality. The ms_allocatable_by_size should always
@@ -369,6 +402,8 @@ struct metaslab {
         metaslab_group_t *ms_group;     /* metaslab group               */
         avl_node_t      ms_group_node;  /* node in metaslab group tree  */
         txg_node_t      ms_txg_node;    /* per-txg dirty metaslab links */
+
+        boolean_t       ms_new;
 };
 
 #ifdef __cplusplus
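The relocated WEIGHT_* macros in the metaslab_impl.h hunk above shift each field down by one bit to make room for the new claim-activation flag: the three activation bits (primary, secondary, claim) occupy bits 61-63, the space-based flag moves to bit 60, the segment-histogram index to bits 54-59, and the segment count keeps the remaining 54 low bits. The stand-alone sketch below encodes and decodes a segment-based weight with that layout; the BF64_GET/BF64_SET stand-ins are simplified re-implementations written for this example rather than the definitions from the ZFS tree.

#include <stdio.h>
#include <stdint.h>

/*
 * Simplified stand-ins for the BF64_GET/BF64_SET bitfield helpers:
 * extract or store `len` bits starting at bit `low` of a uint64_t.
 */
#define BF64_GET(x, low, len)   (((x) >> (low)) & ((1ULL << (len)) - 1))
#define BF64_SET(x, low, len, val) \
        ((x) = (((x) & ~(((1ULL << (len)) - 1) << (low))) | \
            ((uint64_t)(val) << (low))))

/* Bit layout after this change (see the metaslab_impl.h hunks above). */
#define WEIGHT_GET_ACTIVE(w)    BF64_GET((w), 61, 3)    /* P, S, C flags */
#define WEIGHT_SET_ACTIVE(w, x) BF64_SET((w), 61, 3, x)
#define WEIGHT_IS_SPACEBASED(w) ((w) == 0 || BF64_GET((w), 60, 1))
#define WEIGHT_SET_SPACEBASED(w) BF64_SET((w), 60, 1, 1)
#define WEIGHT_GET_INDEX(w)     BF64_GET((w), 54, 6)
#define WEIGHT_SET_INDEX(w, x)  BF64_SET((w), 54, 6, x)
#define WEIGHT_GET_COUNT(w)     BF64_GET((w), 0, 54)
#define WEIGHT_SET_COUNT(w, x)  BF64_SET((w), 0, 54, x)

int
main(void)
{
        uint64_t w = 0;

        /* Segment-based weight: 37 segments in the 2^17 histogram bucket. */
        WEIGHT_SET_INDEX(w, 17);
        WEIGHT_SET_COUNT(w, 37);
        WEIGHT_SET_ACTIVE(w, 0x4);      /* primary bit (bit 63) set */

        printf("active=%llu idx=%llu count=%llu spacebased=%d\n",
            (unsigned long long)WEIGHT_GET_ACTIVE(w),
            (unsigned long long)WEIGHT_GET_INDEX(w),
            (unsigned long long)WEIGHT_GET_COUNT(w),
            (int)WEIGHT_IS_SPACEBASED(w));
        return (0);
}

The example prints active=4 (primary only), idx=17, count=37, spacebased=0, matching the PSC0 diagram in the comment above.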
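The comment added to struct metaslab_group describes a queue-depth ramp-up: each allocator admits I/Os only while it is under a current limit, the limit creeps toward mg_max_alloc_queue_depth as I/Os complete, and it resets every txg. The toy model below mirrors that policy under invented names; the real logic lives in the .c changes that accompany this header diff, not in anything shown here.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_ALLOCATORS  4

/* Toy model only: per-allocator queue depths with a ramping limit. */
typedef struct toy_group {
        uint64_t        tg_abs_max;                     /* absolute ceiling */
        uint64_t        tg_cur_max[TOY_ALLOCATORS];     /* ramping limit */
        uint64_t        tg_queued[TOY_ALLOCATORS];      /* outstanding I/Os */
} toy_group_t;

/* Admit an I/O only while the allocator is under its current limit. */
static bool
toy_try_queue_io(toy_group_t *tg, int allocator)
{
        if (tg->tg_queued[allocator] >= tg->tg_cur_max[allocator])
                return (false);
        tg->tg_queued[allocator]++;
        return (true);
}

/* On completion, let the limit grow toward the absolute maximum. */
static void
toy_io_done(toy_group_t *tg, int allocator)
{
        tg->tg_queued[allocator]--;
        if (tg->tg_cur_max[allocator] < tg->tg_abs_max)
                tg->tg_cur_max[allocator]++;
}

/* Each txg, fall back to a conservative starting limit. */
static void
toy_txg_reset(toy_group_t *tg, uint64_t initial_limit)
{
        for (int i = 0; i < TOY_ALLOCATORS; i++)
                tg->tg_cur_max[i] = initial_limit;
}

int
main(void)
{
        toy_group_t tg = { .tg_abs_max = 1000 };

        toy_txg_reset(&tg, 2);
        for (int i = 0; i < 10; i++) {
                if (toy_try_queue_io(&tg, 0))
                        toy_io_done(&tg, 0);    /* completion ramps cur_max */
        }
        printf("allocator 0 cur_max ramped to %llu\n",
            (unsigned long long)tg.tg_cur_max[0]);
        return (0);
}

Starting from a small per-txg limit and growing it only on completions is what lets fast disks earn a deeper queue while slow or degrading disks stay lightly loaded.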
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 8d2a20dbb..1b8e48180 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -239,8 +239,16 @@ struct spa {
         uint64_t        spa_last_synced_guid;   /* last synced guid */
         list_t          spa_config_dirty_list;  /* vdevs with dirty config */
         list_t          spa_state_dirty_list;   /* vdevs with dirty state */
-        kmutex_t        spa_alloc_lock;
-        avl_tree_t      spa_alloc_tree;
+        /*
+         * spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are
+         * stored in spa_alloc_count. There is one tree and one lock for each
+         * allocator, to help improve allocation performance in write-heavy
+         * workloads.
+         */
+        kmutex_t        *spa_alloc_locks;
+        avl_tree_t      *spa_alloc_trees;
+        int             spa_alloc_count;
+
         spa_aux_vdev_t  spa_spares;     /* hot spares */
         spa_aux_vdev_t  spa_l2cache;    /* L2ARC cache devices */
         nvlist_t        *spa_label_features;    /* Features for reading MOS */
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index c22087307..701328ea6 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_IMPL_H
@@ -60,6 +60,7 @@ typedef struct vdev_cache_entry vdev_cache_entry_t;
 struct abd;
 
 extern int zfs_vdev_queue_depth_pct;
+extern int zfs_vdev_def_queue_depth;
 extern uint32_t zfs_vdev_async_write_max_active;
 
 /*
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 6c0c682a8..bca861d18 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright 2016 Toomas Soome <[email protected]>
@@ -507,6 +507,7 @@ struct zio {
         void            *io_waiter;
         kmutex_t        io_lock;
         kcondvar_t      io_cv;
+        int             io_allocator;
 
         /* FMA state */
         zio_cksum_report_t      *io_cksum_report;
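The spa_impl.h hunk replaces the single spa_alloc_lock/spa_alloc_tree pair with arrays sized by spa_alloc_count, so concurrent writers contend on per-allocator locks instead of one pool-wide lock. The sketch below models that split; how a given zio is mapped to an allocator is not part of this header diff, so the hash used here is purely illustrative, and all names are invented.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy model only: one lock and one pending-I/O count per allocator. */
typedef struct toy_alloc_queue {
        pthread_mutex_t aq_lock;        /* one lock per allocator */
        uint64_t        aq_pending;     /* stand-in for the per-allocator AVL tree */
} toy_alloc_queue_t;

typedef struct toy_spa {
        int                     spa_alloc_count;
        toy_alloc_queue_t       *spa_alloc_queues;
} toy_spa_t;

/* Illustrative hash; the real allocator selection is not shown in this diff. */
static int
toy_pick_allocator(toy_spa_t *spa, uint64_t object, uint64_t blkid)
{
        uint64_t h = (object * 0x9e3779b97f4a7c15ULL) ^ blkid;

        return ((int)(h % (uint64_t)spa->spa_alloc_count));
}

static void
toy_queue_io(toy_spa_t *spa, uint64_t object, uint64_t blkid)
{
        toy_alloc_queue_t *aq =
            &spa->spa_alloc_queues[toy_pick_allocator(spa, object, blkid)];

        /* Only this allocator's lock is contended, not a pool-wide one. */
        pthread_mutex_lock(&aq->aq_lock);
        aq->aq_pending++;
        pthread_mutex_unlock(&aq->aq_lock);
}

int
main(void)
{
        toy_spa_t spa = { .spa_alloc_count = 4 };

        spa.spa_alloc_queues = calloc(spa.spa_alloc_count,
            sizeof (toy_alloc_queue_t));
        for (int i = 0; i < spa.spa_alloc_count; i++)
                pthread_mutex_init(&spa.spa_alloc_queues[i].aq_lock, NULL);

        for (uint64_t blkid = 0; blkid < 1000; blkid++)
                toy_queue_io(&spa, 42, blkid);
        for (int i = 0; i < spa.spa_alloc_count; i++)
                printf("allocator %d: %llu pending\n", i,
                    (unsigned long long)spa.spa_alloc_queues[i].aq_pending);
        free(spa.spa_alloc_queues);
        return (0);
}

Built with -lpthread, the example spreads 1000 queued I/Os across four allocator queues, each guarded by its own mutex, which is the contention split the new spa fields enable.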