Diffstat (limited to 'include')
-rw-r--r--   include/sys/fs/zfs.h         |  3
-rw-r--r--   include/sys/metaslab.h       | 23
-rw-r--r--   include/sys/metaslab_impl.h  | 63
-rw-r--r--   include/sys/refcount.h       |  7
-rw-r--r--   include/sys/spa_impl.h       |  2
-rw-r--r--   include/sys/vdev_impl.h      | 16
-rw-r--r--   include/sys/zio.h            | 44
-rw-r--r--   include/sys/zio_impl.h       | 51
8 files changed, 157 insertions, 52 deletions
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 5c93f53de..c51d190c7 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -1038,7 +1038,8 @@ typedef enum {
 	SPA_LOAD_IMPORT,	/* import in progress		*/
 	SPA_LOAD_TRYIMPORT,	/* tryimport in progress	*/
 	SPA_LOAD_RECOVER,	/* recovery requested		*/
-	SPA_LOAD_ERROR		/* load failed			*/
+	SPA_LOAD_ERROR,		/* load failed			*/
+	SPA_LOAD_CREATE		/* creation in progress		*/
 } spa_load_state_t;
 
 /*
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 5f831a1f5..408f6d333 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_H
@@ -55,15 +55,16 @@ void metaslab_sync_done(metaslab_t *, uint64_t);
 void metaslab_sync_reassess(metaslab_group_t *);
 uint64_t metaslab_block_maxsize(metaslab_t *);
 
-#define	METASLAB_HINTBP_FAVOR	0x0
-#define	METASLAB_HINTBP_AVOID	0x1
-#define	METASLAB_GANG_HEADER	0x2
-#define	METASLAB_GANG_CHILD	0x4
-#define	METASLAB_GANG_AVOID	0x8
-#define	METASLAB_FASTWRITE	0x10
+#define	METASLAB_HINTBP_FAVOR	0x0
+#define	METASLAB_HINTBP_AVOID	0x1
+#define	METASLAB_GANG_HEADER	0x2
+#define	METASLAB_GANG_CHILD	0x4
+#define	METASLAB_ASYNC_ALLOC	0x8
+#define	METASLAB_DONT_THROTTLE	0x10
+#define	METASLAB_FASTWRITE	0x20
 
 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
-    blkptr_t *, int, uint64_t, blkptr_t *, int);
+    blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *);
 void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
 int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
 void metaslab_check_free(spa_t *, const blkptr_t *);
@@ -76,6 +77,9 @@ int metaslab_class_validate(metaslab_class_t *);
 void metaslab_class_histogram_verify(metaslab_class_t *);
 uint64_t metaslab_class_fragmentation(metaslab_class_t *);
 uint64_t metaslab_class_expandable_space(metaslab_class_t *);
+boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
+    zio_t *, int);
+void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
 void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
     int64_t, int64_t);
 
@@ -88,10 +92,13 @@ metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
 void metaslab_group_destroy(metaslab_group_t *);
 void metaslab_group_activate(metaslab_group_t *);
 void metaslab_group_passivate(metaslab_group_t *);
+boolean_t metaslab_group_initialized(metaslab_group_t *);
 uint64_t metaslab_group_get_space(metaslab_group_t *);
 void metaslab_group_histogram_verify(metaslab_group_t *);
 uint64_t metaslab_group_fragmentation(metaslab_group_t *);
 void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
+void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
+void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);
 
 #ifdef __cplusplus
 }
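The metaslab.h changes add a zio_t * argument to metaslab_alloc() and introduce a reservation interface for the allocation throttle. A minimal sketch of how those declarations fit together follows; example_throttled_alloc() is a hypothetical caller, the flags value and error handling are assumptions, and in the actual code the reserve/unreserve pairing happens inside the zio pipeline rather than in an external caller.

static int
example_throttled_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    blkptr_t *bp, int ndvas, uint64_t txg, zio_t *zio)
{
	int error;

	/* Reserve one throttle slot per DVA before allocating. */
	if (!metaslab_class_throttle_reserve(mc, ndvas, zio, 0))
		return (SET_ERROR(EAGAIN));	/* throttled; retry later */

	error = metaslab_alloc(spa, mc, psize, bp, ndvas, txg, NULL,
	    METASLAB_ASYNC_ALLOC, zio);
	if (error != 0)
		metaslab_class_throttle_unreserve(mc, ndvas, zio);

	return (error);
}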
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 27a53b515..1c8993aca 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_IMPL_H
@@ -59,11 +59,42 @@ extern "C" {
  * to use a block allocator that best suits that class.
  */
 struct metaslab_class {
+	kmutex_t		mc_lock;
 	spa_t			*mc_spa;
 	metaslab_group_t	*mc_rotor;
 	metaslab_ops_t		*mc_ops;
 	uint64_t		mc_aliquot;
+
+	/*
+	 * Track the number of metaslab groups that have been initialized
+	 * and can accept allocations. An initialized metaslab group is
+	 * one that has been completely added to the config (i.e. we have
+	 * updated the MOS config and the space has been added to the pool).
+	 */
+	uint64_t		mc_groups;
+
+	/*
+	 * Toggle to enable/disable the allocation throttle.
+	 */
+	boolean_t		mc_alloc_throttle_enabled;
+
+	/*
+	 * The allocation throttle works on a reservation system. Whenever
+	 * an asynchronous zio wants to perform an allocation it must
+	 * first reserve the number of blocks that it wants to allocate.
+	 * If there aren't sufficient slots available for the pending zio
+	 * then that I/O is throttled until more slots free up. The current
+	 * number of reserved allocations is maintained by the mc_alloc_slots
+	 * refcount. The mc_alloc_max_slots value determines the maximum
+	 * number of allocations that the system allows. Gang blocks are
+	 * allowed to reserve slots even if we've reached the maximum
+	 * number of allocations allowed.
+	 */
+	uint64_t		mc_alloc_max_slots;
+	refcount_t		mc_alloc_slots;
+
 	uint64_t		mc_alloc_groups; /* # of allocatable groups */
+
 	uint64_t		mc_alloc;	/* total allocated space */
 	uint64_t		mc_deferred;	/* total deferred frees */
 	uint64_t		mc_space;	/* total space (alloc + free) */
@@ -85,6 +116,15 @@ struct metaslab_group {
 	avl_tree_t		mg_metaslab_tree;
 	uint64_t		mg_aliquot;
 	boolean_t		mg_allocatable;		/* can we allocate? */
+
+	/*
+	 * A metaslab group is considered to be initialized only after
+	 * we have updated the MOS config and added the space to the pool.
+	 * We only allow allocation attempts to a metaslab group if it
+	 * has been initialized.
+	 */
+	boolean_t		mg_initialized;
+
 	uint64_t		mg_free_capacity;	/* percentage free */
 	int64_t			mg_bias;
 	int64_t			mg_activation_count;
@@ -93,6 +133,27 @@ struct metaslab_group {
 	taskq_t			*mg_taskq;
 	metaslab_group_t	*mg_prev;
 	metaslab_group_t	*mg_next;
+
+	/*
+	 * Each metaslab group can handle mg_max_alloc_queue_depth allocations
+	 * which are tracked by mg_alloc_queue_depth. It's possible for a
+	 * metaslab group to handle more allocations than its max. This
+	 * can occur when gang blocks are required or when other groups
+	 * are unable to handle their share of allocations.
+	 */
+	uint64_t		mg_max_alloc_queue_depth;
+	refcount_t		mg_alloc_queue_depth;
+
+	/*
+	 * A metaslab group that can no longer allocate the minimum block
+	 * size will set mg_no_free_space. Once a metaslab group is out
+	 * of space then its share of work must be distributed to other
+	 * groups.
+	 */
+	boolean_t		mg_no_free_space;
+
+	uint64_t		mg_allocations;
+	uint64_t		mg_failed_allocations;
 	uint64_t		mg_fragmentation;
 	uint64_t		mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
 };
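The struct comments above describe the reservation scheme in prose; the sketch below shows roughly how that rule reads in code, using mc_lock for protection and the refcount primitives from refcount.h. example_throttle_reserve() is illustrative only; the real logic lives in metaslab.c and differs in detail (gang blocks, as noted above, may reserve past the maximum).

static boolean_t
example_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
    boolean_t is_gang)
{
	boolean_t reserved = B_FALSE;

	mutex_enter(&mc->mc_lock);
	if (is_gang || refcount_count(&mc->mc_alloc_slots) + slots <=
	    mc->mc_alloc_max_slots) {
		/* The zio itself is recorded as the holder of its slots. */
		(void) refcount_add_many(&mc->mc_alloc_slots, slots, zio);
		reserved = B_TRUE;
	}
	mutex_exit(&mc->mc_lock);

	return (reserved);
}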
diff --git a/include/sys/refcount.h b/include/sys/refcount.h
index 580976c91..3f50cddb6 100644
--- a/include/sys/refcount.h
+++ b/include/sys/refcount.h
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_REFCOUNT_H
@@ -61,6 +62,7 @@ typedef struct refcount {
 
 void refcount_create(refcount_t *rc);
 void refcount_create_untracked(refcount_t *rc);
+void refcount_create_tracked(refcount_t *rc);
 void refcount_destroy(refcount_t *rc);
 void refcount_destroy_many(refcount_t *rc, uint64_t number);
 int refcount_is_zero(refcount_t *rc);
@@ -71,6 +73,8 @@ int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
 int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
 void refcount_transfer(refcount_t *dst, refcount_t *src);
 void refcount_transfer_ownership(refcount_t *, void *, void *);
+boolean_t refcount_held(refcount_t *, void *);
+boolean_t refcount_not_held(refcount_t *, void *);
 
 void refcount_init(void);
 void refcount_fini(void);
@@ -83,6 +87,7 @@ typedef struct refcount {
 
 #define	refcount_create(rc) ((rc)->rc_count = 0)
 #define	refcount_create_untracked(rc) ((rc)->rc_count = 0)
+#define	refcount_create_tracked(rc) ((rc)->rc_count = 0)
 #define	refcount_destroy(rc) ((rc)->rc_count = 0)
 #define	refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
 #define	refcount_is_zero(rc) ((rc)->rc_count == 0)
@@ -99,6 +104,8 @@ typedef struct refcount {
 	atomic_add_64(&(dst)->rc_count, __tmp); \
 }
 #define	refcount_transfer_ownership(rc, current_holder, new_holder) (void)0
+#define	refcount_held(rc, holder)	((rc)->rc_count > 0)
+#define	refcount_not_held(rc, holder)	(B_TRUE)
 
 #define	refcount_init()
 #define	refcount_fini()
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index cb1d16ad5..88bde98dc 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -165,6 +165,8 @@ struct spa {
 	uint64_t	spa_last_synced_guid;	/* last synced guid */
 	list_t		spa_config_dirty_list;	/* vdevs with dirty config */
 	list_t		spa_state_dirty_list;	/* vdevs with dirty state */
+	kmutex_t	spa_alloc_lock;
+	avl_tree_t	spa_alloc_tree;
 	spa_aux_vdev_t	spa_spares;		/* hot spares */
 	spa_aux_vdev_t	spa_l2cache;		/* L2ARC cache devices */
 	nvlist_t	*spa_label_features;	/* Features for reading MOS */
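The new refcount_held()/refcount_not_held() checks are intended for assertions: with a tracked refcount they verify that a specific holder currently owns a reference, and in non-debug builds they fall back to the trivial macros shown above. A small illustrative use, continuing the throttle sketch (the function name is hypothetical):

static void
example_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
{
	mutex_enter(&mc->mc_lock);
	/* The zio must currently hold the slots it is giving back. */
	ASSERT(refcount_held(&mc->mc_alloc_slots, zio));
	(void) refcount_remove_many(&mc->mc_alloc_slots, slots, zio);
	mutex_exit(&mc->mc_lock);
}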
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 0d09c81c7..47e70090a 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -53,6 +53,9 @@ typedef struct vdev_queue vdev_queue_t;
 typedef struct vdev_cache vdev_cache_t;
 typedef struct vdev_cache_entry vdev_cache_entry_t;
 
+extern int zfs_vdev_queue_depth_pct;
+extern uint32_t zfs_vdev_async_write_max_active;
+
 /*
  * Virtual device operations
  */
@@ -177,10 +180,21 @@ struct vdev {
 	uint64_t	vdev_deflate_ratio;	/* deflation ratio (x512) */
 	uint64_t	vdev_islog;		/* is an intent log device */
 	uint64_t	vdev_removing;		/* device is being removed? */
-	boolean_t	vdev_ishole;		/* is a hole in the namespace */
+	boolean_t	vdev_ishole;		/* is a hole in the namespace	*/
+	kmutex_t	vdev_queue_lock;	/* protects vdev_queue_depth	*/
 	uint64_t	vdev_top_zap;
 
 	/*
+	 * The queue depth parameters determine how many async writes are
+	 * still pending (i.e. allocated but not yet issued to disk) per
+	 * top-level (vdev_async_write_queue_depth) and the maximum allowed
+	 * (vdev_max_async_write_queue_depth). These values only apply to
+	 * top-level vdevs.
+	 */
+	uint64_t	vdev_async_write_queue_depth;
+	uint64_t	vdev_max_async_write_queue_depth;
+
+	/*
 	 * Leaf vdev state.
 	 */
 	range_tree_t	*vdev_dtl[DTL_TYPES];	/* dirty time logs */
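The two tunables exported above bound the allocation throttle. One plausible wiring, consistent with the new vdev and metaslab fields, is to give each top-level vdev a queue depth of zfs_vdev_async_write_max_active * zfs_vdev_queue_depth_pct / 100 and to size the class-wide slot limit as the sum over all top-level vdevs. The sketch below illustrates only that arithmetic; the actual update is performed during pool sync and handles more cases.

static void
example_update_queue_depths(vdev_t *rvd, metaslab_class_t *mc)
{
	uint64_t max_queue_depth = zfs_vdev_async_write_max_active *
	    zfs_vdev_queue_depth_pct / 100;
	uint64_t queue_depth_total = 0;
	uint64_t c;

	for (c = 0; c < rvd->vdev_children; c++) {
		metaslab_group_t *mg = rvd->vdev_child[c]->vdev_mg;

		if (mg == NULL)
			continue;
		/* Every top-level vdev gets the same per-group limit. */
		mg->mg_max_alloc_queue_depth = max_queue_depth;
		queue_depth_total += mg->mg_max_alloc_queue_depth;
	}
	/* The class-wide reservation limit is the sum of the group limits. */
	mc->mc_alloc_max_slots = queue_depth_total;
}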
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 22001559c..864e8b2be 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -157,6 +157,7 @@ enum zio_flag {
 	ZIO_FLAG_DONT_CACHE	= 1 << 11,
 	ZIO_FLAG_NODATA		= 1 << 12,
 	ZIO_FLAG_INDUCE_DAMAGE	= 1 << 13,
+	ZIO_FLAG_IO_ALLOCATING	= 1 << 14,
 
 #define	ZIO_FLAG_DDT_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
 #define	ZIO_FLAG_GANG_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
@@ -164,28 +165,28 @@ enum zio_flag {
 	/*
 	 * Flags inherited by vdev children.
 	 */
-	ZIO_FLAG_IO_RETRY	= 1 << 14,	/* must be first for INHERIT */
-	ZIO_FLAG_PROBE		= 1 << 15,
-	ZIO_FLAG_TRYHARD	= 1 << 16,
-	ZIO_FLAG_OPTIONAL	= 1 << 17,
+	ZIO_FLAG_IO_RETRY	= 1 << 15,	/* must be first for INHERIT */
+	ZIO_FLAG_PROBE		= 1 << 16,
+	ZIO_FLAG_TRYHARD	= 1 << 17,
+	ZIO_FLAG_OPTIONAL	= 1 << 18,
 
 #define	ZIO_FLAG_VDEV_INHERIT	(ZIO_FLAG_DONT_QUEUE - 1)
 
 	/*
 	 * Flags not inherited by any children.
 	 */
-	ZIO_FLAG_DONT_QUEUE	= 1 << 18,	/* must be first for INHERIT */
-	ZIO_FLAG_DONT_PROPAGATE	= 1 << 19,
-	ZIO_FLAG_IO_BYPASS	= 1 << 20,
-	ZIO_FLAG_IO_REWRITE	= 1 << 21,
-	ZIO_FLAG_RAW		= 1 << 22,
-	ZIO_FLAG_GANG_CHILD	= 1 << 23,
-	ZIO_FLAG_DDT_CHILD	= 1 << 24,
-	ZIO_FLAG_GODFATHER	= 1 << 25,
-	ZIO_FLAG_NOPWRITE	= 1 << 26,
-	ZIO_FLAG_REEXECUTED	= 1 << 27,
-	ZIO_FLAG_DELEGATED	= 1 << 28,
-	ZIO_FLAG_FASTWRITE	= 1 << 29,
+	ZIO_FLAG_DONT_QUEUE	= 1 << 19,	/* must be first for INHERIT */
+	ZIO_FLAG_DONT_PROPAGATE	= 1 << 20,
+	ZIO_FLAG_IO_BYPASS	= 1 << 21,
+	ZIO_FLAG_IO_REWRITE	= 1 << 22,
+	ZIO_FLAG_RAW		= 1 << 23,
+	ZIO_FLAG_GANG_CHILD	= 1 << 24,
+	ZIO_FLAG_DDT_CHILD	= 1 << 25,
+	ZIO_FLAG_GODFATHER	= 1 << 26,
+	ZIO_FLAG_NOPWRITE	= 1 << 27,
+	ZIO_FLAG_REEXECUTED	= 1 << 28,
+	ZIO_FLAG_DELEGATED	= 1 << 29,
+	ZIO_FLAG_FASTWRITE	= 1 << 30
 };
 
 #define	ZIO_FLAG_MUSTSUCCEED	0
@@ -225,6 +226,7 @@ enum zio_wait_type {
 
 typedef void zio_done_func_t(zio_t *zio);
 
+extern int zio_dva_throttle_enabled;
 extern const char *zio_type_name[ZIO_TYPES];
 
 /*
@@ -379,7 +381,6 @@ struct zio {
 	blkptr_t	io_bp_copy;
 	list_t		io_parent_list;
 	list_t		io_child_list;
-	zio_link_t	*io_walk_link;
 	zio_t		*io_logical;
 	zio_transform_t	*io_transform_stack;
 
@@ -407,12 +408,14 @@ struct zio {
 	uint64_t	io_offset;
 	hrtime_t	io_timestamp;	/* submitted at */
+	hrtime_t	io_queued_timestamp;
 	hrtime_t	io_target_timestamp;
 	hrtime_t	io_delta;	/* vdev queue service delta */
 	hrtime_t	io_delay;	/* Device access time (disk or */
					/* file). */
 	avl_node_t	io_queue_node;
 	avl_node_t	io_offset_node;
+	avl_node_t	io_alloc_node;
 
 	/* Internal pipeline state */
 	enum zio_flag	io_flags;
@@ -421,6 +424,7 @@ struct zio {
 	enum zio_flag	io_orig_flags;
 	enum zio_stage	io_orig_stage;
 	enum zio_stage	io_orig_pipeline;
+	enum zio_stage	io_pipeline_trace;
 	int		io_error;
 	int		io_child_error[ZIO_CHILD_TYPES];
 	uint64_t	io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
@@ -443,6 +447,8 @@ struct zio {
 	taskq_ent_t	io_tqent;
 };
 
+extern int zio_timestamp_compare(const void *, const void *);
+
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
     zio_done_func_t *done, void *private, enum zio_flag flags);
 
@@ -502,8 +508,8 @@ extern void zio_interrupt(zio_t *zio);
 extern void zio_delay_init(zio_t *zio);
 extern void zio_delay_interrupt(zio_t *zio);
 
-extern zio_t *zio_walk_parents(zio_t *cio);
-extern zio_t *zio_walk_children(zio_t *pio);
+extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
+extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
 extern zio_t *zio_unique_parent(zio_t *cio);
 extern void zio_add_child(zio_t *pio, zio_t *cio);
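Since io_walk_link has been removed from struct zio, the parent/child walkers now take an explicit zio_link_t ** cursor, so iteration state is local to the caller and several walks over the same lists can proceed independently. The usual idiom looks like the following sketch (the function name is hypothetical):

static void
example_visit_parents(zio_t *cio)
{
	zio_link_t *zl = NULL;
	zio_t *pio;

	/* The cursor zl replaces the old per-zio io_walk_link field. */
	while ((pio = zio_walk_parents(cio, &zl)) != NULL) {
		/* act on each parent of cio here */
	}
}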
diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h
index 08f820103..a36749a30 100644
--- a/include/sys/zio_impl.h
+++ b/include/sys/zio_impl.h
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */
 
 #ifndef _ZIO_IMPL_H
@@ -108,35 +108,37 @@ enum zio_stage {
 	ZIO_STAGE_OPEN			= 1 << 0,	/* RWFCI */
 
 	ZIO_STAGE_READ_BP_INIT		= 1 << 1,	/* R---- */
-	ZIO_STAGE_FREE_BP_INIT		= 1 << 2,	/* --F-- */
-	ZIO_STAGE_ISSUE_ASYNC		= 1 << 3,	/* RWF-- */
-	ZIO_STAGE_WRITE_BP_INIT		= 1 << 4,	/* -W--- */
+	ZIO_STAGE_WRITE_BP_INIT		= 1 << 2,	/* -W--- */
+	ZIO_STAGE_FREE_BP_INIT		= 1 << 3,	/* --F-- */
+	ZIO_STAGE_ISSUE_ASYNC		= 1 << 4,	/* RWF-- */
+	ZIO_STAGE_WRITE_COMPRESS	= 1 << 5,	/* -W--- */
 
-	ZIO_STAGE_CHECKSUM_GENERATE	= 1 << 5,	/* -W--- */
+	ZIO_STAGE_CHECKSUM_GENERATE	= 1 << 6,	/* -W--- */
 
-	ZIO_STAGE_NOP_WRITE		= 1 << 6,	/* -W--- */
+	ZIO_STAGE_NOP_WRITE		= 1 << 7,	/* -W--- */
 
-	ZIO_STAGE_DDT_READ_START	= 1 << 7,	/* R---- */
-	ZIO_STAGE_DDT_READ_DONE		= 1 << 8,	/* R---- */
-	ZIO_STAGE_DDT_WRITE		= 1 << 9,	/* -W--- */
-	ZIO_STAGE_DDT_FREE		= 1 << 10,	/* --F-- */
+	ZIO_STAGE_DDT_READ_START	= 1 << 8,	/* R---- */
+	ZIO_STAGE_DDT_READ_DONE		= 1 << 9,	/* R---- */
+	ZIO_STAGE_DDT_WRITE		= 1 << 10,	/* -W--- */
+	ZIO_STAGE_DDT_FREE		= 1 << 11,	/* --F-- */
 
-	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 11,	/* RWFC- */
-	ZIO_STAGE_GANG_ISSUE		= 1 << 12,	/* RWFC- */
+	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 12,	/* RWFC- */
+	ZIO_STAGE_GANG_ISSUE		= 1 << 13,	/* RWFC- */
 
-	ZIO_STAGE_DVA_ALLOCATE		= 1 << 13,	/* -W--- */
-	ZIO_STAGE_DVA_FREE		= 1 << 14,	/* --F-- */
-	ZIO_STAGE_DVA_CLAIM		= 1 << 15,	/* ---C- */
+	ZIO_STAGE_DVA_THROTTLE		= 1 << 14,	/* -W--- */
+	ZIO_STAGE_DVA_ALLOCATE		= 1 << 15,	/* -W--- */
+	ZIO_STAGE_DVA_FREE		= 1 << 16,	/* --F-- */
+	ZIO_STAGE_DVA_CLAIM		= 1 << 17,	/* ---C- */
 
-	ZIO_STAGE_READY			= 1 << 16,	/* RWFCI */
+	ZIO_STAGE_READY			= 1 << 18,	/* RWFCI */
 
-	ZIO_STAGE_VDEV_IO_START		= 1 << 17,	/* RW--I */
-	ZIO_STAGE_VDEV_IO_DONE		= 1 << 18,	/* RW--I */
-	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 19,	/* RW--I */
+	ZIO_STAGE_VDEV_IO_START		= 1 << 19,	/* RW--I */
+	ZIO_STAGE_VDEV_IO_DONE		= 1 << 20,	/* RW--I */
+	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 21,	/* RW--I */
 
-	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 20,	/* R---- */
+	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 22,	/* R---- */
 
-	ZIO_STAGE_DONE			= 1 << 21	/* RWFCI */
+	ZIO_STAGE_DONE			= 1 << 23	/* RWFCI */
 };
 
 #define	ZIO_INTERLOCK_STAGES		\
@@ -187,22 +189,27 @@ enum zio_stage {
 
 #define	ZIO_REWRITE_PIPELINE		\
 	(ZIO_WRITE_COMMON_STAGES |	\
+	ZIO_STAGE_WRITE_COMPRESS |	\
 	ZIO_STAGE_WRITE_BP_INIT)
 
 #define	ZIO_WRITE_PIPELINE		\
 	(ZIO_WRITE_COMMON_STAGES |	\
 	ZIO_STAGE_WRITE_BP_INIT |	\
+	ZIO_STAGE_WRITE_COMPRESS |	\
+	ZIO_STAGE_DVA_THROTTLE |	\
 	ZIO_STAGE_DVA_ALLOCATE)
 
 #define	ZIO_DDT_CHILD_WRITE_PIPELINE	\
 	(ZIO_INTERLOCK_STAGES |		\
 	ZIO_VDEV_IO_STAGES |		\
+	ZIO_STAGE_DVA_THROTTLE |	\
 	ZIO_STAGE_DVA_ALLOCATE)
 
 #define	ZIO_DDT_WRITE_PIPELINE		\
 	(ZIO_INTERLOCK_STAGES |		\
-	ZIO_STAGE_ISSUE_ASYNC |		\
 	ZIO_STAGE_WRITE_BP_INIT |	\
+	ZIO_STAGE_ISSUE_ASYNC |		\
+	ZIO_STAGE_WRITE_COMPRESS |	\
 	ZIO_STAGE_CHECKSUM_GENERATE |	\
 	ZIO_STAGE_DDT_WRITE)
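A zio's pipeline is a bitmask of the stages above, so the effect of the renumbering and of the new ZIO_STAGE_DVA_THROTTLE bit can be tested directly against io_pipeline. The check below is an illustration only; the function is hypothetical, and in practice the stage is included in or stripped from the pipeline when the zio is set up, subject to zio_dva_throttle_enabled.

static boolean_t
example_is_throttled(zio_t *zio)
{
	/* Throttled only if the tunable is on and the stage is in the pipeline. */
	return (zio_dva_throttle_enabled != 0 &&
	    (zio->io_pipeline & ZIO_STAGE_DVA_THROTTLE) != 0);
}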