summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r--include/sys/fs/zfs.h3
-rw-r--r--include/sys/metaslab.h23
-rw-r--r--include/sys/metaslab_impl.h63
-rw-r--r--include/sys/refcount.h7
-rw-r--r--include/sys/spa_impl.h2
-rw-r--r--include/sys/vdev_impl.h16
-rw-r--r--include/sys/zio.h44
-rw-r--r--include/sys/zio_impl.h51
8 files changed, 157 insertions, 52 deletions
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 5c93f53de..c51d190c7 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -1038,7 +1038,8 @@ typedef enum {
SPA_LOAD_IMPORT, /* import in progress */
SPA_LOAD_TRYIMPORT, /* tryimport in progress */
SPA_LOAD_RECOVER, /* recovery requested */
- SPA_LOAD_ERROR /* load failed */
+ SPA_LOAD_ERROR, /* load failed */
+ SPA_LOAD_CREATE /* creation in progress */
} spa_load_state_t;
/*
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 5f831a1f5..408f6d333 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@@ -55,15 +55,16 @@ void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
uint64_t metaslab_block_maxsize(metaslab_t *);
-#define METASLAB_HINTBP_FAVOR 0x0
-#define METASLAB_HINTBP_AVOID 0x1
-#define METASLAB_GANG_HEADER 0x2
-#define METASLAB_GANG_CHILD 0x4
-#define METASLAB_GANG_AVOID 0x8
-#define METASLAB_FASTWRITE 0x10
+#define METASLAB_HINTBP_FAVOR 0x0
+#define METASLAB_HINTBP_AVOID 0x1
+#define METASLAB_GANG_HEADER 0x2
+#define METASLAB_GANG_CHILD 0x4
+#define METASLAB_ASYNC_ALLOC 0x8
+#define METASLAB_DONT_THROTTLE 0x10
+#define METASLAB_FASTWRITE 0x20
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
- blkptr_t *, int, uint64_t, blkptr_t *, int);
+ blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);
@@ -76,6 +77,9 @@ int metaslab_class_validate(metaslab_class_t *);
void metaslab_class_histogram_verify(metaslab_class_t *);
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
uint64_t metaslab_class_expandable_space(metaslab_class_t *);
+boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
+ zio_t *, int);
+void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
int64_t, int64_t);
@@ -88,10 +92,13 @@ metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
void metaslab_group_destroy(metaslab_group_t *);
void metaslab_group_activate(metaslab_group_t *);
void metaslab_group_passivate(metaslab_group_t *);
+boolean_t metaslab_group_initialized(metaslab_group_t *);
uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
+void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
+void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);
#ifdef __cplusplus
}
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 27a53b515..1c8993aca 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
@@ -59,11 +59,42 @@ extern "C" {
* to use a block allocator that best suits that class.
*/
struct metaslab_class {
+ kmutex_t mc_lock;
spa_t *mc_spa;
metaslab_group_t *mc_rotor;
metaslab_ops_t *mc_ops;
uint64_t mc_aliquot;
+
+ /*
+ * Track the number of metaslab groups that have been initialized
+ * and can accept allocations. An initialized metaslab group is
+ * one that has been completely added to the config (i.e. we have
+ * updated the MOS config and the space has been added to the pool).
+ */
+ uint64_t mc_groups;
+
+ /*
+ * Toggle to enable/disable the allocation throttle.
+ */
+ boolean_t mc_alloc_throttle_enabled;
+
+ /*
+ * The allocation throttle works on a reservation system. Whenever
+ * an asynchronous zio wants to perform an allocation it must
+ * first reserve the number of blocks that it wants to allocate.
+ * If there aren't sufficient slots available for the pending zio
+ * then that I/O is throttled until more slots free up. The current
+ * number of reserved allocations is maintained by the mc_alloc_slots
+ * refcount. The mc_alloc_max_slots value determines the maximum
+ * number of allocations that the system allows. Gang blocks are
+ * allowed to reserve slots even if we've reached the maximum
+ * number of allocations allowed.
+ */
+ uint64_t mc_alloc_max_slots;
+ refcount_t mc_alloc_slots;
+
uint64_t mc_alloc_groups; /* # of allocatable groups */
+
uint64_t mc_alloc; /* total allocated space */
uint64_t mc_deferred; /* total deferred frees */
uint64_t mc_space; /* total space (alloc + free) */
@@ -85,6 +116,15 @@ struct metaslab_group {
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
boolean_t mg_allocatable; /* can we allocate? */
+
+ /*
+ * A metaslab group is considered to be initialized only after
+ * we have updated the MOS config and added the space to the pool.
+ * We only allow allocation attempts to a metaslab group if it
+ * has been initialized.
+ */
+ boolean_t mg_initialized;
+
uint64_t mg_free_capacity; /* percentage free */
int64_t mg_bias;
int64_t mg_activation_count;
@@ -93,6 +133,27 @@ struct metaslab_group {
taskq_t *mg_taskq;
metaslab_group_t *mg_prev;
metaslab_group_t *mg_next;
+
+ /*
+ * Each metaslab group can handle mg_max_alloc_queue_depth allocations
+ * which are tracked by mg_alloc_queue_depth. It's possible for a
+ * metaslab group to handle more allocations than its max. This
+ * can occur when gang blocks are required or when other groups
+ * are unable to handle their share of allocations.
+ */
+ uint64_t mg_max_alloc_queue_depth;
+ refcount_t mg_alloc_queue_depth;
+
+ /*
+ * A metaslab group that can no longer allocate the minimum block
+ * size will set mg_no_free_space. Once a metaslab group is out
+ * of space then its share of work must be distributed to other
+ * groups.
+ */
+ boolean_t mg_no_free_space;
+
+ uint64_t mg_allocations;
+ uint64_t mg_failed_allocations;
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
};
diff --git a/include/sys/refcount.h b/include/sys/refcount.h
index 580976c91..3f50cddb6 100644
--- a/include/sys/refcount.h
+++ b/include/sys/refcount.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_REFCOUNT_H
@@ -61,6 +62,7 @@ typedef struct refcount {
void refcount_create(refcount_t *rc);
void refcount_create_untracked(refcount_t *rc);
+void refcount_create_tracked(refcount_t *rc);
void refcount_destroy(refcount_t *rc);
void refcount_destroy_many(refcount_t *rc, uint64_t number);
int refcount_is_zero(refcount_t *rc);
@@ -71,6 +73,8 @@ int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
void refcount_transfer(refcount_t *dst, refcount_t *src);
void refcount_transfer_ownership(refcount_t *, void *, void *);
+boolean_t refcount_held(refcount_t *, void *);
+boolean_t refcount_not_held(refcount_t *, void *);
void refcount_init(void);
void refcount_fini(void);
@@ -83,6 +87,7 @@ typedef struct refcount {
#define refcount_create(rc) ((rc)->rc_count = 0)
#define refcount_create_untracked(rc) ((rc)->rc_count = 0)
+#define refcount_create_tracked(rc) ((rc)->rc_count = 0)
#define refcount_destroy(rc) ((rc)->rc_count = 0)
#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
#define refcount_is_zero(rc) ((rc)->rc_count == 0)
@@ -99,6 +104,8 @@ typedef struct refcount {
atomic_add_64(&(dst)->rc_count, __tmp); \
}
#define refcount_transfer_ownership(rc, current_holder, new_holder) (void)0
+#define refcount_held(rc, holder) ((rc)->rc_count > 0)
+#define refcount_not_held(rc, holder) (B_TRUE)
#define refcount_init()
#define refcount_fini()
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index cb1d16ad5..88bde98dc 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -165,6 +165,8 @@ struct spa {
uint64_t spa_last_synced_guid; /* last synced guid */
list_t spa_config_dirty_list; /* vdevs with dirty config */
list_t spa_state_dirty_list; /* vdevs with dirty state */
+ kmutex_t spa_alloc_lock;
+ avl_tree_t spa_alloc_tree;
spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
nvlist_t *spa_label_features; /* Features for reading MOS */
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 0d09c81c7..47e70090a 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -53,6 +53,9 @@ typedef struct vdev_queue vdev_queue_t;
typedef struct vdev_cache vdev_cache_t;
typedef struct vdev_cache_entry vdev_cache_entry_t;
+extern int zfs_vdev_queue_depth_pct;
+extern uint32_t zfs_vdev_async_write_max_active;
+
/*
* Virtual device operations
*/
@@ -177,10 +180,21 @@ struct vdev {
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
uint64_t vdev_islog; /* is an intent log device */
uint64_t vdev_removing; /* device is being removed? */
- boolean_t vdev_ishole; /* is a hole in the namespace */
+ boolean_t vdev_ishole; /* is a hole in the namespace */
+ kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
/*
+ * The queue depth parameters determine how many async writes are
+ * still pending (i.e. allocated but not yet issued to disk) per
+ * top-level (vdev_async_write_queue_depth) and the maximum allowed
+ * (vdev_max_async_write_queue_depth). These values only apply to
+ * top-level vdevs.
+ */
+ uint64_t vdev_async_write_queue_depth;
+ uint64_t vdev_max_async_write_queue_depth;
+
+ /*
* Leaf vdev state.
*/
range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 22001559c..864e8b2be 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -157,6 +157,7 @@ enum zio_flag {
ZIO_FLAG_DONT_CACHE = 1 << 11,
ZIO_FLAG_NODATA = 1 << 12,
ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,
+ ZIO_FLAG_IO_ALLOCATING = 1 << 14,
#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
@@ -164,28 +165,28 @@ enum zio_flag {
/*
* Flags inherited by vdev children.
*/
- ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */
- ZIO_FLAG_PROBE = 1 << 15,
- ZIO_FLAG_TRYHARD = 1 << 16,
- ZIO_FLAG_OPTIONAL = 1 << 17,
+ ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */
+ ZIO_FLAG_PROBE = 1 << 16,
+ ZIO_FLAG_TRYHARD = 1 << 17,
+ ZIO_FLAG_OPTIONAL = 1 << 18,
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
/*
* Flags not inherited by any children.
*/
- ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */
- ZIO_FLAG_DONT_PROPAGATE = 1 << 19,
- ZIO_FLAG_IO_BYPASS = 1 << 20,
- ZIO_FLAG_IO_REWRITE = 1 << 21,
- ZIO_FLAG_RAW = 1 << 22,
- ZIO_FLAG_GANG_CHILD = 1 << 23,
- ZIO_FLAG_DDT_CHILD = 1 << 24,
- ZIO_FLAG_GODFATHER = 1 << 25,
- ZIO_FLAG_NOPWRITE = 1 << 26,
- ZIO_FLAG_REEXECUTED = 1 << 27,
- ZIO_FLAG_DELEGATED = 1 << 28,
- ZIO_FLAG_FASTWRITE = 1 << 29,
+ ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */
+ ZIO_FLAG_DONT_PROPAGATE = 1 << 20,
+ ZIO_FLAG_IO_BYPASS = 1 << 21,
+ ZIO_FLAG_IO_REWRITE = 1 << 22,
+ ZIO_FLAG_RAW = 1 << 23,
+ ZIO_FLAG_GANG_CHILD = 1 << 24,
+ ZIO_FLAG_DDT_CHILD = 1 << 25,
+ ZIO_FLAG_GODFATHER = 1 << 26,
+ ZIO_FLAG_NOPWRITE = 1 << 27,
+ ZIO_FLAG_REEXECUTED = 1 << 28,
+ ZIO_FLAG_DELEGATED = 1 << 29,
+ ZIO_FLAG_FASTWRITE = 1 << 30
};
#define ZIO_FLAG_MUSTSUCCEED 0
@@ -225,6 +226,7 @@ enum zio_wait_type {
typedef void zio_done_func_t(zio_t *zio);
+extern int zio_dva_throttle_enabled;
extern const char *zio_type_name[ZIO_TYPES];
/*
@@ -379,7 +381,6 @@ struct zio {
blkptr_t io_bp_copy;
list_t io_parent_list;
list_t io_child_list;
- zio_link_t *io_walk_link;
zio_t *io_logical;
zio_transform_t *io_transform_stack;
@@ -407,12 +408,14 @@ struct zio {
uint64_t io_offset;
hrtime_t io_timestamp; /* submitted at */
+ hrtime_t io_queued_timestamp;
hrtime_t io_target_timestamp;
hrtime_t io_delta; /* vdev queue service delta */
hrtime_t io_delay; /* Device access time (disk or */
/* file). */
avl_node_t io_queue_node;
avl_node_t io_offset_node;
+ avl_node_t io_alloc_node;
/* Internal pipeline state */
enum zio_flag io_flags;
@@ -421,6 +424,7 @@ struct zio {
enum zio_flag io_orig_flags;
enum zio_stage io_orig_stage;
enum zio_stage io_orig_pipeline;
+ enum zio_stage io_pipeline_trace;
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
@@ -443,6 +447,8 @@ struct zio {
taskq_ent_t io_tqent;
};
+extern int zio_timestamp_compare(const void *, const void *);
+
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
zio_done_func_t *done, void *private, enum zio_flag flags);
@@ -502,8 +508,8 @@ extern void zio_interrupt(zio_t *zio);
extern void zio_delay_init(zio_t *zio);
extern void zio_delay_interrupt(zio_t *zio);
-extern zio_t *zio_walk_parents(zio_t *cio);
-extern zio_t *zio_walk_children(zio_t *pio);
+extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
+extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
extern zio_t *zio_unique_parent(zio_t *cio);
extern void zio_add_child(zio_t *pio, zio_t *cio);
diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h
index 08f820103..a36749a30 100644
--- a/include/sys/zio_impl.h
+++ b/include/sys/zio_impl.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _ZIO_IMPL_H
@@ -108,35 +108,37 @@ enum zio_stage {
ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */
ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */
- ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */
- ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */
- ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */
+ ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */
+ ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */
+ ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */
+ ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */
- ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */
+ ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */
- ZIO_STAGE_NOP_WRITE = 1 << 6, /* -W--- */
+ ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */
- ZIO_STAGE_DDT_READ_START = 1 << 7, /* R---- */
- ZIO_STAGE_DDT_READ_DONE = 1 << 8, /* R---- */
- ZIO_STAGE_DDT_WRITE = 1 << 9, /* -W--- */
- ZIO_STAGE_DDT_FREE = 1 << 10, /* --F-- */
+ ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */
+ ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */
+ ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */
+ ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */
- ZIO_STAGE_GANG_ASSEMBLE = 1 << 11, /* RWFC- */
- ZIO_STAGE_GANG_ISSUE = 1 << 12, /* RWFC- */
+ ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */
+ ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */
- ZIO_STAGE_DVA_ALLOCATE = 1 << 13, /* -W--- */
- ZIO_STAGE_DVA_FREE = 1 << 14, /* --F-- */
- ZIO_STAGE_DVA_CLAIM = 1 << 15, /* ---C- */
+ ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */
+ ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */
+ ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */
+ ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */
- ZIO_STAGE_READY = 1 << 16, /* RWFCI */
+ ZIO_STAGE_READY = 1 << 18, /* RWFCI */
- ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RW--I */
- ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RW--I */
- ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RW--I */
+ ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RW--I */
+ ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RW--I */
+ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RW--I */
- ZIO_STAGE_CHECKSUM_VERIFY = 1 << 20, /* R---- */
+ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */
- ZIO_STAGE_DONE = 1 << 21 /* RWFCI */
+ ZIO_STAGE_DONE = 1 << 23 /* RWFCI */
};
#define ZIO_INTERLOCK_STAGES \
@@ -187,22 +189,27 @@ enum zio_stage {
#define ZIO_REWRITE_PIPELINE \
(ZIO_WRITE_COMMON_STAGES | \
+ ZIO_STAGE_WRITE_COMPRESS | \
ZIO_STAGE_WRITE_BP_INIT)
#define ZIO_WRITE_PIPELINE \
(ZIO_WRITE_COMMON_STAGES | \
ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_WRITE_COMPRESS | \
+ ZIO_STAGE_DVA_THROTTLE | \
ZIO_STAGE_DVA_ALLOCATE)
#define ZIO_DDT_CHILD_WRITE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_DVA_THROTTLE | \
ZIO_STAGE_DVA_ALLOCATE)
#define ZIO_DDT_WRITE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
- ZIO_STAGE_ISSUE_ASYNC | \
ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_WRITE_COMPRESS | \
ZIO_STAGE_CHECKSUM_GENERATE | \
ZIO_STAGE_DDT_WRITE)