aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--cmd/zinject/translate.c15
-rw-r--r--cmd/zinject/zinject.c37
-rw-r--r--include/sys/dsl_pool.h2
-rw-r--r--include/sys/fm/fs/zfs.h5
-rw-r--r--include/sys/spa.h2
-rw-r--r--include/sys/spa_impl.h4
-rw-r--r--include/sys/vdev.h1
-rw-r--r--include/sys/vdev_impl.h2
-rw-r--r--include/sys/zfs_context.h9
-rw-r--r--include/sys/zfs_ioctl.h12
-rw-r--r--include/sys/zio.h9
-rw-r--r--lib/libzpool/taskq.c13
-rw-r--r--module/zfs/spa.c11
-rw-r--r--module/zfs/spa_misc.c54
-rw-r--r--module/zfs/vdev.c40
-rw-r--r--module/zfs/vdev_disk.c5
-rw-r--r--module/zfs/vdev_queue.c15
-rw-r--r--module/zfs/zfs_fm.c16
-rw-r--r--module/zfs/zio.c8
-rw-r--r--module/zfs/zio_inject.c65
20 files changed, 275 insertions, 50 deletions
diff --git a/cmd/zinject/translate.c b/cmd/zinject/translate.c
index 160786641..fc1612738 100644
--- a/cmd/zinject/translate.c
+++ b/cmd/zinject/translate.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
*/
#include <libzfs.h>
@@ -476,6 +477,20 @@ translate_device(const char *pool, const char *device, err_type_t label_type,
&record->zi_guid) == 0);
}
+ /*
+ * Device faults can take on three different forms:
+ * 1). delayed or hanging I/O
+ * 2). zfs label faults
+ * 3). generic disk faults
+ */
+ if (record->zi_timer != 0) {
+ record->zi_cmd = ZINJECT_DELAY_IO;
+ } else if (label_type != TYPE_INVAL) {
+ record->zi_cmd = ZINJECT_LABEL_FAULT;
+ } else {
+ record->zi_cmd = ZINJECT_DEVICE_FAULT;
+ }
+
switch (label_type) {
default:
break;
diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c
index d584eadd8..9ae3f9ddc 100644
--- a/cmd/zinject/zinject.c
+++ b/cmd/zinject/zinject.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
*/
/*
@@ -235,8 +236,8 @@ usage(void)
"\t\t'pad1', or 'pad2'.\n"
"\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
"\n"
- "\tzinject -d device -A <degrade|fault> pool\n"
- "\t\tPerform a specific action on a particular device\n"
+ "\tzinject -d device -A <degrade|fault> -D <delay secs> pool\n"
+ "\t\tPerform a specific action on a particular device.\n"
"\n"
"\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
"\t\tCause the pool to stop writing blocks yet not\n"
@@ -589,7 +590,7 @@ main(int argc, char **argv)
}
while ((c = getopt(argc, argv,
- ":aA:b:d:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
+ ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
switch (c) {
case 'a':
flags |= ZINJECT_FLUSH_ARC;
@@ -615,6 +616,16 @@ main(int argc, char **argv)
case 'd':
device = optarg;
break;
+ case 'D':
+ errno = 0;
+ record.zi_timer = strtoull(optarg, &end, 10);
+ if (errno != 0 || *end != '\0') {
+ (void) fprintf(stderr, "invalid i/o delay "
+ "value: '%s'\n", optarg);
+ usage();
+ return (1);
+ }
+ break;
case 'e':
if (strcasecmp(optarg, "io") == 0) {
error = EIO;
@@ -679,6 +690,7 @@ main(int argc, char **argv)
case 'p':
(void) strlcpy(record.zi_func, optarg,
sizeof (record.zi_func));
+ record.zi_cmd = ZINJECT_PANIC;
break;
case 'q':
quiet = 1;
@@ -762,13 +774,15 @@ main(int argc, char **argv)
return (1);
}
+ if (record.zi_duration != 0)
+ record.zi_cmd = ZINJECT_IGNORED_WRITES;
+
if (cancel != NULL) {
/*
* '-c' is invalid with any other options.
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0 || record.zi_func[0] != '\0' ||
- record.zi_duration != 0) {
+ level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
(void) fprintf(stderr, "cancel (-c) incompatible with "
"any other options\n");
usage();
@@ -800,8 +814,7 @@ main(int argc, char **argv)
* for doing injection, so handle it separately here.
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0 || record.zi_func[0] != '\0' ||
- record.zi_duration != 0) {
+ level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
(void) fprintf(stderr, "device (-d) incompatible with "
"data error injection\n");
usage();
@@ -835,7 +848,7 @@ main(int argc, char **argv)
} else if (raw != NULL) {
if (range != NULL || type != TYPE_INVAL || level != 0 ||
- record.zi_func[0] != '\0' || record.zi_duration != 0) {
+ record.zi_cmd != ZINJECT_UNINITIALIZED) {
(void) fprintf(stderr, "raw (-b) format with "
"any other options\n");
usage();
@@ -858,13 +871,14 @@ main(int argc, char **argv)
return (1);
}
+ record.zi_cmd = ZINJECT_DATA_FAULT;
if (translate_raw(raw, &record) != 0)
return (1);
if (!error)
error = EIO;
- } else if (record.zi_func[0] != '\0') {
+ } else if (record.zi_cmd == ZINJECT_PANIC) {
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0 || device != NULL || record.zi_duration != 0) {
+ level != 0 || device != NULL) {
(void) fprintf(stderr, "panic (-p) incompatible with "
"other options\n");
usage();
@@ -882,7 +896,7 @@ main(int argc, char **argv)
if (argv[1] != NULL)
record.zi_type = atoi(argv[1]);
dataset[0] = '\0';
- } else if (record.zi_duration != 0) {
+ } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
if (nowrites == 0) {
(void) fprintf(stderr, "-s or -g meaningless "
"without -I (ignore writes)\n");
@@ -936,6 +950,7 @@ main(int argc, char **argv)
return (1);
}
+ record.zi_cmd = ZINJECT_DATA_FAULT;
if (translate_record(type, argv[0], range, level, &record, pool,
dataset) != 0)
return (1);
diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h
index ff5df1414..96a2296bf 100644
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -41,6 +41,8 @@
extern "C" {
#endif
+extern int zfs_txg_synctime_ms;
+
struct objset;
struct dsl_dir;
struct dsl_dataset;
diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h
index d5c6004c2..741b99e62 100644
--- a/include/sys/fm/fs/zfs.h
+++ b/include/sys/fm/fs/zfs.h
@@ -73,6 +73,8 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS "vdev_complete_ts"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS "vdev_delta_ts"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"
@@ -88,6 +90,9 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE "zio_deadline"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA "zio_delta"
#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected"
#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual"
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 821172297..ca9fb2438 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -570,6 +570,7 @@ extern int spa_offline_log(spa_t *spa);
/* Log claim callback */
extern void spa_claim_notify(zio_t *zio);
+extern void spa_deadman(void *);
/* Accessor functions */
extern boolean_t spa_shutting_down(spa_t *spa);
@@ -604,6 +605,7 @@ extern boolean_t spa_suspended(spa_t *spa);
extern uint64_t spa_bootfs(spa_t *spa);
extern uint64_t spa_delegation(spa_t *spa);
extern objset_t *spa_meta_objset(spa_t *spa);
+extern uint64_t spa_deadman_synctime(spa_t *spa);
/* Miscellaneous support routines */
extern void spa_activate_mos_feature(spa_t *spa, const char *feature);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 65edc97f5..5bd0e0a9c 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -227,6 +227,10 @@ struct spa {
uint64_t spa_feat_for_write_obj; /* required to write to pool */
uint64_t spa_feat_for_read_obj; /* required to read from pool */
uint64_t spa_feat_desc_obj; /* Feature descriptions */
+ taskqid_t spa_deadman_tqid; /* Task id */
+ uint64_t spa_deadman_calls; /* number of deadman calls */
+ uint64_t spa_sync_starttime; /* starting time fo spa_sync */
+ uint64_t spa_deadman_synctime; /* deadman expiration timer */
/*
* spa_refcnt & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options.
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 8f297a917..f49086a47 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -78,6 +78,7 @@ extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_metaslab_set_size(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
+extern void vdev_deadman(vdev_t *vd);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 964ee2407..711408e6a 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -105,6 +105,8 @@ struct vdev_queue {
avl_tree_t vq_read_tree;
avl_tree_t vq_write_tree;
avl_tree_t vq_pending_tree;
+ uint64_t vq_io_complete_ts;
+ uint64_t vq_io_delta_ts;
list_t vq_io_list;
kmutex_t vq_lock;
};
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 53080f382..a23bfdcf8 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -25,6 +25,7 @@
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFS_CONTEXT_H
@@ -400,6 +401,8 @@ extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
#define taskq_create_sysdc(a, b, d, e, p, dc, f) \
(taskq_create(a, b, maxclsyspri, d, e, f))
extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
+extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t,
+ clock_t);
extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
taskq_ent_t *);
extern int taskq_empty_ent(taskq_ent_t *);
@@ -407,6 +410,7 @@ extern void taskq_init_ent(taskq_ent_t *);
extern void taskq_destroy(taskq_t *);
extern void taskq_wait(taskq_t *);
extern int taskq_member(taskq_t *, kthread_t *);
+extern int taskq_cancel_id(taskq_t *, taskqid_t);
extern void system_taskq_init(void);
extern void system_taskq_fini(void);
@@ -523,6 +527,11 @@ extern vnode_t *rootdir;
extern void delay(clock_t ticks);
+#define SEC_TO_TICK(sec) ((sec) * hz)
+#define MSEC_TO_TICK(msec) ((msec) / (MILLISEC / hz))
+#define USEC_TO_TICK(usec) ((usec) / (MICROSEC / hz))
+#define NSEC_TO_TICK(usec) ((usec) / (NANOSEC / hz))
+
#define gethrestime_sec() time(NULL)
#define gethrestime(t) \
do {\
diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h
index 740d8eda4..c0cb4705d 100644
--- a/include/sys/zfs_ioctl.h
+++ b/include/sys/zfs_ioctl.h
@@ -236,6 +236,8 @@ typedef struct zinject_record {
uint32_t zi_iotype;
int32_t zi_duration;
uint64_t zi_timer;
+ uint32_t zi_cmd;
+ uint32_t zi_pad;
} zinject_record_t;
#define ZINJECT_NULL 0x1
@@ -245,6 +247,16 @@ typedef struct zinject_record {
#define ZEVENT_NONBLOCK 0x1
#define ZEVENT_SIZE 1024
+typedef enum zinject_type {
+ ZINJECT_UNINITIALIZED,
+ ZINJECT_DATA_FAULT,
+ ZINJECT_DEVICE_FAULT,
+ ZINJECT_LABEL_FAULT,
+ ZINJECT_IGNORED_WRITES,
+ ZINJECT_PANIC,
+ ZINJECT_DELAY_IO,
+} zinject_type_t;
+
typedef struct zfs_share {
uint64_t z_exportdata;
uint64_t z_sharedata;
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 64efde016..03530330c 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -21,8 +21,6 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
@@ -411,7 +409,10 @@ struct zio {
const zio_vsd_ops_t *io_vsd_ops;
uint64_t io_offset;
- uint64_t io_deadline;
+ uint64_t io_deadline; /* expires at timestamp + deadline */
+ uint64_t io_timestamp; /* submitted at (ticks) */
+ uint64_t io_delta; /* vdev queue service delta (ticks) */
+ uint64_t io_delay; /* vdev disk service delta (ticks) */
avl_node_t io_offset_node;
avl_node_t io_deadline_node;
avl_tree_t *io_vdev_tree;
@@ -423,7 +424,6 @@ struct zio {
enum zio_flag io_orig_flags;
enum zio_stage io_orig_stage;
enum zio_stage io_orig_pipeline;
- uint64_t io_delay;
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
@@ -560,6 +560,7 @@ extern int zio_handle_fault_injection(zio_t *zio, int error);
extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error);
extern void zio_handle_ignored_writes(zio_t *zio);
+extern uint64_t zio_handle_io_delay(zio_t *zio);
/*
* Checksum ereport functions
diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c
index 803f7dc37..64e214205 100644
--- a/lib/libzpool/taskq.c
+++ b/lib/libzpool/taskq.c
@@ -147,6 +147,13 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
return (1);
}
+taskqid_t
+taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags,
+ clock_t expire_time)
+{
+ return (0);
+}
+
int
taskq_empty_ent(taskq_ent_t *t)
{
@@ -339,6 +346,12 @@ taskq_member(taskq_t *tq, kthread_t *t)
return (0);
}
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t id)
+{
+ return (ENOENT);
+}
+
void
system_taskq_init(void)
{
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 0babf4752..0d3537eaa 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1013,6 +1013,8 @@ spa_deactivate(spa_t *spa)
list_destroy(&spa->spa_config_dirty_list);
list_destroy(&spa->spa_state_dirty_list);
+ taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
+
for (t = 0; t < ZIO_TYPES; t++) {
for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
if (spa->spa_zio_taskq[t][q] != NULL)
@@ -6017,6 +6019,12 @@ spa_sync(spa_t *spa, uint64_t txg)
tx = dmu_tx_create_assigned(dp, txg);
+ spa->spa_sync_starttime = gethrtime();
+ taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
+ spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq,
+ spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
+ NSEC_TO_TICK(spa->spa_deadman_synctime));
+
/*
* If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
* set spa_deflate if we have no raid-z vdevs.
@@ -6145,6 +6153,9 @@ spa_sync(spa_t *spa, uint64_t txg)
}
dmu_tx_commit(tx);
+ taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
+ spa->spa_deadman_tqid = 0;
+
/*
* Clear the dirty config list.
*/
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 5ec8e688e..476f9c02c 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -237,6 +237,24 @@ kmem_cache_t *spa_buffer_pool;
int spa_mode_global;
/*
+ * Expiration time in units of zfs_txg_synctime_ms. This value has two
+ * meanings. First it is used to determine when the spa_deadman logic
+ * should fire. By default the spa_deadman will fire if spa_sync has
+ * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
+ * Secondly, the value determines if an I/O is considered "hung".
+ * Any I/O that has not completed in zfs_deadman_synctime is considered
+ * "hung" resulting in a zevent being posted.
+ * 1000 zfs_txg_synctime_ms (i.e. 1000 seconds).
+ */
+unsigned long zfs_deadman_synctime = 1000ULL;
+
+/*
+ * By default the deadman is enabled.
+ */
+int zfs_deadman_enabled = 1;
+
+
+/*
* ==========================================================================
* SPA config locking
* ==========================================================================
@@ -413,6 +431,27 @@ spa_lookup(const char *name)
}
/*
+ * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
+ * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
+ * looking for potentially hung I/Os.
+ */
+void
+spa_deadman(void *arg)
+{
+ spa_t *spa = arg;
+
+ zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
+ (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
+ ++spa->spa_deadman_calls);
+ if (zfs_deadman_enabled)
+ vdev_deadman(spa->spa_root_vdev);
+
+ spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq,
+ spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
+ NSEC_TO_TICK(spa->spa_deadman_synctime));
+}
+
+/*
* Create an uninitialized spa_t with the given name. Requires
* spa_namespace_lock. The caller must ensure that the spa_t doesn't already
* exist by calling spa_lookup() first.
@@ -454,6 +493,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_proc = &p0;
spa->spa_proc_state = SPA_PROC_NONE;
+ spa->spa_deadman_synctime = zfs_deadman_synctime *
+ zfs_txg_synctime_ms * MICROSEC;
+
refcount_create(&spa->spa_refcount);
spa_config_lock_init(spa);
@@ -1493,6 +1535,12 @@ spa_prev_software_version(spa_t *spa)
}
uint64_t
+spa_deadman_synctime(spa_t *spa)
+{
+ return (spa->spa_deadman_synctime);
+}
+
+uint64_t
dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{
uint64_t asize = DVA_GET_ASIZE(dva);
@@ -1812,4 +1860,10 @@ EXPORT_SYMBOL(spa_writeable);
EXPORT_SYMBOL(spa_mode);
EXPORT_SYMBOL(spa_namespace_lock);
+
+module_param(zfs_deadman_synctime, ulong, 0644);
+MODULE_PARM_DESC(zfs_deadman_synctime,"Expire in units of zfs_txg_synctime_ms");
+
+module_param(zfs_deadman_enabled, int, 0644);
+MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer");
#endif
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index afff4d423..15ff30fb3 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -3195,6 +3195,46 @@ vdev_split(vdev_t *vd)
vdev_propagate_state(cvd);
}
+void
+vdev_deadman(vdev_t *vd)
+{
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ vdev_deadman(cvd);
+ }
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vdev_queue_t *vq = &vd->vdev_queue;
+
+ mutex_enter(&vq->vq_lock);
+ if (avl_numnodes(&vq->vq_pending_tree) > 0) {
+ spa_t *spa = vd->vdev_spa;
+ zio_t *fio;
+ uint64_t delta;
+
+ /*
+ * Look at the head of all the pending queues,
+ * if any I/O has been outstanding for longer than
+ * the spa_deadman_synctime we log a zevent.
+ */
+ fio = avl_first(&vq->vq_pending_tree);
+ delta = ddi_get_lbolt64() - fio->io_timestamp;
+ if (delta > NSEC_TO_TICK(spa_deadman_synctime(spa))) {
+ zfs_dbgmsg("SLOW IO: zio timestamp %llu, "
+ "delta %llu, last io %llu",
+ fio->io_timestamp, delta,
+ vq->vq_io_complete_ts);
+ zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+ spa, vd, fio, 0, 0);
+ }
+ }
+ mutex_exit(&vq->vq_lock);
+ }
+}
+
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(vdev_fault);
EXPORT_SYMBOL(vdev_degrade);
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 08f64a913..31d1a28bd 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -407,8 +407,7 @@ vdev_disk_dio_put(dio_request_t *dr)
vdev_disk_dio_free(dr);
if (zio) {
- zio->io_delay = jiffies_to_msecs(
- jiffies_64 - zio->io_delay);
+ zio->io_delay = jiffies_64 - zio->io_delay;
zio->io_error = error;
ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error)
@@ -609,7 +608,7 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc)
{
zio_t *zio = bio->bi_private;
- zio->io_delay = jiffies_to_msecs(jiffies_64 - zio->io_delay);
+ zio->io_delay = jiffies_64 - zio->io_delay;
zio->io_error = -rc;
if (rc && (rc == -EOPNOTSUPP))
zio->io_vd->vdev_nowritecache = B_TRUE;
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index e2096fac9..3f2793ba4 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -23,6 +23,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
#include <sys/zfs_context.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
@@ -319,6 +323,7 @@ again:
vi, size, fio->io_type, ZIO_PRIORITY_AGG,
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL);
+ aio->io_timestamp = fio->io_timestamp;
nio = fio;
do {
@@ -391,7 +396,8 @@ vdev_queue_io(zio_t *zio)
mutex_enter(&vq->vq_lock);
- zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) +
+ zio->io_timestamp = ddi_get_lbolt64();
+ zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
zio->io_priority;
vdev_queue_io_add(vq, zio);
@@ -417,10 +423,17 @@ vdev_queue_io_done(zio_t *zio)
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
int i;
+ if (zio_injection_enabled)
+ delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
+
mutex_enter(&vq->vq_lock);
avl_remove(&vq->vq_pending_tree, zio);
+ zio->io_delta = ddi_get_lbolt64() - zio->io_timestamp;
+ vq->vq_io_complete_ts = ddi_get_lbolt64();
+ vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
+
for (i = 0; i < zfs_vdev_ramp_rate; i++) {
zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
if (nio == NULL)
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 820291bf4..af2030ae7 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -250,6 +250,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
if (vd != NULL) {
vdev_t *pvd = vd->vdev_parent;
+ vdev_queue_t *vq = &vd->vdev_queue;
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
DATA_TYPE_UINT64, vd->vdev_guid,
@@ -272,6 +273,15 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
DATA_TYPE_UINT64, vd->vdev_ashift, NULL);
+ if (vq != NULL) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
+ DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
+ DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
+ }
+
if (pvd != NULL) {
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
@@ -304,6 +314,12 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_UINT32, zio->io_pipeline, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
DATA_TYPE_UINT64, zio->io_delay, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
+ DATA_TYPE_UINT64, zio->io_timestamp, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE,
+ DATA_TYPE_UINT64, zio->io_deadline, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
+ DATA_TYPE_UINT64, zio->io_delta, NULL);
/*
* If the 'size' parameter is non-zero, it indicates this is a
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 0622553f5..943f2d677 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -609,6 +609,9 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_vsd_ops = NULL;
zio->io_offset = offset;
zio->io_deadline = 0;
+ zio->io_timestamp = 0;
+ zio->io_delta = 0;
+ zio->io_delay = 0;
zio->io_orig_data = zio->io_data = data;
zio->io_orig_size = zio->io_size = size;
zio->io_orig_flags = zio->io_flags = flags;
@@ -620,7 +623,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_bp_override = NULL;
zio->io_walk_link = NULL;
zio->io_transform_stack = NULL;
- zio->io_delay = 0;
zio->io_error = 0;
zio->io_child_count = 0;
zio->io_parent_count = 0;
@@ -2906,11 +2908,11 @@ zio_done(zio_t *zio)
vdev_stat_update(zio, zio->io_size);
/*
- * If this I/O is attached to a particular vdev is slow, exeeding
+ * If this I/O is attached to a particular vdev is slow, exceeding
* 30 seconds to complete, post an error described the I/O delay.
* We ignore these errors if the device is currently unavailable.
*/
- if (zio->io_delay >= zio_delay_max) {
+ if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) {
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
zio->io_vd, zio, 0, 0);
diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c
index 293f267fc..eb589c429 100644
--- a/module/zfs/zio_inject.c
+++ b/module/zfs/zio_inject.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
*/
/*
@@ -147,14 +148,8 @@ zio_handle_fault_injection(zio_t *zio, int error)
for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) {
- /* Ignore errors not destined for this pool */
- if (zio->io_spa != handler->zi_spa)
- continue;
-
- /* Ignore device errors and panic injection */
- if (handler->zi_record.zi_guid != 0 ||
- handler->zi_record.zi_func[0] != '\0' ||
- handler->zi_record.zi_duration != 0)
+ if (zio->io_spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
continue;
/* If this handler matches, return EIO */
@@ -197,10 +192,7 @@ zio_handle_label_injection(zio_t *zio, int error)
uint64_t start = handler->zi_record.zi_start;
uint64_t end = handler->zi_record.zi_end;
- /* Ignore device only faults or panic injection */
- if (handler->zi_record.zi_start == 0 ||
- handler->zi_record.zi_func[0] != '\0' ||
- handler->zi_record.zi_duration != 0)
+ if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
continue;
/*
@@ -246,13 +238,7 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) {
- /*
- * Ignore label specific faults, panic injection
- * or fake writes
- */
- if (handler->zi_record.zi_start != 0 ||
- handler->zi_record.zi_func[0] != '\0' ||
- handler->zi_record.zi_duration != 0)
+ if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
continue;
if (vd->vdev_guid == handler->zi_record.zi_guid) {
@@ -316,10 +302,8 @@ zio_handle_ignored_writes(zio_t *zio)
handler = list_next(&inject_handlers, handler)) {
/* Ignore errors not destined for this pool */
- if (zio->io_spa != handler->zi_spa)
- continue;
-
- if (handler->zi_record.zi_duration == 0)
+ if (zio->io_spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
continue;
/*
@@ -355,11 +339,8 @@ spa_handle_ignored_writes(spa_t *spa)
for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) {
- /* Ignore errors not destined for this pool */
- if (spa != handler->zi_spa)
- continue;
-
- if (handler->zi_record.zi_duration == 0)
+ if (spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
continue;
if (handler->zi_record.zi_duration > 0) {
@@ -379,6 +360,34 @@ spa_handle_ignored_writes(spa_t *spa)
rw_exit(&inject_lock);
}
+uint64_t
+zio_handle_io_delay(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ inject_handler_t *handler;
+ uint64_t seconds = 0;
+
+ if (zio_injection_enabled == 0)
+ return (0);
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
+ continue;
+
+ if (vd->vdev_guid == handler->zi_record.zi_guid) {
+ seconds = handler->zi_record.zi_timer;
+ break;
+ }
+
+ }
+ rw_exit(&inject_lock);
+ return (seconds);
+}
+
/*
* Create a new handler for the given record. We add it to the list, adding
* a reference to the spa_t in the process. We increment zio_injection_enabled,