aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2015-12-16 11:28:15 -0800
committerBrian Behlendorf <[email protected]>2015-12-18 13:27:12 -0800
commit6fe53787f38f10956b8d375133ed4559f8ce847b (patch)
treecdeeba055e332d5164459982896e7da020769770
parenta8ad3bf02cace90c45bc25df1bff1089d19e79f1 (diff)
Fix vdev_queue_aggregate() deadlock
This deadlock may manifest itself in slightly different ways but at the core it is caused by a memory allocation blocking on file- system reclaim in the zio pipeline. This is normally impossible because zio_execute() disables filesystem reclaim by setting PF_FSTRANS on the thread. However, kmem cache allocations may still indirectly block on file system reclaim while holding the critical vq->vq_lock as shown below. To resolve this issue zio_buf_alloc_flags() is introduced which allocation flags to be passed. This can then be used in vdev_queue_aggregate() with KM_NOSLEEP when allocating the aggregate IO buffer. Since aggregating the IO is purely a performance optimization we want this to either succeed or fail quickly. Trying too hard to allocate this memory under the vq->vq_lock can negatively impact performance and result in this deadlock. * z_wr_iss zio_vdev_io_start vdev_queue_io -> Takes vq->vq_lock vdev_queue_io_to_issue vdev_queue_aggregate zio_buf_alloc -> Waiting on spl_kmem_cache process * z_wr_int zio_vdev_io_done vdev_queue_io_done mutex_lock -> Waiting on vq->vq_lock held by z_wr_iss * txg_sync spa_sync dsl_pool_sync zio_wait -> Waiting on zio being handled by z_wr_int * spl_kmem_cache spl_cache_grow_work kv_alloc spl_vmalloc ... evict zpl_evict_inode zfs_inactive dmu_tx_wait txg_wait_open -> Waiting on txg_sync Signed-off-by: Brian Behlendorf <[email protected]> Signed-off-by: Chunwei Chen <[email protected]> Signed-off-by: Tim Chase <[email protected]> Closes #3808 Closes #3867
-rw-r--r--include/sys/zio.h1
-rw-r--r--module/zfs/vdev_queue.c7
-rw-r--r--module/zfs/zio.c15
3 files changed, 22 insertions, 1 deletions
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 278b6e086..4916d8724 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -525,6 +525,7 @@ extern void *zio_buf_alloc(size_t size);
extern void zio_buf_free(void *buf, size_t size);
extern void *zio_data_buf_alloc(size_t size);
extern void zio_data_buf_free(void *buf, size_t size);
+extern void *zio_buf_alloc_flags(size_t size, int flags);
extern void zio_resubmit_stage_async(void *);
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 4a4544f29..0c62a6fa3 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -502,6 +502,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
boolean_t stretch = B_FALSE;
avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
+ void *buf;
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
return (NULL);
@@ -608,8 +609,12 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
size = IO_SPAN(first, last);
ASSERT3U(size, <=, zfs_vdev_aggregation_limit);
+ buf = zio_buf_alloc_flags(size, KM_NOSLEEP);
+ if (buf == NULL)
+ return (NULL);
+
aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
- zio_buf_alloc(size), size, first->io_type, zio->io_priority,
+ buf, size, first->io_type, zio->io_priority,
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL);
aio->io_timestamp = first->io_timestamp;
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index a16266a40..2bc88c52c 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -248,6 +248,20 @@ zio_data_buf_alloc(size_t size)
return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}
+/*
+ * Use zio_buf_alloc_flags when specific allocation flags are needed. e.g.
+ * passing KM_NOSLEEP when it is acceptable for an allocation to fail.
+ */
+void *
+zio_buf_alloc_flags(size_t size, int flags)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ return (kmem_cache_alloc(zio_buf_cache[c], flags));
+}
+
void
zio_buf_free(void *buf, size_t size)
{
@@ -3475,6 +3489,7 @@ zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
EXPORT_SYMBOL(zio_type_name);
EXPORT_SYMBOL(zio_buf_alloc);
EXPORT_SYMBOL(zio_data_buf_alloc);
+EXPORT_SYMBOL(zio_buf_alloc_flags);
EXPORT_SYMBOL(zio_buf_free);
EXPORT_SYMBOL(zio_data_buf_free);