diff options
author | Brian Behlendorf <[email protected]> | 2015-12-16 11:28:15 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2015-12-18 13:27:12 -0800 |
commit | 6fe53787f38f10956b8d375133ed4559f8ce847b (patch) | |
tree | cdeeba055e332d5164459982896e7da020769770 /module/zfs | |
parent | a8ad3bf02cace90c45bc25df1bff1089d19e79f1 (diff) |
Fix vdev_queue_aggregate() deadlock
This deadlock may manifest itself in slightly different ways but
at the core it is caused by a memory allocation blocking on file-
system reclaim in the zio pipeline. This is normally impossible
because zio_execute() disables filesystem reclaim by setting
PF_FSTRANS on the thread. However, kmem cache allocations may
still indirectly block on file system reclaim while holding the
critical vq->vq_lock as shown below.
To resolve this issue zio_buf_alloc_flags() is introduced which
allocation flags to be passed. This can then be used in
vdev_queue_aggregate() with KM_NOSLEEP when allocating the
aggregate IO buffer. Since aggregating the IO is purely a
performance optimization we want this to either succeed or fail
quickly. Trying too hard to allocate this memory under the
vq->vq_lock can negatively impact performance and result in
this deadlock.
* z_wr_iss
zio_vdev_io_start
vdev_queue_io -> Takes vq->vq_lock
vdev_queue_io_to_issue
vdev_queue_aggregate
zio_buf_alloc -> Waiting on spl_kmem_cache process
* z_wr_int
zio_vdev_io_done
vdev_queue_io_done
mutex_lock -> Waiting on vq->vq_lock held by z_wr_iss
* txg_sync
spa_sync
dsl_pool_sync
zio_wait -> Waiting on zio being handled by z_wr_int
* spl_kmem_cache
spl_cache_grow_work
kv_alloc
spl_vmalloc
...
evict
zpl_evict_inode
zfs_inactive
dmu_tx_wait
txg_wait_open -> Waiting on txg_sync
Signed-off-by: Brian Behlendorf <[email protected]>
Signed-off-by: Chunwei Chen <[email protected]>
Signed-off-by: Tim Chase <[email protected]>
Closes #3808
Closes #3867
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/vdev_queue.c | 7 | ||||
-rw-r--r-- | module/zfs/zio.c | 15 |
2 files changed, 21 insertions, 1 deletions
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 4a4544f29..0c62a6fa3 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -502,6 +502,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) boolean_t stretch = B_FALSE; avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; + void *buf; if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) return (NULL); @@ -608,8 +609,12 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) size = IO_SPAN(first, last); ASSERT3U(size, <=, zfs_vdev_aggregation_limit); + buf = zio_buf_alloc_flags(size, KM_NOSLEEP); + if (buf == NULL) + return (NULL); + aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, - zio_buf_alloc(size), size, first->io_type, zio->io_priority, + buf, size, first->io_type, zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index a16266a40..2bc88c52c 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -248,6 +248,20 @@ zio_data_buf_alloc(size_t size) return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); } +/* + * Use zio_buf_alloc_flags when specific allocation flags are needed. e.g. + * passing KM_NOSLEEP when it is acceptable for an allocation to fail. + */ +void * +zio_buf_alloc_flags(size_t size, int flags) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + return (kmem_cache_alloc(zio_buf_cache[c], flags)); +} + void zio_buf_free(void *buf, size_t size) { @@ -3475,6 +3489,7 @@ zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1, EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); +EXPORT_SYMBOL(zio_buf_alloc_flags); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); |