From 4a67ce886a7b3def5f66c1aedf9e5436d157a03c Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset
Date: Mon, 9 Jul 2018 18:02:58 +0200
Subject: radv: make sure to wait for CP DMA when needed

This might fix some synchronization issues. I don't know if that
will affect performance but it's required for correctness.

CC:
Signed-off-by: Samuel Pitoiset
Reviewed-by: Bas Nieuwenhuizen
---
 src/amd/vulkan/radv_cmd_buffer.c | 15 +++++++++++++++
 src/amd/vulkan/radv_private.h    |  5 +++++
 src/amd/vulkan/si_cmd_buffer.c   | 36 ++++++++++++++++++++++++++++++++----
 3 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 9da42fe03e9..5dbdb3d9966 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2596,6 +2596,11 @@ VkResult radv_EndCommandBuffer(
 		si_emit_cache_flush(cmd_buffer);
 	}
 
+	/* Make sure CP DMA is idle at the end of IBs because the kernel
+	 * doesn't wait for it.
+	 */
+	si_cp_dma_wait_for_idle(cmd_buffer);
+
 	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
 
 	if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
@@ -4242,6 +4247,11 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer,
 		                       0);
 	}
 
+	/* Make sure CP DMA is idle because the driver might have performed a
+	 * DMA operation for copying or filling buffers/images.
+	 */
+	si_cp_dma_wait_for_idle(cmd_buffer);
+
 	cmd_buffer->state.flush_bits |= dst_flush_bits;
 }
 
@@ -4292,6 +4302,11 @@ static void write_event(struct radv_cmd_buffer *cmd_buffer,
 		VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
 		VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
 
+	/* Make sure CP DMA is idle because the driver might have performed a
+	 * DMA operation for copying or filling buffers/images.
+	 */
+	si_cp_dma_wait_for_idle(cmd_buffer);
+
 	/* TODO: Emit EOS events for syncing PS/CS stages. */
 
 	if (!(stageMask & ~top_of_pipe_flags)) {
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 4e4b3a6037a..2400de49a25 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -979,6 +979,9 @@ struct radv_cmd_state {
 	uint32_t last_num_instances;
 	uint32_t last_first_instance;
 	uint32_t last_vertex_offset;
+
+	/* Whether CP DMA is busy/idle. */
+	bool dma_is_busy;
 };
 
 struct radv_cmd_pool {
@@ -1091,6 +1094,8 @@ void si_cp_dma_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
                         unsigned size);
 void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
                             uint64_t size, unsigned value);
+void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
+
 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
 
 bool radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index 454fd8c39c8..6d566a918df 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -1040,7 +1040,6 @@ static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	uint32_t header = 0, command = 0;
 
-	assert(size);
 	assert(size <= cp_dma_max_byte_count(cmd_buffer));
 
 	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
@@ -1099,9 +1098,14 @@ static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
 	 * indices. If we wanted to execute CP DMA in PFP, this packet
 	 * should precede it.
 	 */
-	if ((flags & CP_DMA_SYNC) && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
-		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
-		radeon_emit(cs, 0);
+	if (flags & CP_DMA_SYNC) {
+		if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
+			radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
+			radeon_emit(cs, 0);
+		}
+
+		/* CP will see the sync flag and wait for all DMAs to complete. */
+		cmd_buffer->state.dma_is_busy = false;
 	}
 
 	if (unlikely(cmd_buffer->device->trace_bo))
@@ -1165,6 +1169,8 @@ void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
 	uint64_t main_src_va, main_dest_va;
 	uint64_t skipped_size = 0, realign_size = 0;
 
+	/* Assume that we are not going to sync after the last DMA operation. */
+	cmd_buffer->state.dma_is_busy = true;
 
 	if (cmd_buffer->device->physical_device->rad_info.family <= CHIP_CARRIZO ||
 	    cmd_buffer->device->physical_device->rad_info.family == CHIP_STONEY) {
@@ -1228,6 +1234,9 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
 
 	assert(va % 4 == 0 && size % 4 == 0);
 
+	/* Assume that we are not going to sync after the last DMA operation. */
+	cmd_buffer->state.dma_is_busy = true;
+
 	while (size) {
 		unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer));
 		unsigned dma_flags = CP_DMA_CLEAR;
@@ -1243,6 +1252,25 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
 	}
 }
 
+void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer)
+{
+	if (cmd_buffer->device->physical_device->rad_info.chip_class < CIK)
+		return;
+
+	if (!cmd_buffer->state.dma_is_busy)
+		return;
+
+	/* Issue a dummy DMA that copies zero bytes.
+	 *
+	 * The DMA engine will see that there's no work to do and skip this
+	 * DMA request, however, the CP will see the sync flag and still wait
+	 * for all DMAs to complete.
+	 */
+	si_emit_cp_dma(cmd_buffer, 0, 0, 0, CP_DMA_SYNC);
+
+	cmd_buffer->state.dma_is_busy = false;
+}
+
 /* For MSAA sample positions. */
 #define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \
 	(((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) | \
-- 
cgit v1.2.3
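
The trick the new si_cp_dma_wait_for_idle() relies on is compact enough to
model outside the driver: track a single dma_is_busy flag, set it on every
CP DMA, and clear it by emitting a zero-byte DMA with the sync bit, which
the DMA engine skips but the CP still waits on. Below is a minimal
standalone C sketch of that bookkeeping under simplified assumptions;
struct cmd_buffer, emit_cp_dma() and the other helpers are invented
stand-ins for illustration, not the radv or PM4 API, and the chip_class
gating from the patch is omitted.

/* Standalone sketch of the patch's CP DMA bookkeeping. Everything here is
 * a simplified stand-in: no packets are built, emit_cp_dma() just logs
 * what the real si_emit_cp_dma() would put in the command stream.
 *
 * Build/run: cc -o cpdma_sketch cpdma_sketch.c && ./cpdma_sketch
 */
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CP_DMA_SYNC (1u << 0) /* "CP must wait for all prior DMAs" flag */

struct cmd_buffer {
	bool dma_is_busy; /* mirrors radv_cmd_state::dma_is_busy */
};

/* Stand-in for si_emit_cp_dma(): a zero-byte copy with CP_DMA_SYNC does no
 * transfer, but the CP still waits for every outstanding DMA, so the flag
 * can be cleared whenever a sync packet has been emitted. */
static void emit_cp_dma(struct cmd_buffer *cmd, uint64_t dst_va,
			uint64_t src_va, uint64_t size, unsigned flags)
{
	printf("DMA_DATA dst=%#" PRIx64 " src=%#" PRIx64 " size=%" PRIu64 "%s\n",
	       dst_va, src_va, size,
	       (flags & CP_DMA_SYNC) ? " [sync]" : "");
	if (flags & CP_DMA_SYNC)
		cmd->dma_is_busy = false;
}

static void cp_dma_buffer_copy(struct cmd_buffer *cmd, uint64_t dst_va,
			       uint64_t src_va, uint64_t size)
{
	/* As in the patch: assume no sync happens after the last chunk. */
	cmd->dma_is_busy = true;
	emit_cp_dma(cmd, dst_va, src_va, size, 0);
}

static void cp_dma_wait_for_idle(struct cmd_buffer *cmd)
{
	if (!cmd->dma_is_busy)
		return; /* nothing outstanding, no packet needed */

	/* The dummy zero-byte DMA: skipped by the DMA engine, but the sync
	 * flag still makes the CP wait for all in-flight DMAs. */
	emit_cp_dma(cmd, 0, 0, 0, CP_DMA_SYNC);
}

int main(void)
{
	struct cmd_buffer cmd = { .dma_is_busy = false };

	cp_dma_buffer_copy(&cmd, 0x1000, 0x2000, 4096);
	cp_dma_wait_for_idle(&cmd); /* emits the zero-byte sync DMA */
	cp_dma_wait_for_idle(&cmd); /* already idle: emits nothing */
	return 0;
}

Note how the flag is cleared in two places, matching the patch: any DMA
emitted with CP_DMA_SYNC already leaves the engine idle, so the explicit
wait only has to pay for a dummy packet when the last DMA was emitted
without the sync bit.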