author		Samuel Pitoiset <[email protected]>	2017-10-31 09:58:00 +0100
committer	Samuel Pitoiset <[email protected]>	2017-11-02 23:03:59 +0100
commit		bad31f6a65c148bb506d524150915e2b2fc74a0e (patch)
tree		89ce80be6dcb6c5ad252b50c63902c848568988a /src/amd/vulkan
parent		cf5f8f55c3e25508fb975b263d6430a93442247a (diff)
radv: use the optimal packets order for dispatch calls
This should reduce the time during which the compute units are idle,
mainly for meta operations, because they use a bunch of compute shaders.
It seems to have a small but positive effect for Talos, at least.
Signed-off-by: Samuel Pitoiset <[email protected]>
Reviewed-by: Bas Nieuwenhuizen <[email protected]>
Diffstat (limited to 'src/amd/vulkan')
-rw-r--r--	src/amd/vulkan/radv_cmd_buffer.c	61
1 file changed, 53 insertions(+), 8 deletions(-)
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 27b7f78a89b..4b608377fc6 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2532,8 +2532,6 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
 	compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
 	va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset;
 
-	radv_emit_shader_prefetch(cmd_buffer, compute_shader);
-
 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
 							   cmd_buffer->cs, 16);
 
@@ -3561,18 +3559,65 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
 }
 
 static void
-radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
-	      const struct radv_dispatch_info *info)
+radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
 {
-	radv_emit_compute_pipeline(cmd_buffer);
-
 	radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
 	radv_flush_constants(cmd_buffer, cmd_buffer->state.compute_pipeline,
 			     VK_SHADER_STAGE_COMPUTE_BIT);
+}
+
+static void
+radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
+	      const struct radv_dispatch_info *info)
+{
+	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+	bool pipeline_is_dirty = pipeline &&
+				 pipeline != cmd_buffer->state.emitted_compute_pipeline;
+
+	if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
+					    RADV_CMD_FLAG_FLUSH_AND_INV_DB |
+					    RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+					    RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
+		/* If we have to wait for idle, set all states first, so that
+		 * all SET packets are processed in parallel with previous draw
+		 * calls. Then upload descriptors, set shader pointers, and
+		 * dispatch, and prefetch at the end. This ensures that the
+		 * time the CUs are idle is very short. (there are only SET_SH
+		 * packets between the wait and the draw)
+		 */
+		radv_emit_compute_pipeline(cmd_buffer);
+		si_emit_cache_flush(cmd_buffer);
+		/* <-- CUs are idle here --> */
+
+		radv_upload_compute_shader_descriptors(cmd_buffer);
 
-	si_emit_cache_flush(cmd_buffer);
+		radv_emit_dispatch_packets(cmd_buffer, info);
+		/* <-- CUs are busy here --> */
 
-	radv_emit_dispatch_packets(cmd_buffer, info);
+		/* Start prefetches after the dispatch has been started. Both
+		 * will run in parallel, but starting the dispatch first is
+		 * more important.
+		 */
+		if (pipeline_is_dirty) {
+			radv_emit_shader_prefetch(cmd_buffer,
+						  pipeline->shaders[MESA_SHADER_COMPUTE]);
+		}
+	} else {
+		/* If we don't wait for idle, start prefetches first, then set
+		 * states, and dispatch at the end.
+		 */
+		si_emit_cache_flush(cmd_buffer);
+
+		if (pipeline_is_dirty) {
+			radv_emit_shader_prefetch(cmd_buffer,
+						  pipeline->shaders[MESA_SHADER_COMPUTE]);
+		}
+
+		radv_upload_compute_shader_descriptors(cmd_buffer);
+
+		radv_emit_compute_pipeline(cmd_buffer);
+		radv_emit_dispatch_packets(cmd_buffer, info);
+	}
 
 	radv_cmd_buffer_after_draw(cmd_buffer);
 }
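
To make the two packet orders easier to compare, here is a minimal standalone C sketch of the control flow radv_dispatch() gains in this patch. The emit() helper, the FLAG_* constants, and main() are simplified stand-ins invented for illustration, not RADV code; only the branch structure and the ordering mirror the diff above.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the RADV_CMD_FLAG_* bits tested in the diff. */
#define FLAG_FLUSH_AND_INV_CB (1u << 0)
#define FLAG_FLUSH_AND_INV_DB (1u << 1)
#define FLAG_PS_PARTIAL_FLUSH (1u << 2)
#define FLAG_CS_PARTIAL_FLUSH (1u << 3)

/* Stand-in for emitting a packet into the command stream. */
static void emit(const char *packet)
{
	printf("%s\n", packet);
}

/* Mirrors the ordering decision radv_dispatch() makes after this patch. */
static void dispatch(uint32_t flush_bits, bool pipeline_is_dirty)
{
	bool wait_for_idle = flush_bits & (FLAG_FLUSH_AND_INV_CB |
					   FLAG_FLUSH_AND_INV_DB |
					   FLAG_PS_PARTIAL_FLUSH |
					   FLAG_CS_PARTIAL_FLUSH);

	if (wait_for_idle) {
		/* Set states before the flush so the SET packets overlap
		 * previous work; prefetch last so it overlaps the dispatch. */
		emit("SET compute pipeline state");
		emit("cache flush + wait for idle");   /* <-- CUs idle */
		emit("SET shader pointers (descriptors/constants)");
		emit("DISPATCH");                      /* <-- CUs busy */
		if (pipeline_is_dirty)
			emit("prefetch compute shader");
	} else {
		/* No wait for idle: start the prefetch as early as
		 * possible instead, then set states and dispatch. */
		emit("cache flush (no wait)");
		if (pipeline_is_dirty)
			emit("prefetch compute shader");
		emit("SET shader pointers (descriptors/constants)");
		emit("SET compute pipeline state");
		emit("DISPATCH");
	}
}

int main(void)
{
	dispatch(FLAG_CS_PARTIAL_FLUSH, true); /* idle-wait path */
	printf("---\n");
	dispatch(0, true);                     /* fast path */
	return 0;
}

Running the sketch prints the idle-wait order (prefetch last, overlapping the dispatch) followed by the fast-path order (prefetch first), which is the trade-off the commit message describes.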