summaryrefslogtreecommitdiffstats
path: root/src/amd
diff options
context:
space:
mode:
authorSamuel Pitoiset <[email protected]>2017-10-17 09:47:53 +0200
committerSamuel Pitoiset <[email protected]>2017-10-20 20:07:53 +0200
commit341529dbee5c2b17fdcb7990484a383459bed305 (patch)
tree0c59e373b7811225ca8b6d63be9065f05fa8930e /src/amd
parentaf6985b309ccaef4b055728aaba08cf233542990 (diff)
radv: use optimal packet order for draws
Ported from RadeonSI. The time where shaders are idle should be shorter now. This can give a little boost, like +6% with the dynamicubo Vulkan demo. Signed-off-by: Samuel Pitoiset <[email protected]> Reviewed-by: Bas Nieuwenhuizen <[email protected]>
Diffstat (limited to 'src/amd')
-rw-r--r--src/amd/vulkan/radv_cmd_buffer.c96
1 files changed, 79 insertions, 17 deletions
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 4c50dde395e..d511659d02d 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1001,8 +1001,6 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
radv_emit_fragment_shader(cmd_buffer, pipeline);
radv_emit_vgt_vertex_reuse(cmd_buffer, pipeline);
- radv_emit_shaders_prefetch(cmd_buffer, pipeline);
-
cmd_buffer->scratch_size_needed =
MAX2(cmd_buffer->scratch_size_needed,
pipeline->max_waves * pipeline->scratch_bytes_per_wave);
@@ -1772,6 +1770,19 @@ radv_cmd_buffer_update_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer)
return true;
}
+static bool
+radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
+{
+ if (!radv_cmd_buffer_update_vertex_descriptors(cmd_buffer))
+ return false;
+
+ radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
+ radv_flush_constants(cmd_buffer, cmd_buffer->state.pipeline,
+ VK_SHADER_STAGE_ALL_GRAPHICS);
+
+ return true;
+}
+
static void
radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, bool indexed_draw,
bool instanced_draw, bool indirect_draw,
@@ -3114,16 +3125,9 @@ radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
}
static void
-radv_draw(struct radv_cmd_buffer *cmd_buffer,
- const struct radv_draw_info *info)
+radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
+ const struct radv_draw_info *info)
{
- MAYBE_UNUSED unsigned cdw_max =
- radeon_check_space(cmd_buffer->device->ws,
- cmd_buffer->cs, 4096);
-
- if (!radv_cmd_buffer_update_vertex_descriptors(cmd_buffer))
- return;
-
if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
radv_emit_graphics_pipeline(cmd_buffer);
@@ -3142,19 +3146,77 @@ radv_draw(struct radv_cmd_buffer *cmd_buffer,
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
}
+ radv_cmd_buffer_flush_dynamic_state(cmd_buffer);
+
radv_emit_draw_registers(cmd_buffer, info->indexed,
info->instance_count > 1, info->indirect,
info->indirect ? 0 : info->count);
- radv_cmd_buffer_flush_dynamic_state(cmd_buffer);
+ cmd_buffer->state.dirty = 0;
+}
- radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
- radv_flush_constants(cmd_buffer, cmd_buffer->state.pipeline,
- VK_SHADER_STAGE_ALL_GRAPHICS);
+static void
+radv_draw(struct radv_cmd_buffer *cmd_buffer,
+ const struct radv_draw_info *info)
+{
+ bool pipeline_is_dirty =
+ (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
+ cmd_buffer->state.pipeline &&
+ cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline;
- si_emit_cache_flush(cmd_buffer);
+ MAYBE_UNUSED unsigned cdw_max =
+ radeon_check_space(cmd_buffer->device->ws,
+ cmd_buffer->cs, 4096);
- radv_emit_draw_packets(cmd_buffer, info);
+ /* Use optimal packet order based on whether we need to sync the
+ * pipeline.
+ */
+ if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
+ RADV_CMD_FLAG_FLUSH_AND_INV_DB |
+ RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+ RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
+ /* If we have to wait for idle, set all states first, so that
+ * all SET packets are processed in parallel with previous draw
+ * calls. Then upload descriptors, set shader pointers, and
+ * draw, and prefetch at the end. This ensures that the time
+ * the CUs are idle is very short. (there are only SET_SH
+ * packets between the wait and the draw)
+ */
+ radv_emit_all_graphics_states(cmd_buffer, info);
+ si_emit_cache_flush(cmd_buffer);
+ /* <-- CUs are idle here --> */
+
+ if (!radv_upload_graphics_shader_descriptors(cmd_buffer))
+ return;
+
+ radv_emit_draw_packets(cmd_buffer, info);
+ /* <-- CUs are busy here --> */
+
+ /* Start prefetches after the draw has been started. Both will
+ * run in parallel, but starting the draw first is more
+ * important.
+ */
+ if (pipeline_is_dirty) {
+ radv_emit_shaders_prefetch(cmd_buffer,
+ cmd_buffer->state.pipeline);
+ }
+ } else {
+ /* If we don't wait for idle, start prefetches first, then set
+ * states, and draw at the end.
+ */
+ si_emit_cache_flush(cmd_buffer);
+
+ if (pipeline_is_dirty) {
+ radv_emit_shaders_prefetch(cmd_buffer,
+ cmd_buffer->state.pipeline);
+ }
+
+ if (!radv_upload_graphics_shader_descriptors(cmd_buffer))
+ return;
+
+ radv_emit_all_graphics_states(cmd_buffer, info);
+ radv_emit_draw_packets(cmd_buffer, info);
+ }
assert(cmd_buffer->cs->cdw <= cdw_max);
radv_cmd_buffer_after_draw(cmd_buffer);