summaryrefslogtreecommitdiffstats
path: root/src/amd/vulkan/radv_device.c
diff options
context:
space:
mode:
authorBas Nieuwenhuizen <[email protected]>2017-09-02 12:59:55 +0200
committerBas Nieuwenhuizen <[email protected]>2017-09-04 00:06:40 +0200
commit1a72ca5667f82b8144c0003bddd76a774221ac09 (patch)
tree81694a31d6b27cfaaeb4660b705e8e2f22c57b63 /src/amd/vulkan/radv_device.c
parentdec7b38fe62a1db46c5150a7368d3bb3c5e45305 (diff)
radv: Put semaphore waits in preamble cs.
The separate flush cs gets in the way of batchchain. Reviewed-by: Dave Airlie <[email protected]>
Diffstat (limited to 'src/amd/vulkan/radv_device.c')
-rw-r--r--src/amd/vulkan/radv_device.c94
1 files changed, 36 insertions, 58 deletions
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index a5898691df6..d4dd2dbf5ff 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -1000,6 +1000,8 @@ radv_queue_finish(struct radv_queue *queue)
if (queue->hw_ctx)
queue->device->ws->ctx_destroy(queue->hw_ctx);
+ if (queue->initial_full_flush_preamble_cs)
+ queue->device->ws->cs_destroy(queue->initial_full_flush_preamble_cs);
if (queue->initial_preamble_cs)
queue->device->ws->cs_destroy(queue->initial_preamble_cs);
if (queue->continue_preamble_cs)
@@ -1178,41 +1180,6 @@ VkResult radv_CreateDevice(
break;
}
device->ws->cs_finalize(device->empty_cs[family]);
-
- device->flush_cs[family] = device->ws->cs_create(device->ws, family);
- switch (family) {
- case RADV_QUEUE_GENERAL:
- case RADV_QUEUE_COMPUTE:
- si_cs_emit_cache_flush(device->flush_cs[family],
- false,
- device->physical_device->rad_info.chip_class,
- NULL, 0,
- family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
- RADV_CMD_FLAG_INV_ICACHE |
- RADV_CMD_FLAG_INV_SMEM_L1 |
- RADV_CMD_FLAG_INV_VMEM_L1 |
- RADV_CMD_FLAG_INV_GLOBAL_L2);
- break;
- }
- device->ws->cs_finalize(device->flush_cs[family]);
-
- device->flush_shader_cs[family] = device->ws->cs_create(device->ws, family);
- switch (family) {
- case RADV_QUEUE_GENERAL:
- case RADV_QUEUE_COMPUTE:
- si_cs_emit_cache_flush(device->flush_shader_cs[family],
- false,
- device->physical_device->rad_info.chip_class,
- NULL, 0,
- family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
- family == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH) |
- RADV_CMD_FLAG_INV_ICACHE |
- RADV_CMD_FLAG_INV_SMEM_L1 |
- RADV_CMD_FLAG_INV_VMEM_L1 |
- RADV_CMD_FLAG_INV_GLOBAL_L2);
- break;
- }
- device->ws->cs_finalize(device->flush_shader_cs[family]);
}
if (getenv("RADV_TRACE_FILE")) {
@@ -1280,10 +1247,6 @@ void radv_DestroyDevice(
vk_free(&device->alloc, device->queues[i]);
if (device->empty_cs[i])
device->ws->cs_destroy(device->empty_cs[i]);
- if (device->flush_cs[i])
- device->ws->cs_destroy(device->flush_cs[i]);
- if (device->flush_shader_cs[i])
- device->ws->cs_destroy(device->flush_shader_cs[i]);
}
radv_device_finish_meta(device);
@@ -1576,6 +1539,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
uint32_t gsvs_ring_size,
bool needs_tess_rings,
bool needs_sample_positions,
+ struct radeon_winsys_cs **initial_full_flush_preamble_cs,
struct radeon_winsys_cs **initial_preamble_cs,
struct radeon_winsys_cs **continue_preamble_cs)
{
@@ -1586,7 +1550,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
struct radeon_winsys_bo *gsvs_ring_bo = NULL;
struct radeon_winsys_bo *tess_factor_ring_bo = NULL;
struct radeon_winsys_bo *tess_offchip_ring_bo = NULL;
- struct radeon_winsys_cs *dest_cs[2] = {0};
+ struct radeon_winsys_cs *dest_cs[3] = {0};
bool add_tess_rings = false, add_sample_positions = false;
unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0;
unsigned max_offchip_buffers;
@@ -1611,6 +1575,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
gsvs_ring_size <= queue->gsvs_ring_size &&
!add_tess_rings && !add_sample_positions &&
queue->initial_preamble_cs) {
+ *initial_full_flush_preamble_cs = queue->initial_full_flush_preamble_cs;
*initial_preamble_cs = queue->initial_preamble_cs;
*continue_preamble_cs = queue->continue_preamble_cs;
if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size)
@@ -1712,7 +1677,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
} else
descriptor_bo = queue->descriptor_bo;
- for(int i = 0; i < 2; ++i) {
+ for(int i = 0; i < 3; ++i) {
struct radeon_winsys_cs *cs = NULL;
cs = queue->device->ws->cs_create(queue->device->ws,
queue->queue_family_index ? RING_COMPUTE : RING_GFX);
@@ -1831,7 +1796,19 @@ radv_get_preamble_cs(struct radv_queue *queue,
radeon_emit(cs, rsrc1);
}
- if (!i) {
+ if (i == 0) {
+ si_cs_emit_cache_flush(cs,
+ false,
+ queue->device->physical_device->rad_info.chip_class,
+ NULL, 0,
+ queue->queue_family_index == RING_COMPUTE &&
+ queue->device->physical_device->rad_info.chip_class >= CIK,
+ (queue->queue_family_index == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH)) |
+ RADV_CMD_FLAG_INV_ICACHE |
+ RADV_CMD_FLAG_INV_SMEM_L1 |
+ RADV_CMD_FLAG_INV_VMEM_L1 |
+ RADV_CMD_FLAG_INV_GLOBAL_L2);
+ } else if (i == 1) {
si_cs_emit_cache_flush(cs,
false,
queue->device->physical_device->rad_info.chip_class,
@@ -1848,14 +1825,18 @@ radv_get_preamble_cs(struct radv_queue *queue,
goto fail;
}
+ if (queue->initial_full_flush_preamble_cs)
+ queue->device->ws->cs_destroy(queue->initial_full_flush_preamble_cs);
+
if (queue->initial_preamble_cs)
queue->device->ws->cs_destroy(queue->initial_preamble_cs);
if (queue->continue_preamble_cs)
queue->device->ws->cs_destroy(queue->continue_preamble_cs);
- queue->initial_preamble_cs = dest_cs[0];
- queue->continue_preamble_cs = dest_cs[1];
+ queue->initial_full_flush_preamble_cs = dest_cs[0];
+ queue->initial_preamble_cs = dest_cs[1];
+ queue->continue_preamble_cs = dest_cs[2];
if (scratch_bo != queue->scratch_bo) {
if (queue->scratch_bo)
@@ -1904,6 +1885,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
if (add_sample_positions)
queue->has_sample_positions = true;
+ *initial_full_flush_preamble_cs = queue->initial_full_flush_preamble_cs;
*initial_preamble_cs = queue->initial_preamble_cs;
*continue_preamble_cs = queue->continue_preamble_cs;
if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size)
@@ -2028,7 +2010,7 @@ VkResult radv_QueueSubmit(
uint32_t scratch_size = 0;
uint32_t compute_scratch_size = 0;
uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
- struct radeon_winsys_cs *initial_preamble_cs = NULL, *continue_preamble_cs = NULL;
+ struct radeon_winsys_cs *initial_preamble_cs = NULL, *initial_flush_preamble_cs = NULL, *continue_preamble_cs = NULL;
VkResult result;
bool fence_emitted = false;
bool tess_rings_needed = false;
@@ -2053,7 +2035,7 @@ VkResult radv_QueueSubmit(
result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
esgs_ring_size, gsvs_ring_size, tess_rings_needed,
- sample_positions_needed,
+ sample_positions_needed, &initial_flush_preamble_cs,
&initial_preamble_cs, &continue_preamble_cs);
if (result != VK_SUCCESS)
return result;
@@ -2061,7 +2043,7 @@ VkResult radv_QueueSubmit(
for (uint32_t i = 0; i < submitCount; i++) {
struct radeon_winsys_cs **cs_array;
bool do_flush = !i || pSubmits[i].pWaitDstStageMask;
- bool can_patch = !do_flush;
+ bool can_patch = true;
uint32_t advance;
struct radv_winsys_sem_info sem_info;
@@ -2091,35 +2073,31 @@ VkResult radv_QueueSubmit(
}
cs_array = malloc(sizeof(struct radeon_winsys_cs *) *
- (pSubmits[i].commandBufferCount + do_flush));
-
- if(do_flush)
- cs_array[0] = pSubmits[i].waitSemaphoreCount ?
- queue->device->flush_shader_cs[queue->queue_family_index] :
- queue->device->flush_cs[queue->queue_family_index];
+ (pSubmits[i].commandBufferCount));
for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
pSubmits[i].pCommandBuffers[j]);
assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
- cs_array[j + do_flush] = cmd_buffer->cs;
+ cs_array[j] = cmd_buffer->cs;
if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
can_patch = false;
}
- for (uint32_t j = 0; j < pSubmits[i].commandBufferCount + do_flush; j += advance) {
+ for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) {
+ struct radeon_winsys_cs *initial_preamble = (do_flush && !j) ? initial_flush_preamble_cs : initial_preamble_cs;
advance = MIN2(max_cs_submission,
- pSubmits[i].commandBufferCount + do_flush - j);
+ pSubmits[i].commandBufferCount - j);
if (queue->device->trace_bo)
*queue->device->trace_id_ptr = 0;
sem_info.cs_emit_wait = j == 0;
- sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount + do_flush;
+ sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount;
ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
- advance, initial_preamble_cs, continue_preamble_cs,
+ advance, initial_preamble, continue_preamble_cs,
&sem_info,
can_patch, base_fence);