summaryrefslogtreecommitdiffstats
path: root/src/amd
diff options
context:
space:
mode:
author: Dave Airlie <[email protected]> 2017-06-06 09:01:48 +1000
committer: Dave Airlie <[email protected]> 2017-06-06 09:43:40 +1000
commit: c2fbeb7ca057b3bee8c8cd0f7076af2b90d28111 (patch)
tree: 4e8263e1a9be4ba109e76008bdafe642778db081 /src/amd
parent: b11c4a554681607bbec67f45442b815b51573236 (diff)
radv: add GFX9 cache flushing support.
GFX9 needs to write event EOP to a fence buffer. Allocate some space for this, and just write an ever-increasing number to it. This isn't exactly what radeonsi does, but it seems to work. Reviewed-by: Bas Nieuwenhuizen <[email protected]> Signed-off-by: Dave Airlie <[email protected]>
Diffstat (limited to 'src/amd')
-rw-r--r-- src/amd/vulkan/radv_cmd_buffer.c 8
-rw-r--r-- src/amd/vulkan/radv_device.c 3
-rw-r--r-- src/amd/vulkan/radv_private.h 10
-rw-r--r-- src/amd/vulkan/si_cmd_buffer.c 175
4 files changed, 145 insertions, 51 deletions
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index d66f8979e8f..d078421182d 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -234,6 +234,14 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
cmd_buffer->record_fail = false;
cmd_buffer->ring_offsets_idx = -1;
+
+ if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+ void *fence_ptr;
+ radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 0,
+ &cmd_buffer->gfx9_fence_offset,
+ &fence_ptr);
+ cmd_buffer->gfx9_fence_bo = cmd_buffer->upload.upload_bo;
+ }
}
static bool
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index ca42ab8e0e1..9d510ea59ea 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -1103,6 +1103,7 @@ VkResult radv_CreateDevice(
case RADV_QUEUE_COMPUTE:
si_cs_emit_cache_flush(device->flush_cs[family],
device->physical_device->rad_info.chip_class,
+ NULL, 0,
family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
RADV_CMD_FLAG_INV_ICACHE |
RADV_CMD_FLAG_INV_SMEM_L1 |
@@ -1118,6 +1119,7 @@ VkResult radv_CreateDevice(
case RADV_QUEUE_COMPUTE:
si_cs_emit_cache_flush(device->flush_shader_cs[family],
device->physical_device->rad_info.chip_class,
+ NULL, 0,
family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
family == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH) |
RADV_CMD_FLAG_INV_ICACHE |
@@ -1763,6 +1765,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
if (!i) {
si_cs_emit_cache_flush(cs,
queue->device->physical_device->rad_info.chip_class,
+ NULL, 0,
queue->queue_family_index == RING_COMPUTE &&
queue->device->physical_device->rad_info.chip_class >= CIK,
RADV_CMD_FLAG_INV_ICACHE |
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index e1b9a29cee1..6a6c1e2351a 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -822,6 +822,9 @@ struct radv_cmd_buffer {
bool record_fail;
int ring_offsets_idx; /* just used for verification */
+ uint32_t gfx9_fence_offset;
+ struct radeon_winsys_bo *gfx9_fence_bo;
+ uint32_t gfx9_fence_idx;
};
struct radv_image;
@@ -854,9 +857,10 @@ void si_emit_wait_fence(struct radeon_winsys_cs *cs,
uint64_t va, uint32_t ref,
uint32_t mask);
void si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
- enum chip_class chip_class,
- bool is_mec,
- enum radv_cmd_flush_bits flush_bits);
+ enum chip_class chip_class,
+ uint32_t *fence_ptr, uint64_t va,
+ bool is_mec,
+ enum radv_cmd_flush_bits flush_bits);
void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer);
void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
uint64_t src_va, uint64_t dest_va,
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index eda24be462a..3e0b8ee0200 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -823,15 +823,18 @@ void si_cs_emit_write_event_eop(struct radeon_winsys_cs *cs,
unsigned op = EVENT_TYPE(event) |
EVENT_INDEX(5) |
event_flags;
+ unsigned is_gfx8_mec = is_mec && chip_class < GFX9;
- if (is_mec) {
- radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 5, 0));
+ if (chip_class >= GFX9 || is_gfx8_mec) {
+ radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, is_gfx8_mec ? 5 : 6, 0));
radeon_emit(cs, op);
radeon_emit(cs, EOP_DATA_SEL(data_sel));
radeon_emit(cs, va); /* address lo */
radeon_emit(cs, va >> 32); /* address hi */
radeon_emit(cs, new_fence); /* immediate data lo */
radeon_emit(cs, 0); /* immediate data hi */
+ if (!is_gfx8_mec)
+ radeon_emit(cs, 0); /* unused */
} else {
if (chip_class == CIK ||
chip_class == VI) {
@@ -872,15 +875,16 @@ si_emit_wait_fence(struct radeon_winsys_cs *cs,
static void
si_emit_acquire_mem(struct radeon_winsys_cs *cs,
- bool is_mec,
+ bool is_mec, bool is_gfx9,
unsigned cp_coher_cntl)
{
- if (is_mec) {
+ if (is_mec || is_gfx9) {
+ uint32_t hi_val = is_gfx9 ? 0xffffff : 0xff;
radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0) |
- PKT3_SHADER_TYPE_S(1));
+ PKT3_SHADER_TYPE_S(is_mec));
radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
- radeon_emit(cs, 0xff); /* CP_COHER_SIZE_HI */
+ radeon_emit(cs, hi_val); /* CP_COHER_SIZE_HI */
radeon_emit(cs, 0); /* CP_COHER_BASE */
radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
@@ -897,40 +901,45 @@ si_emit_acquire_mem(struct radeon_winsys_cs *cs,
void
si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
enum chip_class chip_class,
+ uint32_t *flush_cnt,
+ uint64_t flush_va,
bool is_mec,
enum radv_cmd_flush_bits flush_bits)
{
unsigned cp_coher_cntl = 0;
-
+ uint32_t flush_cb_db = flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
+ RADV_CMD_FLAG_FLUSH_AND_INV_DB);
+
if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
if (flush_bits & RADV_CMD_FLAG_INV_SMEM_L1)
cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
- if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
- cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
- S_0085F0_CB0_DEST_BASE_ENA(1) |
- S_0085F0_CB1_DEST_BASE_ENA(1) |
- S_0085F0_CB2_DEST_BASE_ENA(1) |
- S_0085F0_CB3_DEST_BASE_ENA(1) |
- S_0085F0_CB4_DEST_BASE_ENA(1) |
- S_0085F0_CB5_DEST_BASE_ENA(1) |
- S_0085F0_CB6_DEST_BASE_ENA(1) |
- S_0085F0_CB7_DEST_BASE_ENA(1);
-
- /* Necessary for DCC */
- if (chip_class >= VI) {
- si_cs_emit_write_event_eop(cs,
- chip_class,
- is_mec,
- V_028A90_FLUSH_AND_INV_CB_DATA_TS,
- 0, 0, 0, 0, 0);
+ if (chip_class <= VI) {
+ if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
+ cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
+ S_0085F0_CB0_DEST_BASE_ENA(1) |
+ S_0085F0_CB1_DEST_BASE_ENA(1) |
+ S_0085F0_CB2_DEST_BASE_ENA(1) |
+ S_0085F0_CB3_DEST_BASE_ENA(1) |
+ S_0085F0_CB4_DEST_BASE_ENA(1) |
+ S_0085F0_CB5_DEST_BASE_ENA(1) |
+ S_0085F0_CB6_DEST_BASE_ENA(1) |
+ S_0085F0_CB7_DEST_BASE_ENA(1);
+
+ /* Necessary for DCC */
+ if (chip_class >= VI) {
+ si_cs_emit_write_event_eop(cs,
+ chip_class,
+ is_mec,
+ V_028A90_FLUSH_AND_INV_CB_DATA_TS,
+ 0, 0, 0, 0, 0);
+ }
+ }
+ if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
+ cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
+ S_0085F0_DB_DEST_BASE_ENA(1);
}
- }
-
- if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
- cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
- S_0085F0_DB_DEST_BASE_ENA(1);
}
if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) {
@@ -943,8 +952,7 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
}
- if (!(flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
- RADV_CMD_FLAG_FLUSH_AND_INV_DB))) {
+ if (!flush_cb_db) {
if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
@@ -959,6 +967,54 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
}
+ if (chip_class >= GFX9 && flush_cb_db) {
+ unsigned cb_db_event, tc_flags;
+
+ /* Set the CB/DB flush event. */
+ switch (flush_cb_db) {
+ case RADV_CMD_FLAG_FLUSH_AND_INV_CB:
+ cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
+ break;
+ case RADV_CMD_FLAG_FLUSH_AND_INV_DB:
+ cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
+ break;
+ default:
+ /* both CB & DB */
+ cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
+ }
+
+ /* TC | TC_WB = invalidate L2 data
+ * TC_MD | TC_WB = invalidate L2 metadata
+ * TC | TC_WB | TC_MD = invalidate L2 data & metadata
+ *
+ * The metadata cache must always be invalidated for coherency
+ * between CB/DB and shaders. (metadata = HTILE, CMASK, DCC)
+ *
+ * TC must be invalidated on GFX9 only if the CB/DB surface is
+ * not pipe-aligned. If the surface is RB-aligned, it might not
+ * strictly be pipe-aligned since RB alignment takes precedence.
+ */
+ tc_flags = EVENT_TC_WB_ACTION_ENA |
+ EVENT_TC_MD_ACTION_ENA;
+
+ /* Ideally flush TC together with CB/DB. */
+ if (flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) {
+ tc_flags |= EVENT_TC_ACTION_ENA |
+ EVENT_TCL1_ACTION_ENA;
+
+ /* Clear the flags. */
+ flush_bits &= ~(RADV_CMD_FLAG_INV_GLOBAL_L2 |
+ RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2 |
+ RADV_CMD_FLAG_INV_VMEM_L1);
+ }
+ assert(flush_cnt);
+ uint32_t old_fence = (*flush_cnt)++;
+
+ si_cs_emit_write_event_eop(cs, chip_class, false, cb_db_event, tc_flags, 1,
+ flush_va, old_fence, *flush_cnt);
+ si_emit_wait_fence(cs, flush_va, *flush_cnt, 0xffffffff);
+ }
+
/* VGT state sync */
if (flush_bits & RADV_CMD_FLAG_VGT_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
@@ -968,7 +1024,11 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
/* Make sure ME is idle (it executes most packets) before continuing.
* This prevents read-after-write hazards between PFP and ME.
*/
- if ((cp_coher_cntl || (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) &&
+ if ((cp_coher_cntl ||
+ (flush_bits & (RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
+ RADV_CMD_FLAG_INV_VMEM_L1 |
+ RADV_CMD_FLAG_INV_GLOBAL_L2 |
+ RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) &&
!is_mec) {
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
@@ -976,34 +1036,46 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
if ((flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) ||
(chip_class <= CIK && (flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) {
- cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
- if (chip_class >= VI)
- cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1);
- } else if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) {
- cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1) |
- S_0301F0_TC_NC_ACTION_ENA(1);
-
- /* L2 writeback doesn't combine with L1 invalidate */
- si_emit_acquire_mem(cs, is_mec, cp_coher_cntl);
-
+ si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
+ cp_coher_cntl |
+ S_0085F0_TC_ACTION_ENA(1) |
+ S_0085F0_TCL1_ACTION_ENA(1) |
+ S_0301F0_TC_WB_ACTION_ENA(chip_class >= VI));
cp_coher_cntl = 0;
+ } else {
+ if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) {
+ /* WB = write-back
+ * NC = apply to non-coherent MTYPEs
+ * (i.e. MTYPE <= 1, which is what we use everywhere)
+ *
+ * WB doesn't work without NC.
+ */
+ si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
+ cp_coher_cntl |
+ S_0301F0_TC_WB_ACTION_ENA(1) |
+ S_0301F0_TC_NC_ACTION_ENA(1));
+ cp_coher_cntl = 0;
+ }
+ if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) {
+ si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
+ cp_coher_cntl |
+ S_0085F0_TCL1_ACTION_ENA(1));
+ cp_coher_cntl = 0;
+ }
}
- if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1)
- cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
-
/* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle.
* Therefore, it should be last. Done in PFP.
*/
if (cp_coher_cntl)
- si_emit_acquire_mem(cs, is_mec, cp_coher_cntl);
+ si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9, cp_coher_cntl);
}
void
si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
{
bool is_compute = cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE;
-
+ enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
if (is_compute)
cmd_buffer->state.flush_bits &= ~(RADV_CMD_FLAG_FLUSH_AND_INV_CB |
RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
@@ -1015,8 +1087,15 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128);
+ uint32_t *ptr = NULL;
+ uint64_t va = 0;
+ if (chip_class == GFX9) {
+ va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->gfx9_fence_bo) + cmd_buffer->gfx9_fence_offset;
+ ptr = &cmd_buffer->gfx9_fence_idx;
+ }
si_cs_emit_cache_flush(cmd_buffer->cs,
cmd_buffer->device->physical_device->rad_info.chip_class,
+ ptr, va,
radv_cmd_buffer_uses_mec(cmd_buffer),
cmd_buffer->state.flush_bits);