| author | Dave Airlie <[email protected]> | 2017-06-06 09:01:48 +1000 |
|---|---|---|
| committer | Dave Airlie <[email protected]> | 2017-06-06 09:43:40 +1000 |
| commit | c2fbeb7ca057b3bee8c8cd0f7076af2b90d28111 (patch) | |
| tree | 4e8263e1a9be4ba109e76008bdafe642778db081 /src/amd/vulkan/si_cmd_buffer.c | |
| parent | b11c4a554681607bbec67f45442b815b51573236 (diff) | |
radv: add GFX9 cache flushing support.
GFX9 needs the EOP event to write to a fence buffer, so allocate some
space for this and just write an ever-increasing number to it.
This isn't exactly what radeonsi does, but it seems to work.
Reviewed-by: Bas Nieuwenhuizen <[email protected]>
Signed-off-by: Dave Airlie <[email protected]>
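
The mechanism the message describes is a monotonic fence: each GFX9 cache flush bumps a per-command-buffer counter (`gfx9_fence_idx` in the diff below), the EOP event writes the new value into the fence BO, and a WAIT_REG_MEM poll blocks until that value lands. Below is a minimal standalone sketch of that handshake, with plain memory writes standing in for the RELEASE_MEM and wait packets (`fence_state`, `emit_eop_fence_write`, and `fence_signaled` are illustrative names, not radv API):

```c
#include <stdint.h>
#include <stdio.h>

/* Toy model of the GFX9 fence-counter handshake: the counter only ever
 * increases, so a waiter is released exactly when the GPU has written a
 * value >= the one it waits for. */
struct fence_state {
	uint32_t counter;   /* mirrors cmd_buffer->gfx9_fence_idx */
	uint32_t fence_mem; /* mirrors the dword in gfx9_fence_bo */
};

/* Stand-in for si_cs_emit_write_event_eop(): the EOP event writes the
 * new fence value to memory at end-of-pipe. */
static void emit_eop_fence_write(struct fence_state *f, uint32_t new_fence)
{
	f->fence_mem = new_fence;
}

/* Stand-in for si_emit_wait_fence(): WAIT_REG_MEM with a >= compare and
 * mask 0xffffffff, as the patch emits. */
static int fence_signaled(const struct fence_state *f, uint32_t ref)
{
	return (f->fence_mem & 0xffffffff) >= ref;
}

int main(void)
{
	struct fence_state f = {0};

	/* Mirrors the patch: old_fence = (*flush_cnt)++; then wait on the
	 * incremented value. */
	uint32_t old_fence = f.counter++;
	emit_eop_fence_write(&f, f.counter);

	printf("old=%u new=%u signaled=%d\n", (unsigned)old_fence,
	       (unsigned)f.counter, fence_signaled(&f, f.counter));
	return 0;
}
```

Because the value only grows, a stale fence write can never satisfy a later wait, which is why an ever-increasing number is sufficient here.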
Diffstat (limited to 'src/amd/vulkan/si_cmd_buffer.c')
-rw-r--r-- | src/amd/vulkan/si_cmd_buffer.c | 175
1 file changed, 127 insertions, 48 deletions
```diff
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index eda24be462a..3e0b8ee0200 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -823,15 +823,18 @@ void si_cs_emit_write_event_eop(struct radeon_winsys_cs *cs,
 	unsigned op = EVENT_TYPE(event) |
 		EVENT_INDEX(5) |
 		event_flags;
+	unsigned is_gfx8_mec = is_mec && chip_class < GFX9;
 
-	if (is_mec) {
-		radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 5, 0));
+	if (chip_class >= GFX9 || is_gfx8_mec) {
+		radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, is_gfx8_mec ? 5 : 6, 0));
 		radeon_emit(cs, op);
 		radeon_emit(cs, EOP_DATA_SEL(data_sel));
 		radeon_emit(cs, va);            /* address lo */
 		radeon_emit(cs, va >> 32);      /* address hi */
 		radeon_emit(cs, new_fence);     /* immediate data lo */
 		radeon_emit(cs, 0); /* immediate data hi */
+		if (!is_gfx8_mec)
+			radeon_emit(cs, 0); /* unused */
 	} else {
 		if (chip_class == CIK ||
 		    chip_class == VI) {
@@ -872,15 +875,16 @@ si_emit_wait_fence(struct radeon_winsys_cs *cs,
 
 static void
 si_emit_acquire_mem(struct radeon_winsys_cs *cs,
-		    bool is_mec,
+		    bool is_mec, bool is_gfx9,
 		    unsigned cp_coher_cntl)
 {
-	if (is_mec) {
+	if (is_mec || is_gfx9) {
+		uint32_t hi_val = is_gfx9 ? 0xffffff : 0xff;
 		radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0) |
-			    PKT3_SHADER_TYPE_S(1));
+			    PKT3_SHADER_TYPE_S(is_mec));
 		radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
 		radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
-		radeon_emit(cs, 0xff);            /* CP_COHER_SIZE_HI */
+		radeon_emit(cs, hi_val);          /* CP_COHER_SIZE_HI */
 		radeon_emit(cs, 0);               /* CP_COHER_BASE */
 		radeon_emit(cs, 0);               /* CP_COHER_BASE_HI */
 		radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
@@ -897,40 +901,45 @@ si_emit_acquire_mem(struct radeon_winsys_cs *cs,
 void
 si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
                        enum chip_class chip_class,
+		       uint32_t *flush_cnt,
+		       uint64_t flush_va,
                        bool is_mec,
                        enum radv_cmd_flush_bits flush_bits)
 {
 	unsigned cp_coher_cntl = 0;
-
+	uint32_t flush_cb_db = flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
+					     RADV_CMD_FLAG_FLUSH_AND_INV_DB);
+
 	if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
 		cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
 	if (flush_bits & RADV_CMD_FLAG_INV_SMEM_L1)
 		cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
 
-	if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
-		cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
-			S_0085F0_CB0_DEST_BASE_ENA(1) |
-			S_0085F0_CB1_DEST_BASE_ENA(1) |
-			S_0085F0_CB2_DEST_BASE_ENA(1) |
-			S_0085F0_CB3_DEST_BASE_ENA(1) |
-			S_0085F0_CB4_DEST_BASE_ENA(1) |
-			S_0085F0_CB5_DEST_BASE_ENA(1) |
-			S_0085F0_CB6_DEST_BASE_ENA(1) |
-			S_0085F0_CB7_DEST_BASE_ENA(1);
-
-		/* Necessary for DCC */
-		if (chip_class >= VI) {
-			si_cs_emit_write_event_eop(cs,
-						   chip_class,
-						   is_mec,
-						   V_028A90_FLUSH_AND_INV_CB_DATA_TS,
-						   0, 0, 0, 0, 0);
+	if (chip_class <= VI) {
+		if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
+			cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
+				S_0085F0_CB0_DEST_BASE_ENA(1) |
+				S_0085F0_CB1_DEST_BASE_ENA(1) |
+				S_0085F0_CB2_DEST_BASE_ENA(1) |
+				S_0085F0_CB3_DEST_BASE_ENA(1) |
+				S_0085F0_CB4_DEST_BASE_ENA(1) |
+				S_0085F0_CB5_DEST_BASE_ENA(1) |
+				S_0085F0_CB6_DEST_BASE_ENA(1) |
+				S_0085F0_CB7_DEST_BASE_ENA(1);
+
+			/* Necessary for DCC */
+			if (chip_class >= VI) {
+				si_cs_emit_write_event_eop(cs,
+							   chip_class,
+							   is_mec,
+							   V_028A90_FLUSH_AND_INV_CB_DATA_TS,
+							   0, 0, 0, 0, 0);
+			}
+		}
+		if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
+			cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
+				S_0085F0_DB_DEST_BASE_ENA(1);
 		}
-	}
-
-	if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
-		cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
-			S_0085F0_DB_DEST_BASE_ENA(1);
 	}
 
 	if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) {
@@ -943,8 +952,7 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
 		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
 	}
 
-	if (!(flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
-			    RADV_CMD_FLAG_FLUSH_AND_INV_DB))) {
+	if (!flush_cb_db) {
 		if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
 			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 			radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
@@ -959,6 +967,54 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
 		radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 	}
 
+	if (chip_class >= GFX9 && flush_cb_db) {
+		unsigned cb_db_event, tc_flags;
+
+		/* Set the CB/DB flush event. */
+		switch (flush_cb_db) {
+		case RADV_CMD_FLAG_FLUSH_AND_INV_CB:
+			cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
+			break;
+		case RADV_CMD_FLAG_FLUSH_AND_INV_DB:
+			cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
+			break;
+		default:
+			/* both CB & DB */
+			cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
+		}
+
+		/* TC    | TC_WB         = invalidate L2 data
+		 * TC_MD | TC_WB         = invalidate L2 metadata
+		 * TC    | TC_WB | TC_MD = invalidate L2 data & metadata
+		 *
+		 * The metadata cache must always be invalidated for coherency
+		 * between CB/DB and shaders. (metadata = HTILE, CMASK, DCC)
+		 *
+		 * TC must be invalidated on GFX9 only if the CB/DB surface is
+		 * not pipe-aligned. If the surface is RB-aligned, it might not
+		 * strictly be pipe-aligned since RB alignment takes precedence.
+		 */
+		tc_flags = EVENT_TC_WB_ACTION_ENA |
+			   EVENT_TC_MD_ACTION_ENA;
+
+		/* Ideally flush TC together with CB/DB. */
+		if (flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) {
+			tc_flags |= EVENT_TC_ACTION_ENA |
+				    EVENT_TCL1_ACTION_ENA;
+
+			/* Clear the flags. */
+			flush_bits &= ~(RADV_CMD_FLAG_INV_GLOBAL_L2 |
+					RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2 |
+					RADV_CMD_FLAG_INV_VMEM_L1);
+		}
+		assert(flush_cnt);
+		uint32_t old_fence = (*flush_cnt)++;
+
+		si_cs_emit_write_event_eop(cs, chip_class, false, cb_db_event, tc_flags, 1,
+					   flush_va, old_fence, *flush_cnt);
+		si_emit_wait_fence(cs, flush_va, *flush_cnt, 0xffffffff);
+	}
+
 	/* VGT state sync */
 	if (flush_bits & RADV_CMD_FLAG_VGT_FLUSH) {
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
@@ -968,7 +1024,11 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
 	/* Make sure ME is idle (it executes most packets) before continuing.
 	 * This prevents read-after-write hazards between PFP and ME.
 	 */
-	if ((cp_coher_cntl || (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) &&
+	if ((cp_coher_cntl ||
+	     (flush_bits & (RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
+			    RADV_CMD_FLAG_INV_VMEM_L1 |
+			    RADV_CMD_FLAG_INV_GLOBAL_L2 |
+			    RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) &&
 	    !is_mec) {
 		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
 		radeon_emit(cs, 0);
@@ -976,34 +1036,46 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
 	if ((flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) ||
 	    (chip_class <= CIK &&
 	     (flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) {
-		cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
-		if (chip_class >= VI)
-			cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1);
-	} else if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) {
-		cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1) |
-			S_0301F0_TC_NC_ACTION_ENA(1);
-
-		/* L2 writeback doesn't combine with L1 invalidate */
-		si_emit_acquire_mem(cs, is_mec, cp_coher_cntl);
-
+		si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
+				    cp_coher_cntl |
+				    S_0085F0_TC_ACTION_ENA(1) |
+				    S_0085F0_TCL1_ACTION_ENA(1) |
+				    S_0301F0_TC_WB_ACTION_ENA(chip_class >= VI));
 		cp_coher_cntl = 0;
+	} else {
+		if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) {
+			/* WB = write-back
+			 * NC = apply to non-coherent MTYPEs
+			 *      (i.e. MTYPE <= 1, which is what we use everywhere)
+			 *
+			 * WB doesn't work without NC.
+			 */
+			si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
+					    cp_coher_cntl |
+					    S_0301F0_TC_WB_ACTION_ENA(1) |
+					    S_0301F0_TC_NC_ACTION_ENA(1));
+			cp_coher_cntl = 0;
+		}
+		if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) {
+			si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
+					    cp_coher_cntl |
+					    S_0085F0_TCL1_ACTION_ENA(1));
+			cp_coher_cntl = 0;
+		}
 	}
 
-	if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1)
-		cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
-
 	/* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle.
 	 * Therefore, it should be last.  Done in PFP.
 	 */
 	if (cp_coher_cntl)
-		si_emit_acquire_mem(cs, is_mec, cp_coher_cntl);
+		si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9, cp_coher_cntl);
 }
 
 void
 si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
 {
 	bool is_compute = cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE;
-
+	enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
 	if (is_compute)
 		cmd_buffer->state.flush_bits &= ~(RADV_CMD_FLAG_FLUSH_AND_INV_CB |
 						  RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
@@ -1015,8 +1087,15 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
 
 	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128);
 
+	uint32_t *ptr = NULL;
+	uint64_t va = 0;
+	if (chip_class == GFX9) {
+		va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->gfx9_fence_bo) + cmd_buffer->gfx9_fence_offset;
+		ptr = &cmd_buffer->gfx9_fence_idx;
+	}
 	si_cs_emit_cache_flush(cmd_buffer->cs,
 	                       cmd_buffer->device->physical_device->rad_info.chip_class,
+			       ptr, va,
 	                       radv_cmd_buffer_uses_mec(cmd_buffer),
 	                       cmd_buffer->state.flush_bits);
```
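
One detail in `si_emit_acquire_mem()` worth spelling out: the GFX9 path raises `CP_COHER_SIZE_HI` from `0xff` to `0xffffff`, so the "sync everything" range still covers the whole, larger GFX9 virtual address space. Below is a quick sketch of my reading of those fields (the bit-width interpretation is an assumption on my part, not something the patch states): `CP_COHER_SIZE` holds the low 32 bits of the range and `CP_COHER_SIZE_HI` the bits above them.

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Values taken from the patch. */
	const uint32_t size_lo = 0xffffffff; /* CP_COHER_SIZE */
	const uint32_t hi_gfx8 = 0xff;       /* CP_COHER_SIZE_HI before GFX9 */
	const uint32_t hi_gfx9 = 0xffffff;   /* CP_COHER_SIZE_HI on GFX9 */

	/* Assumed layout: SIZE_HI supplies the bits above the low 32. */
	uint64_t range_gfx8 = ((uint64_t)hi_gfx8 << 32) | size_lo; /* 2^40 - 1 */
	uint64_t range_gfx9 = ((uint64_t)hi_gfx9 << 32) | size_lo; /* 2^56 - 1 */

	printf("pre-GFX9 full range: 0x%" PRIx64 "\n", range_gfx8);
	printf("GFX9 full range:     0x%" PRIx64 "\n", range_gfx9);
	return 0;
}
```

The same function also switches `PKT3_SHADER_TYPE_S(1)` to `PKT3_SHADER_TYPE_S(is_mec)`: this path used to run only on the compute ring (MEC), but on GFX9 the packet is emitted on the graphics queue as well, where the compute shader-type flag must not be set.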