summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMarek Olšák <[email protected]>2016-10-10 18:49:22 +0200
committerMarek Olšák <[email protected]>2016-10-12 18:29:40 +0200
commit8cdce30cc20983dcb971dd906a9a9007e282081d (patch)
treed807f6a1400a42c99add320d75eae3aac988e2e5
parent65a4d55a9ff12b44655803da10112d3b1b42ce13 (diff)
radeonsi: implement TC L2 write-back (flush) without cache invalidation
Reviewed-by: Nicolai Hähnle <[email protected]>
-rw-r--r--src/gallium/drivers/radeonsi/si_pipe.h21
-rw-r--r--src/gallium/drivers/radeonsi/si_state_draw.c81
2 files changed, 74 insertions, 28 deletions
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 3cefee7ad2f..e10d3fb4fc5 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -50,17 +50,20 @@
#define SI_CONTEXT_INV_VMEM_L1 (R600_CONTEXT_PRIVATE_FLAG << 2)
/* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
#define SI_CONTEXT_INV_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG << 3)
+/* Write dirty L2 lines back to memory (shader and CP DMA stores), but don't
+ * invalidate L2. SI-CIK can't do it, so they will do complete invalidation. */
+#define SI_CONTEXT_WRITEBACK_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG << 4)
/* Framebuffer caches. */
-#define SI_CONTEXT_FLUSH_AND_INV_CB_META (R600_CONTEXT_PRIVATE_FLAG << 4)
-#define SI_CONTEXT_FLUSH_AND_INV_DB_META (R600_CONTEXT_PRIVATE_FLAG << 5)
-#define SI_CONTEXT_FLUSH_AND_INV_DB (R600_CONTEXT_PRIVATE_FLAG << 6)
-#define SI_CONTEXT_FLUSH_AND_INV_CB (R600_CONTEXT_PRIVATE_FLAG << 7)
+#define SI_CONTEXT_FLUSH_AND_INV_CB_META (R600_CONTEXT_PRIVATE_FLAG << 5)
+#define SI_CONTEXT_FLUSH_AND_INV_DB_META (R600_CONTEXT_PRIVATE_FLAG << 6)
+#define SI_CONTEXT_FLUSH_AND_INV_DB (R600_CONTEXT_PRIVATE_FLAG << 7)
+#define SI_CONTEXT_FLUSH_AND_INV_CB (R600_CONTEXT_PRIVATE_FLAG << 8)
/* Engine synchronization. */
-#define SI_CONTEXT_VS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 8)
-#define SI_CONTEXT_PS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 9)
-#define SI_CONTEXT_CS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 10)
-#define SI_CONTEXT_VGT_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 11)
-#define SI_CONTEXT_VGT_STREAMOUT_SYNC (R600_CONTEXT_PRIVATE_FLAG << 12)
+#define SI_CONTEXT_VS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 9)
+#define SI_CONTEXT_PS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 10)
+#define SI_CONTEXT_CS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 11)
+#define SI_CONTEXT_VGT_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 12)
+#define SI_CONTEXT_VGT_STREAMOUT_SYNC (R600_CONTEXT_PRIVATE_FLAG << 13)
#define SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER (SI_CONTEXT_FLUSH_AND_INV_CB | \
SI_CONTEXT_FLUSH_AND_INV_CB_META | \
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 38e5cb45785..33b6b232343 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -696,6 +696,19 @@ static void si_emit_draw_packets(struct si_context *sctx,
}
}
+static void si_emit_surface_sync(struct r600_common_context *rctx,
+ unsigned cp_coher_cntl)
+{
+ struct radeon_winsys_cs *cs = rctx->gfx.cs;
+
+ /* ACQUIRE_MEM is only required on a compute ring. */
+ radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
+ radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
+ radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
+ radeon_emit(cs, 0); /* CP_COHER_BASE */
+ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+}
+
void si_emit_cache_flush(struct si_context *sctx)
{
struct r600_common_context *rctx = &sctx->b;
@@ -715,15 +728,6 @@ void si_emit_cache_flush(struct si_context *sctx)
if (rctx->flags & SI_CONTEXT_INV_SMEM_L1)
cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
- if (rctx->flags & SI_CONTEXT_INV_VMEM_L1)
- cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
- if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
- cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
-
- if (rctx->chip_class >= VI)
- cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1);
- }
-
if (rctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
S_0085F0_CB0_DEST_BASE_ENA(1) |
@@ -806,23 +810,62 @@ void si_emit_cache_flush(struct si_context *sctx)
/* Make sure ME is idle (it executes most packets) before continuing.
* This prevents read-after-write hazards between PFP and ME.
*/
- if (cp_coher_cntl || (rctx->flags & SI_CONTEXT_CS_PARTIAL_FLUSH)) {
+ if (cp_coher_cntl ||
+ (rctx->flags & (SI_CONTEXT_CS_PARTIAL_FLUSH |
+ SI_CONTEXT_INV_VMEM_L1 |
+ SI_CONTEXT_INV_GLOBAL_L2 |
+ SI_CONTEXT_WRITEBACK_GLOBAL_L2))) {
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
}
- /* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle.
- * Therefore, it should be last. Done in PFP.
+ /* When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
+ * waits for idle. Therefore, it should be last. SURFACE_SYNC is done
+ * in PFP.
+ *
+ * cp_coher_cntl should contain all necessary flags except TC flags
+ * at this point.
+ *
+ * SI-CIK don't support L2 write-back.
*/
- if (cp_coher_cntl) {
- /* ACQUIRE_MEM is only required on a compute ring. */
- radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
- radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
- radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
- radeon_emit(cs, 0); /* CP_COHER_BASE */
- radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+ if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2 ||
+ (rctx->chip_class <= CIK &&
+ (rctx->flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2))) {
+ /* Invalidate L1 & L2. (L1 is always invalidated)
+ * WB must be set on VI+ when TC_ACTION is set.
+ */
+ si_emit_surface_sync(rctx, cp_coher_cntl |
+ S_0085F0_TC_ACTION_ENA(1) |
+ S_0301F0_TC_WB_ACTION_ENA(rctx->chip_class >= VI));
+ cp_coher_cntl = 0;
+ } else {
+ /* L1 invalidation and L2 writeback must be done separately,
+ * because both operations can't be done together.
+ */
+ if (rctx->flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2) {
+ /* WB = write-back
+ * NC = apply to non-coherent MTYPEs
+ * (i.e. MTYPE <= 1, which is what we use everywhere)
+ *
+ * WB doesn't work without NC.
+ */
+ si_emit_surface_sync(rctx, cp_coher_cntl |
+ S_0301F0_TC_WB_ACTION_ENA(1) |
+ S_0301F0_TC_NC_ACTION_ENA(1));
+ cp_coher_cntl = 0;
+ }
+ if (rctx->flags & SI_CONTEXT_INV_VMEM_L1) {
+ /* Invalidate per-CU VMEM L1. */
+ si_emit_surface_sync(rctx, cp_coher_cntl |
+ S_0085F0_TCL1_ACTION_ENA(1));
+ cp_coher_cntl = 0;
+ }
}
+ /* If TC flushes haven't cleared this... */
+ if (cp_coher_cntl)
+ si_emit_surface_sync(rctx, cp_coher_cntl);
+
if (rctx->flags & R600_CONTEXT_START_PIPELINE_STATS) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |