summary | refs | log | tree | commit | diff | stats
path: root/src/gallium
diff options
context:
space:
mode:
author: Marek Olšák <marek.olsak@amd.com> 2016-05-26 22:00:03 +0200
committer: Marek Olšák <marek.olsak@amd.com> 2016-06-04 15:42:33 +0200
commit: 5ea5ed60500a8612166853975b42abd40a459216 (patch)
tree: 5b9bc8656166ed7b359b3ceca87c54591a3bdb6e /src/gallium
parent: ade16e1f5d046f6407c4f0046efb8363520adcf0 (diff)
r600g: fix CP DMA hazard with index buffer fetches (v3)
v3: use PFP_SYNC_ME on EG-CM only when supported by the kernel,
    otherwise use MEM_WRITE + WAIT_REG_MEM to emulate that

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Tested-by: Grazvydas Ignotas <notasas@gmail.com>
Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
Diffstat (limited to 'src/gallium')
-rw-r--r-- src/gallium/drivers/r600/evergreen_hw_context.c | 16
-rw-r--r-- src/gallium/drivers/r600/evergreend.h | 1
-rw-r--r-- src/gallium/drivers/r600/r600_blit.c | 2
-rw-r--r-- src/gallium/drivers/r600/r600_hw_context.c | 69
-rw-r--r-- src/gallium/drivers/r600/r600_pipe.h | 5
-rw-r--r-- src/gallium/drivers/r600/r600d.h | 5
-rw-r--r-- src/gallium/drivers/radeonsi/sid.h | 2
7 files changed, 93 insertions, 7 deletions
diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
index f456696970c..2feb8015082 100644
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -85,7 +85,8 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx,
void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
struct pipe_resource *dst, uint64_t offset,
- unsigned size, uint32_t clear_value)
+ unsigned size, uint32_t clear_value,
+ enum r600_coherency coher)
{
struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
@@ -117,7 +118,9 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
unsigned reloc;
- r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE);
+ r600_need_cs_space(rctx,
+ 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
+ R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);
/* Flush the caches for the first copy only. */
if (rctx->b.flags) {
@@ -148,9 +151,16 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
offset += byte_count;
}
+ /* CP DMA is executed in ME, but index buffers are read by PFP.
+ * This ensures that ME (CP DMA) is idle before PFP starts fetching
+ * indices. If we wanted to execute CP DMA in PFP, this packet
+ * should precede it.
+ */
+ if (coher == R600_COHERENCY_SHADER)
+ r600_emit_pfp_sync_me(rctx);
+
/* Invalidate the read caches. */
rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
R600_CONTEXT_INV_VERTEX_CACHE |
R600_CONTEXT_INV_TEX_CACHE;
}
-
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index c1c616910de..a81b6c5fc81 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -88,6 +88,7 @@
#define WAIT_REG_MEM_EQUAL 3
#define PKT3_MEM_WRITE 0x3D
#define PKT3_INDIRECT_BUFFER 0x32
+#define PKT3_PFP_SYNC_ME 0x42
#define PKT3_SURFACE_SYNC 0x43
#define PKT3_ME_INITIALIZE 0x44
#define PKT3_COND_WRITE 0x45
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 282645f1496..76c3364a818 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -589,7 +589,7 @@ static void r600_clear_buffer(struct pipe_context *ctx, struct pipe_resource *ds
if (rctx->screen->b.has_cp_dma &&
rctx->b.chip_class >= EVERGREEN &&
offset % 4 == 0 && size % 4 == 0) {
- evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value);
+ evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value, coher);
} else if (rctx->screen->b.has_streamout && offset % 4 == 0 && size % 4 == 0) {
union pipe_color_union clear_value;
clear_value.ui[0] = value;
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index 808bd27607f..3ba723d0541 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -364,6 +364,66 @@ void r600_begin_new_cs(struct r600_context *ctx)
ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->current.cdw;
}
+void r600_emit_pfp_sync_me(struct r600_context *rctx)
+{
+ struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+
+ if (rctx->b.chip_class >= EVERGREEN &&
+ rctx->b.screen->info.drm_minor >= 46) { /* per the commit message: only when the kernel supports the packet */
+ radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+ radeon_emit(cs, 0);
+ } else {
+ /* Emulate PFP_SYNC_ME by writing a value to memory in ME and
+ * waiting for it in PFP. This path emits 16 dwords (5 + 2 + 7 + 2).
+ */
+ struct r600_resource *buf = NULL;
+ unsigned offset, reloc;
+ uint64_t va;
+
+ /* 16-byte address alignment is required by WAIT_REG_MEM; 4 bytes are requested from allocator_zeroed_memory. */
+ u_suballocator_alloc(rctx->b.allocator_zeroed_memory, 4, 16,
+ &offset, (struct pipe_resource**)&buf);
+ if (!buf) {
+ /* Allocation failed: flush the gfx CS instead and emit no packets. This is too heavyweight, but will work. */
+ rctx->b.gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
+ return;
+ }
+
+ reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, buf,
+ RADEON_USAGE_READWRITE,
+ RADEON_PRIO_FENCE);
+
+ va = buf->gpu_address + offset;
+ assert(va % 16 == 0);
+
+ /* Write 1 to memory in ME (5 dwords, followed by a 2-dword NOP carrying reloc). */
+ radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
+ radeon_emit(cs, va);
+ radeon_emit(cs, ((va >> 32) & 0xff) | MEM_WRITE_32_BITS);
+ radeon_emit(cs, 1);
+ radeon_emit(cs, 0);
+
+ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+ radeon_emit(cs, reloc);
+
+ /* Wait in PFP until the dword is >= 1 (PFP can only do GEQUAL against memory); 7 dwords, followed by a 2-dword NOP carrying reloc. */
+ radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+ radeon_emit(cs, WAIT_REG_MEM_GEQUAL |
+ WAIT_REG_MEM_MEMORY |
+ WAIT_REG_MEM_PFP);
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ radeon_emit(cs, 1); /* reference value */
+ radeon_emit(cs, 0xffffffff); /* mask */
+ radeon_emit(cs, 4); /* poll interval */
+
+ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+ radeon_emit(cs, reloc);
+
+ r600_resource_reference(&buf, NULL);
+ }
+}
+
/* The max number of bytes to copy per packet. */
#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
@@ -407,7 +467,7 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
r600_need_cs_space(rctx,
10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
- 3, FALSE);
+ 3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);
/* Flush the caches for the first copy only. */
if (rctx->b.flags) {
@@ -447,6 +507,13 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
radeon_set_config_reg(cs, R_008040_WAIT_UNTIL,
S_008040_WAIT_CP_DMA_IDLE(1));
+ /* CP DMA is executed in ME, but index buffers are read by PFP.
+ * This ensures that ME (CP DMA) is idle before PFP starts fetching
+ * indices. If we wanted to execute CP DMA in PFP, this packet
+ * should precede it.
+ */
+ r600_emit_pfp_sync_me(rctx);
+
/* Invalidate the read caches. */
rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
R600_CONTEXT_INV_VERTEX_CACHE |
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 76178c22509..313bf69c314 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -57,6 +57,7 @@
/* the number of CS dwords for flushing and drawing */
#define R600_MAX_FLUSH_CS_DWORDS 18
#define R600_MAX_DRAW_CS_DWORDS 58
+#define R600_MAX_PFP_SYNC_ME_DWORDS 16
#define R600_MAX_USER_CONST_BUFFERS 13
#define R600_MAX_DRIVER_CONST_BUFFERS 3
@@ -663,13 +664,15 @@ void r600_context_gfx_flush(void *context, unsigned flags,
void r600_begin_new_cs(struct r600_context *ctx);
void r600_flush_emit(struct r600_context *ctx);
void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in);
+void r600_emit_pfp_sync_me(struct r600_context *rctx);
void r600_cp_dma_copy_buffer(struct r600_context *rctx,
struct pipe_resource *dst, uint64_t dst_offset,
struct pipe_resource *src, uint64_t src_offset,
unsigned size);
void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
struct pipe_resource *dst, uint64_t offset,
- unsigned size, uint32_t clear_value);
+ unsigned size, uint32_t clear_value,
+ enum r600_coherency coher);
void r600_dma_copy_buffer(struct r600_context *rctx,
struct pipe_resource *dst,
struct pipe_resource *src,
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index 24f599ea6e9..75d64c13081 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -96,8 +96,13 @@
#define COPY_DW_DST_IS_MEM (1 << 1)
#define PKT3_WAIT_REG_MEM 0x3C
#define WAIT_REG_MEM_EQUAL 3
+#define WAIT_REG_MEM_GEQUAL 5
+#define WAIT_REG_MEM_MEMORY (1 << 4)
+#define WAIT_REG_MEM_PFP (1 << 8)
#define PKT3_MEM_WRITE 0x3D
+#define MEM_WRITE_32_BITS (1 << 18)
#define PKT3_INDIRECT_BUFFER 0x32
+#define PKT3_PFP_SYNC_ME 0x42 /* EG+ */
#define PKT3_SURFACE_SYNC 0x43
#define PKT3_ME_INITIALIZE 0x44
#define PKT3_COND_WRITE 0x45
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 1b466aae574..a6d5c05ec11 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -143,7 +143,7 @@
#define COPY_DATA_DST_SEL(x) (((unsigned)(x) & 0xf) << 8)
#define COPY_DATA_COUNT_SEL (1 << 16)
#define COPY_DATA_WR_CONFIRM (1 << 20)
-#define PKT3_PFP_SYNC_ME 0x42 /* r7xx+ */
+#define PKT3_PFP_SYNC_ME 0x42
#define PKT3_SURFACE_SYNC 0x43 /* deprecated on CIK, use ACQUIRE_MEM */
#define PKT3_ME_INITIALIZE 0x44 /* not on CIK */
#define PKT3_COND_WRITE 0x45