aboutsummaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/radeonsi/si_cp_dma.c
diff options
context:
space:
mode:
authorMarek Olšák <[email protected]>2018-08-24 00:29:04 -0400
committerMarek Olšák <[email protected]>2018-10-16 17:23:25 -0400
commit9b331e462e5021d994859756d46cd2519d9c9c6e (patch)
tree7f51689c2bafc268c9993227c43bf323c522dd16 /src/gallium/drivers/radeonsi/si_cp_dma.c
parent5030adcbe05f2ae97826f645f43c612f774411e8 (diff)
radeonsi: use compute shaders for clear_buffer & copy_buffer
Fast color clears should be much faster. Also, fast color clears on evicted buffers should be 200x faster on GFX8 and older.
Diffstat (limited to 'src/gallium/drivers/radeonsi/si_cp_dma.c')
-rw-r--r--src/gallium/drivers/radeonsi/si_cp_dma.c180
1 file changed, 5 insertions, 175 deletions
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index c1ecd5fb3e8..839b31b7fdf 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -25,12 +25,6 @@
#include "si_pipe.h"
#include "sid.h"
-/* Recommended maximum sizes for optimal performance.
- * Fall back to compute or SDMA if the size is greater.
- */
-#define CP_DMA_COPY_PERF_THRESHOLD (64 * 1024) /* copied from Vulkan */
-#define CP_DMA_CLEAR_PERF_THRESHOLD (32 * 1024) /* guess (clear is much slower) */
-
/* Set this if you want the ME to wait until CP DMA is done.
* It should be set on the last CP DMA packet. */
#define CP_DMA_SYNC (1 << 0)
@@ -155,35 +149,6 @@ void si_cp_dma_wait_for_idle(struct si_context *sctx)
si_emit_cp_dma(sctx, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
}
-static unsigned get_flush_flags(struct si_context *sctx, enum si_coherency coher,
- enum si_cache_policy cache_policy)
-{
- switch (coher) {
- default:
- case SI_COHERENCY_NONE:
- return 0;
- case SI_COHERENCY_SHADER:
- assert(sctx->chip_class != SI || cache_policy == L2_BYPASS);
- return SI_CONTEXT_INV_SMEM_L1 |
- SI_CONTEXT_INV_VMEM_L1 |
- (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
- case SI_COHERENCY_CB_META:
- assert(sctx->chip_class >= GFX9 ? cache_policy != L2_BYPASS :
- cache_policy == L2_BYPASS);
- return SI_CONTEXT_FLUSH_AND_INV_CB;
- }
-}
-
-static enum si_cache_policy get_cache_policy(struct si_context *sctx,
- enum si_coherency coher)
-{
- if ((sctx->chip_class >= GFX9 && coher == SI_COHERENCY_CB_META) ||
- (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
- return L2_LRU;
-
- return L2_BYPASS;
-}
-
static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
struct pipe_resource *src, unsigned byte_count,
uint64_t remaining_size, unsigned user_flags,
@@ -262,7 +227,7 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
/* Flush the caches. */
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
SI_CONTEXT_CS_PARTIAL_FLUSH |
- get_flush_flags(sctx, coher, cache_policy);
+ si_get_flush_flags(sctx, coher, cache_policy);
while (size) {
unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
@@ -286,122 +251,6 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
sctx->num_cp_dma_calls++;
}
-/* dst == NULL means GDS. */
-void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
- uint64_t offset, uint64_t size, unsigned value,
- enum si_coherency coher)
-{
- struct radeon_winsys *ws = sctx->ws;
- struct r600_resource *rdst = r600_resource(dst);
- enum si_cache_policy cache_policy = get_cache_policy(sctx, coher);
- uint64_t dma_clear_size;
-
- if (!size)
- return;
-
- dma_clear_size = size & ~3ull;
-
- /* dma_clear_buffer can use clear_buffer on failure. Make sure that
- * doesn't happen. We don't want an infinite recursion: */
- if (sctx->dma_cs &&
- !(dst->flags & PIPE_RESOURCE_FLAG_SPARSE) &&
- (offset % 4 == 0) &&
- /* CP DMA is very slow. Always use SDMA for big clears. This
- * alone improves DeusEx:MD performance by 70%. */
- (size > CP_DMA_CLEAR_PERF_THRESHOLD ||
- /* Buffers not used by the GFX IB yet will be cleared by SDMA.
- * This happens to move most buffer clears to SDMA, including
- * DCC and CMASK clears, because pipe->clear clears them before
- * si_emit_framebuffer_state (in a draw call) adds them.
- * For example, DeusEx:MD has 21 buffer clears per frame and all
- * of them are moved to SDMA thanks to this. */
- !ws->cs_is_buffer_referenced(sctx->gfx_cs, rdst->buf,
- RADEON_USAGE_READWRITE))) {
- si_sdma_clear_buffer(sctx, dst, offset, dma_clear_size, value);
-
- offset += dma_clear_size;
- size -= dma_clear_size;
- } else if (dma_clear_size >= 4) {
- si_cp_dma_clear_buffer(sctx, dst, offset, dma_clear_size, value,
- coher, cache_policy);
-
- offset += dma_clear_size;
- size -= dma_clear_size;
- }
-
- if (size) {
- /* Handle non-dword alignment.
- *
- * This function is called for embedded texture metadata clears,
- * but those should always be properly aligned. */
- assert(dst);
- assert(dst->target == PIPE_BUFFER);
- assert(size < 4);
-
- pipe_buffer_write(&sctx->b, dst, offset, size, &value);
- }
-}
-
-static void si_pipe_clear_buffer(struct pipe_context *ctx,
- struct pipe_resource *dst,
- unsigned offset, unsigned size,
- const void *clear_value_ptr,
- int clear_value_size)
-{
- struct si_context *sctx = (struct si_context*)ctx;
- uint32_t dword_value;
- unsigned i;
-
- assert(offset % clear_value_size == 0);
- assert(size % clear_value_size == 0);
-
- if (clear_value_size > 4) {
- const uint32_t *u32 = clear_value_ptr;
- bool clear_dword_duplicated = true;
-
- /* See if we can lower large fills to dword fills. */
- for (i = 1; i < clear_value_size / 4; i++)
- if (u32[0] != u32[i]) {
- clear_dword_duplicated = false;
- break;
- }
-
- if (!clear_dword_duplicated) {
- /* Use transform feedback for 64-bit, 96-bit, and
- * 128-bit fills.
- */
- union pipe_color_union clear_value;
-
- memcpy(&clear_value, clear_value_ptr, clear_value_size);
- si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
- util_blitter_clear_buffer(sctx->blitter, dst, offset,
- size, clear_value_size / 4,
- &clear_value);
- si_blitter_end(sctx);
- return;
- }
- }
-
- /* Expand the clear value to a dword. */
- switch (clear_value_size) {
- case 1:
- dword_value = *(uint8_t*)clear_value_ptr;
- dword_value |= (dword_value << 8) |
- (dword_value << 16) |
- (dword_value << 24);
- break;
- case 2:
- dword_value = *(uint16_t*)clear_value_ptr;
- dword_value |= dword_value << 16;
- break;
- default:
- dword_value = *(uint32_t*)clear_value_ptr;
- }
-
- si_clear_buffer(sctx, dst, offset, size, dword_value,
- SI_COHERENCY_SHADER);
-}
-
/**
* Realign the CP DMA engine. This must be done after a copy with an unaligned
* size.
@@ -509,7 +358,7 @@ void si_cp_dma_copy_buffer(struct si_context *sctx,
if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
SI_CONTEXT_CS_PARTIAL_FLUSH |
- get_flush_flags(sctx, coher, cache_policy);
+ si_get_flush_flags(sctx, coher, cache_policy);
}
/* This is the main part doing the copying. Src is always aligned. */
@@ -549,26 +398,12 @@ void si_cp_dma_copy_buffer(struct si_context *sctx,
si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher,
cache_policy, &is_first);
}
-}
-
-void si_copy_buffer(struct si_context *sctx,
- struct pipe_resource *dst, struct pipe_resource *src,
- uint64_t dst_offset, uint64_t src_offset, unsigned size)
-{
- enum si_coherency coher = SI_COHERENCY_SHADER;
- enum si_cache_policy cache_policy = get_cache_policy(sctx, coher);
-
- if (!size)
- return;
-
- si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
- 0, coher, cache_policy);
- if (cache_policy != L2_BYPASS)
+ if (dst && cache_policy != L2_BYPASS)
r600_resource(dst)->TC_L2_dirty = true;
- /* If it's not a prefetch... */
- if (dst_offset != src_offset)
+ /* If it's not a prefetch or GDS copy... */
+ if (dst && src && (dst != src || dst_offset != src_offset))
sctx->num_cp_dma_calls++;
}
@@ -744,8 +579,3 @@ void si_test_gds(struct si_context *sctx)
pipe_resource_reference(&dst, NULL);
exit(0);
}
-
-void si_init_cp_dma_functions(struct si_context *sctx)
-{
- sctx->b.clear_buffer = si_pipe_clear_buffer;
-}