diff options
author | Marek Olšák <[email protected]> | 2018-08-24 00:29:04 -0400 |
---|---|---|
committer | Marek Olšák <[email protected]> | 2018-10-16 17:23:25 -0400 |
commit | 9b331e462e5021d994859756d46cd2519d9c9c6e (patch) | |
tree | 7f51689c2bafc268c9993227c43bf323c522dd16 /src/gallium/drivers/radeonsi/si_cp_dma.c | |
parent | 5030adcbe05f2ae97826f645f43c612f774411e8 (diff) |
radeonsi: use compute shaders for clear_buffer & copy_buffer
Fast color clears should be much faster. Also, fast color clears on
evicted buffers should be 200x faster on GFX8 and older.
Diffstat (limited to 'src/gallium/drivers/radeonsi/si_cp_dma.c')
-rw-r--r-- | src/gallium/drivers/radeonsi/si_cp_dma.c | 180 |
1 files changed, 5 insertions, 175 deletions
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index c1ecd5fb3e8..839b31b7fdf 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -25,12 +25,6 @@ #include "si_pipe.h" #include "sid.h" -/* Recommended maximum sizes for optimal performance. - * Fall back to compute or SDMA if the size is greater. - */ -#define CP_DMA_COPY_PERF_THRESHOLD (64 * 1024) /* copied from Vulkan */ -#define CP_DMA_CLEAR_PERF_THRESHOLD (32 * 1024) /* guess (clear is much slower) */ - /* Set this if you want the ME to wait until CP DMA is done. * It should be set on the last CP DMA packet. */ #define CP_DMA_SYNC (1 << 0) @@ -155,35 +149,6 @@ void si_cp_dma_wait_for_idle(struct si_context *sctx) si_emit_cp_dma(sctx, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS); } -static unsigned get_flush_flags(struct si_context *sctx, enum si_coherency coher, - enum si_cache_policy cache_policy) -{ - switch (coher) { - default: - case SI_COHERENCY_NONE: - return 0; - case SI_COHERENCY_SHADER: - assert(sctx->chip_class != SI || cache_policy == L2_BYPASS); - return SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_INV_VMEM_L1 | - (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0); - case SI_COHERENCY_CB_META: - assert(sctx->chip_class >= GFX9 ? cache_policy != L2_BYPASS : - cache_policy == L2_BYPASS); - return SI_CONTEXT_FLUSH_AND_INV_CB; - } -} - -static enum si_cache_policy get_cache_policy(struct si_context *sctx, - enum si_coherency coher) -{ - if ((sctx->chip_class >= GFX9 && coher == SI_COHERENCY_CB_META) || - (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER)) - return L2_LRU; - - return L2_BYPASS; -} - static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, unsigned byte_count, uint64_t remaining_size, unsigned user_flags, @@ -262,7 +227,7 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, /* Flush the caches. */ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | - get_flush_flags(sctx, coher, cache_policy); + si_get_flush_flags(sctx, coher, cache_policy); while (size) { unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx)); @@ -286,122 +251,6 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, sctx->num_cp_dma_calls++; } -/* dst == NULL means GDS. */ -void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, - uint64_t offset, uint64_t size, unsigned value, - enum si_coherency coher) -{ - struct radeon_winsys *ws = sctx->ws; - struct r600_resource *rdst = r600_resource(dst); - enum si_cache_policy cache_policy = get_cache_policy(sctx, coher); - uint64_t dma_clear_size; - - if (!size) - return; - - dma_clear_size = size & ~3ull; - - /* dma_clear_buffer can use clear_buffer on failure. Make sure that - * doesn't happen. We don't want an infinite recursion: */ - if (sctx->dma_cs && - !(dst->flags & PIPE_RESOURCE_FLAG_SPARSE) && - (offset % 4 == 0) && - /* CP DMA is very slow. Always use SDMA for big clears. This - * alone improves DeusEx:MD performance by 70%. */ - (size > CP_DMA_CLEAR_PERF_THRESHOLD || - /* Buffers not used by the GFX IB yet will be cleared by SDMA. - * This happens to move most buffer clears to SDMA, including - * DCC and CMASK clears, because pipe->clear clears them before - * si_emit_framebuffer_state (in a draw call) adds them. - * For example, DeusEx:MD has 21 buffer clears per frame and all - * of them are moved to SDMA thanks to this. */ - !ws->cs_is_buffer_referenced(sctx->gfx_cs, rdst->buf, - RADEON_USAGE_READWRITE))) { - si_sdma_clear_buffer(sctx, dst, offset, dma_clear_size, value); - - offset += dma_clear_size; - size -= dma_clear_size; - } else if (dma_clear_size >= 4) { - si_cp_dma_clear_buffer(sctx, dst, offset, dma_clear_size, value, - coher, cache_policy); - - offset += dma_clear_size; - size -= dma_clear_size; - } - - if (size) { - /* Handle non-dword alignment. - * - * This function is called for embedded texture metadata clears, - * but those should always be properly aligned. */ - assert(dst); - assert(dst->target == PIPE_BUFFER); - assert(size < 4); - - pipe_buffer_write(&sctx->b, dst, offset, size, &value); - } -} - -static void si_pipe_clear_buffer(struct pipe_context *ctx, - struct pipe_resource *dst, - unsigned offset, unsigned size, - const void *clear_value_ptr, - int clear_value_size) -{ - struct si_context *sctx = (struct si_context*)ctx; - uint32_t dword_value; - unsigned i; - - assert(offset % clear_value_size == 0); - assert(size % clear_value_size == 0); - - if (clear_value_size > 4) { - const uint32_t *u32 = clear_value_ptr; - bool clear_dword_duplicated = true; - - /* See if we can lower large fills to dword fills. */ - for (i = 1; i < clear_value_size / 4; i++) - if (u32[0] != u32[i]) { - clear_dword_duplicated = false; - break; - } - - if (!clear_dword_duplicated) { - /* Use transform feedback for 64-bit, 96-bit, and - * 128-bit fills. - */ - union pipe_color_union clear_value; - - memcpy(&clear_value, clear_value_ptr, clear_value_size); - si_blitter_begin(sctx, SI_DISABLE_RENDER_COND); - util_blitter_clear_buffer(sctx->blitter, dst, offset, - size, clear_value_size / 4, - &clear_value); - si_blitter_end(sctx); - return; - } - } - - /* Expand the clear value to a dword. */ - switch (clear_value_size) { - case 1: - dword_value = *(uint8_t*)clear_value_ptr; - dword_value |= (dword_value << 8) | - (dword_value << 16) | - (dword_value << 24); - break; - case 2: - dword_value = *(uint16_t*)clear_value_ptr; - dword_value |= dword_value << 16; - break; - default: - dword_value = *(uint32_t*)clear_value_ptr; - } - - si_clear_buffer(sctx, dst, offset, size, dword_value, - SI_COHERENCY_SHADER); -} - /** * Realign the CP DMA engine. This must be done after a copy with an unaligned * size. @@ -509,7 +358,7 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) { sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | - get_flush_flags(sctx, coher, cache_policy); + si_get_flush_flags(sctx, coher, cache_policy); } /* This is the main part doing the copying. Src is always aligned. */ @@ -549,26 +398,12 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher, cache_policy, &is_first); } -} - -void si_copy_buffer(struct si_context *sctx, - struct pipe_resource *dst, struct pipe_resource *src, - uint64_t dst_offset, uint64_t src_offset, unsigned size) -{ - enum si_coherency coher = SI_COHERENCY_SHADER; - enum si_cache_policy cache_policy = get_cache_policy(sctx, coher); - - if (!size) - return; - - si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, - 0, coher, cache_policy); - if (cache_policy != L2_BYPASS) + if (dst && cache_policy != L2_BYPASS) r600_resource(dst)->TC_L2_dirty = true; - /* If it's not a prefetch... */ - if (dst_offset != src_offset) + /* If it's not a prefetch or GDS copy... */ + if (dst && src && (dst != src || dst_offset != src_offset)) sctx->num_cp_dma_calls++; } @@ -744,8 +579,3 @@ void si_test_gds(struct si_context *sctx) pipe_resource_reference(&dst, NULL); exit(0); } - -void si_init_cp_dma_functions(struct si_context *sctx) -{ - sctx->b.clear_buffer = si_pipe_clear_buffer; -} |