diff options
author | Marek Olšák <[email protected]> | 2017-06-16 14:25:34 +0200 |
---|---|---|
committer | Marek Olšák <[email protected]> | 2018-04-16 16:58:10 -0400 |
commit | 1b3199d14d5ca96cb794dca4213bf5c17d1e264b (patch) | |
tree | 57bd896decbdb21206473fbe4f9e7d975ab00731 | |
parent | d19b488339be4b908dbd3b1636a677d11a2232f0 (diff) |
radeonsi: implement mechanism for IBs without partial flushes at the end (v6)
(This patch doesn't enable the behavior. It will be enabled in a later
commit.)
Draw calls from multiple IBs can be executed in parallel.
v2: do emit partial flushes on SI
v3: invalidate all shader caches at the beginning of IBs
v4: don't call si_emit_cache_flush in si_flush_gfx_cs if not needed,
only do this for flushes invoked internally
v5: empty IBs should wait for idle if the flush requires it
v6: split the commit
If we artificially limit the number of draw calls per IB to 5, we'll get
a lot more IBs, leading to a lot more partial flushes. Let's see how
the removal of partial flushes changes GPU utilization in that scenario:
With partial flushes (time busy):
CP: 99%
SPI: 86%
CB: 73:
Without partial flushes (time busy):
CP: 99%
SPI: 93%
CB: 81%
Tested-by: Benedikt Schemmer <[email protected]>
Reviewed-by: Nicolai Hähnle <[email protected]>
-rw-r--r-- | src/gallium/drivers/radeon/radeon_winsys.h | 7 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_gfx_cs.c | 55 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.h | 1 |
3 files changed, 47 insertions, 16 deletions
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index 157b2e40550..fae4fb7a95d 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -28,6 +28,13 @@ /* The public winsys interface header for the radeon driver. */ +/* Whether the next IB can start immediately and not wait for draws and + * dispatches from the current IB to finish. */ +#define RADEON_FLUSH_START_NEXT_GFX_IB_NOW (1u << 31) + +#define RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW \ + (PIPE_FLUSH_ASYNC | RADEON_FLUSH_START_NEXT_GFX_IB_NOW) + #include "pipebuffer/pb_buffer.h" #include "amd/common/ac_gpu_info.h" diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index b1ed620b0c6..147433b69b6 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -69,11 +69,28 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, { struct radeon_winsys_cs *cs = ctx->gfx_cs; struct radeon_winsys *ws = ctx->ws; + unsigned wait_flags = 0; if (ctx->gfx_flush_in_progress) return; - if (!radeon_emitted(cs, ctx->initial_gfx_cs_size)) + if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1) { + /* DRM 3.1.0 doesn't flush TC for VI correctly. */ + wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH | + SI_CONTEXT_INV_GLOBAL_L2; + } else if (ctx->chip_class == SI) { + /* The kernel flushes L2 before shaders are finished. */ + wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH; + } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { + wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + /* Drop this flush if it's a no-op. */ + if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) && + (!wait_flags || !ctx->gfx_last_ib_is_busy)) return; if (si_check_device_reset(ctx)) @@ -103,20 +120,17 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, ctx->streamout.suspended = true; } - ctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - SI_CONTEXT_PS_PARTIAL_FLUSH; - - /* DRM 3.1.0 doesn't flush TC for VI correctly. */ - if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1) - ctx->flags |= SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_INV_VMEM_L1; - /* Make sure CP DMA is idle at the end of IBs after L2 prefetches * because the kernel doesn't wait for it. */ if (ctx->chip_class >= CIK) si_cp_dma_wait_for_idle(ctx); - si_emit_cache_flush(ctx); + /* Wait for draw calls to finish if needed. */ + if (wait_flags) { + ctx->flags |= wait_flags; + si_emit_cache_flush(ctx); + } + ctx->gfx_last_ib_is_busy = wait_flags == 0; if (ctx->current_saved_cs) { si_trace_emit(ctx); @@ -189,12 +203,21 @@ void si_begin_new_gfx_cs(struct si_context *ctx) if (ctx->is_debug) si_begin_gfx_cs_debug(ctx); - /* Flush read caches at the beginning of CS not flushed by the kernel. */ - if (ctx->chip_class >= CIK) - ctx->flags |= SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_INV_ICACHE; - - ctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; + /* Always invalidate caches at the beginning of IBs, because external + * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our + * buffers. + * + * Note that the cache flush done by the kernel at the end of GFX IBs + * isn't useful here, because that flush can finish after the following + * IB starts drawing. + * + * TODO: Do we also need to invalidate CB & DB caches? + */ + ctx->flags |= SI_CONTEXT_INV_ICACHE | + SI_CONTEXT_INV_SMEM_L1 | + SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | + SI_CONTEXT_START_PIPELINE_STATS; /* set all valid group as dirty so they get reemited on * next draw command diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 3a2f7ca11d1..125b3a72bfb 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -559,6 +559,7 @@ struct si_context { uint16_t prefetch_L2_mask; bool gfx_flush_in_progress:1; + bool gfx_last_ib_is_busy:1; bool compute_is_busy:1; unsigned num_gfx_cs_flushes; |