diff options
-rw-r--r-- | src/gallium/drivers/radeonsi/si_cp_dma.c | 12 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.h | 2 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_draw.c | 37 |
3 files changed, 50 insertions, 1 deletions
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 934fd8df2db..f06b8ddb79b 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -386,6 +386,18 @@ void si_copy_buffer(struct si_context *sctx, sctx->b.num_cp_dma_calls++; } +void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, + uint64_t offset, unsigned size) +{ + assert(sctx->b.chip_class >= CIK); + + si_copy_buffer(sctx, buf, buf, offset, offset, size, + SI_CPDMA_SKIP_CHECK_CS_SPACE | + SI_CPDMA_SKIP_SYNC_AFTER | + SI_CPDMA_SKIP_SYNC_BEFORE | + SI_CPDMA_SKIP_GFX_SYNC); +} + void si_init_cp_dma_functions(struct si_context *sctx) { sctx->b.clear_buffer = si_clear_buffer; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index dc37c8d28f3..c0a4636cc63 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -381,6 +381,8 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned user_flags); +void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, + uint64_t offset, unsigned size); void si_init_cp_dma_functions(struct si_context *sctx); /* si_debug.c */ diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index b3f664eff2f..7b756025f81 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -937,6 +937,17 @@ void si_ce_post_draw_synchronization(struct si_context *sctx) } } +static void cik_prefetch_shader_async(struct si_context *sctx, + struct si_pm4_state *state) +{ + if (state) { + struct pipe_resource *bo = &state->bo[0]->b.b; + assert(state->nbo == 1); + + cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0); + } +} + void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) { struct si_context *sctx = (struct si_context *)ctx; @@ -1114,10 +1125,34 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) if (!si_upload_vertex_buffer_descriptors(sctx)) return; - /* Flushed caches prior to emitting states. */ + /* Flushed caches prior to prefetching shaders. */ if (sctx->b.flags) si_emit_cache_flush(sctx); + /* Prefetch shaders and VBO descriptors to TC L2. */ + if (sctx->b.chip_class >= CIK) { + if (si_pm4_state_changed(sctx, ls)) + cik_prefetch_shader_async(sctx, sctx->queued.named.ls); + if (si_pm4_state_changed(sctx, hs)) + cik_prefetch_shader_async(sctx, sctx->queued.named.hs); + if (si_pm4_state_changed(sctx, es)) + cik_prefetch_shader_async(sctx, sctx->queued.named.es); + if (si_pm4_state_changed(sctx, gs)) + cik_prefetch_shader_async(sctx, sctx->queued.named.gs); + if (si_pm4_state_changed(sctx, vs)) + cik_prefetch_shader_async(sctx, sctx->queued.named.vs); + + /* Vertex buffer descriptors are uploaded uncached, so prefetch + * them right after the VS binary. */ + if (sctx->vertex_buffers.pointer_dirty) { + cik_prefetch_TC_L2_async(sctx, &sctx->vertex_buffers.buffer->b.b, + sctx->vertex_buffers.buffer_offset, + sctx->vertex_elements->count * 16); + } + if (si_pm4_state_changed(sctx, ps)) + cik_prefetch_shader_async(sctx, sctx->queued.named.ps); + } + /* Emit states. */ mask = sctx->dirty_atoms; while (mask) { |