From 2a7b57ad4269e7267922ff695a57b9a2ce413a06 Mon Sep 17 00:00:00 2001
From: Marek Olšák
Date: Thu, 24 Apr 2014 03:03:43 +0200
Subject: radeonsi: implement ARB_draw_indirect
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer
---
 docs/GL3.txt                                 |  4 +-
 docs/relnotes/10.3.html                      |  3 +-
 src/gallium/drivers/radeonsi/si_commands.c   | 53 ++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_pipe.c       |  1 +
 src/gallium/drivers/radeonsi/si_state.h      |  7 +++
 src/gallium/drivers/radeonsi/si_state_draw.c | 73 ++++++++++++++++++++++------
 src/gallium/drivers/radeonsi/sid.h           | 11 ++++-
 7 files changed, 132 insertions(+), 20 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 924f47990a6..0f37da4102d 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -98,7 +98,7 @@ GL 4.0:
 
   GLSL 4.0                                             not started
   GL_ARB_draw_buffers_blend                            DONE (i965, nvc0, r600, radeonsi, softpipe)
-  GL_ARB_draw_indirect                                 DONE (i965, nvc0)
+  GL_ARB_draw_indirect                                 DONE (i965, nvc0, radeonsi, softpipe, llvmpipe)
   GL_ARB_gpu_shader5                                   started
   - 'precise' qualifier                                DONE
   - Dynamically uniform sampler array indices          started (Chris)
@@ -165,7 +165,7 @@ GL 4.3:
   GL_ARB_framebuffer_no_attachments                    not started
   GL_ARB_internalformat_query2                         not started
   GL_ARB_invalidate_subdata                            DONE (all drivers)
-  GL_ARB_multi_draw_indirect                           DONE (i965, nvc0)
+  GL_ARB_multi_draw_indirect                           DONE (i965, nvc0, radeonsi, softpipe, llvmpipe)
   GL_ARB_program_interface_query                       not started
   GL_ARB_robust_buffer_access_behavior                 not started
   GL_ARB_shader_image_size                             not started
diff --git a/docs/relnotes/10.3.html b/docs/relnotes/10.3.html
index b757e5f0175..90247c09c05 100644
--- a/docs/relnotes/10.3.html
+++ b/docs/relnotes/10.3.html
@@ -45,7 +45,9 @@ Note: some of the new features are only available with certain drivers.
diff --git a/src/gallium/drivers/radeonsi/si_commands.c b/src/gallium/drivers/radeonsi/si_commands.c
index 5ddc40e1ec0..2efdedaf499 100644
--- a/src/gallium/drivers/radeonsi/si_commands.c
+++ b/src/gallium/drivers/radeonsi/si_commands.c
@@ -57,6 +57,59 @@ void si_cmd_draw_index_auto(struct si_pm4_state *pm4, uint32_t count,
 	si_pm4_cmd_end(pm4, predicate);
 }
 
+void si_cmd_draw_indirect(struct si_pm4_state *pm4, uint64_t indirect_va,
+			  uint32_t indirect_offset, uint32_t base_vtx_loc,
+			  uint32_t start_inst_loc, bool predicate)
+{
+	assert(indirect_va % 8 == 0);
+	assert(indirect_offset % 4 == 0);
+
+	si_pm4_cmd_begin(pm4, PKT3_SET_BASE);
+	si_pm4_cmd_add(pm4, 1);
+	si_pm4_cmd_add(pm4, indirect_va);
+	si_pm4_cmd_add(pm4, indirect_va >> 32);
+	si_pm4_cmd_end(pm4, predicate);
+
+	si_pm4_cmd_begin(pm4, PKT3_DRAW_INDIRECT);
+	si_pm4_cmd_add(pm4, indirect_offset);
+	si_pm4_cmd_add(pm4, (base_vtx_loc - SI_SH_REG_OFFSET) >> 2);
+	si_pm4_cmd_add(pm4, (start_inst_loc - SI_SH_REG_OFFSET) >> 2);
+	si_pm4_cmd_add(pm4, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
+	si_pm4_cmd_end(pm4, predicate);
+}
+
+void si_cmd_draw_index_indirect(struct si_pm4_state *pm4, uint64_t indirect_va,
+				uint64_t index_va, uint32_t index_max_size,
+				uint32_t indirect_offset, uint32_t base_vtx_loc,
+				uint32_t start_inst_loc, bool predicate)
+{
+	assert(indirect_va % 8 == 0);
+	assert(index_va % 2 == 0);
+	assert(indirect_offset % 4 == 0);
+
+	si_pm4_cmd_begin(pm4, PKT3_SET_BASE);
+	si_pm4_cmd_add(pm4, 1);
+	si_pm4_cmd_add(pm4, indirect_va);
+	si_pm4_cmd_add(pm4, indirect_va >> 32);
+	si_pm4_cmd_end(pm4, predicate);
+
+	si_pm4_cmd_begin(pm4, PKT3_INDEX_BASE);
+	si_pm4_cmd_add(pm4, index_va);
+	si_pm4_cmd_add(pm4, index_va >> 32);
+	si_pm4_cmd_end(pm4, predicate);
+
+	si_pm4_cmd_begin(pm4, PKT3_INDEX_BUFFER_SIZE);
+	si_pm4_cmd_add(pm4, index_max_size);
+	si_pm4_cmd_end(pm4, predicate);
+
+	si_pm4_cmd_begin(pm4, PKT3_DRAW_INDEX_INDIRECT);
+	si_pm4_cmd_add(pm4, indirect_offset);
+	si_pm4_cmd_add(pm4, (base_vtx_loc - SI_SH_REG_OFFSET) >> 2);
+	si_pm4_cmd_add(pm4, (start_inst_loc - SI_SH_REG_OFFSET) >> 2);
+	si_pm4_cmd_add(pm4, V_0287F0_DI_SRC_SEL_DMA);
+	si_pm4_cmd_end(pm4, predicate);
+}
+
 void si_cmd_surface_sync(struct si_pm4_state *pm4, uint32_t cp_coher_cntl)
 {
 	if (pm4->chip_class >= CIK) {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index cef6e5073e2..4f192687289 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -214,6 +214,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
 	case PIPE_CAP_CUBE_MAP_ARRAY:
 	case PIPE_CAP_SAMPLE_SHADING:
+	case PIPE_CAP_DRAW_INDIRECT:
 		return 1;
 
 	case PIPE_CAP_TEXTURE_MULTISAMPLE:
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index fc3b1b97ea1..ae42e6673b5 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -265,6 +265,13 @@ void si_cmd_draw_index_2(struct si_pm4_state *pm4, uint32_t max_size,
 			 uint32_t initiator, bool predicate);
 void si_cmd_draw_index_auto(struct si_pm4_state *pm4, uint32_t count,
 			    uint32_t initiator, bool predicate);
+void si_cmd_draw_indirect(struct si_pm4_state *pm4, uint64_t indirect_va,
+			  uint32_t indirect_offset, uint32_t base_vtx_loc,
+			  uint32_t start_inst_loc, bool predicate);
+void si_cmd_draw_index_indirect(struct si_pm4_state *pm4, uint64_t indirect_va,
+				uint64_t index_va, uint32_t index_max_size,
+				uint32_t indirect_offset, uint32_t base_vtx_loc,
+				uint32_t start_inst_loc, bool predicate);
 void si_cmd_surface_sync(struct si_pm4_state *pm4, uint32_t cp_coher_cntl);
 
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index e2b29c349be..bac18464a1d 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -783,15 +783,18 @@ static void si_state_draw(struct si_context *sctx,
 	}
 	si_pm4_cmd_end(pm4, sctx->b.predicate_drawing);
 
-	si_pm4_cmd_begin(pm4, PKT3_NUM_INSTANCES);
-	si_pm4_cmd_add(pm4, info->instance_count);
-	si_pm4_cmd_end(pm4, sctx->b.predicate_drawing);
-
 	if (!info->indirect) {
+		si_pm4_cmd_begin(pm4, PKT3_NUM_INSTANCES);
+		si_pm4_cmd_add(pm4, info->instance_count);
+		si_pm4_cmd_end(pm4, sctx->b.predicate_drawing);
+
 		si_pm4_set_reg(pm4, sh_base_reg + SI_SGPR_BASE_VERTEX * 4,
 			       info->indexed ? info->index_bias : info->start);
 		si_pm4_set_reg(pm4, sh_base_reg + SI_SGPR_START_INSTANCE * 4,
 			       info->start_instance);
+	} else {
+		si_pm4_add_bo(pm4, (struct r600_resource *)info->indirect,
+			      RADEON_USAGE_READ, RADEON_PRIO_MIN);
 	}
 
 	if (info->indexed) {
@@ -803,14 +806,35 @@ static void si_state_draw(struct si_context *sctx,
 
 		si_pm4_add_bo(pm4, (struct r600_resource *)ib->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_MIN);
-		va += info->start * ib->index_size;
-		si_cmd_draw_index_2(pm4, max_size, va, info->count,
-				    V_0287F0_DI_SRC_SEL_DMA,
-				    sctx->b.predicate_drawing);
+
+		if (info->indirect) {
+			uint64_t indirect_va = r600_resource_va(&sctx->screen->b.b,
+								info->indirect);
+			si_cmd_draw_index_indirect(pm4, indirect_va, va, max_size,
+						   info->indirect_offset,
+						   sh_base_reg + SI_SGPR_BASE_VERTEX * 4,
+						   sh_base_reg + SI_SGPR_START_INSTANCE * 4,
+						   sctx->b.predicate_drawing);
+		} else {
+			va += info->start * ib->index_size;
+			si_cmd_draw_index_2(pm4, max_size, va, info->count,
+					    V_0287F0_DI_SRC_SEL_DMA,
+					    sctx->b.predicate_drawing);
+		}
 	} else {
-		uint32_t initiator = V_0287F0_DI_SRC_SEL_AUTO_INDEX;
-		initiator |= S_0287F0_USE_OPAQUE(!!info->count_from_stream_output);
-		si_cmd_draw_index_auto(pm4, info->count, initiator, sctx->b.predicate_drawing);
+		if (info->indirect) {
+			uint64_t indirect_va = r600_resource_va(&sctx->screen->b.b,
+								info->indirect);
+			si_cmd_draw_indirect(pm4, indirect_va, info->indirect_offset,
+					     sh_base_reg + SI_SGPR_BASE_VERTEX * 4,
+					     sh_base_reg + SI_SGPR_START_INSTANCE * 4,
+					     sctx->b.predicate_drawing);
+		} else {
+			si_cmd_draw_index_auto(pm4, info->count,
+					       V_0287F0_DI_SRC_SEL_AUTO_INDEX |
+					       S_0287F0_USE_OPAQUE(!!info->count_from_stream_output),
+					       sctx->b.predicate_drawing);
+		}
 	}
 
 	si_pm4_set_state(sctx, draw, pm4);
@@ -898,13 +922,32 @@ void si_emit_cache_flush(struct r600_common_context *sctx, struct r600_atom *ato
 
 const struct r600_atom si_atom_cache_flush = { si_emit_cache_flush, 13 }; /* number of CS dwords */
 
+static void si_get_draw_start_count(struct si_context *sctx,
+				    const struct pipe_draw_info *info,
+				    unsigned *start, unsigned *count)
+{
+	if (info->indirect) {
+		struct r600_resource *indirect =
+			(struct r600_resource*)info->indirect;
+		int *data = r600_buffer_map_sync_with_rings(&sctx->b,
+					indirect, PIPE_TRANSFER_READ);
+		data += info->indirect_offset/sizeof(int);
+		*start = data[2];
+		*count = data[0];
+	} else {
+		*start = info->start;
+		*count = info->count;
+	}
+}
+
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct pipe_index_buffer ib = {};
 	uint32_t i;
 
-	if (!info->count && (info->indexed || !info->count_from_stream_output))
+	if (!info->count && !info->indirect &&
+	    (info->indexed || !info->count_from_stream_output))
 		return;
 
 	if (!sctx->ps_shader || !sctx->vs_shader)
@@ -926,8 +969,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 			unsigned out_offset, start, count, start_offset;
 			void *ptr;
 
-			start = info->start;
-			count = info->count;
+			si_get_draw_start_count(sctx, info, &start, &count);
 			start_offset = start * ib.index_size;
 
 			u_upload_alloc(sctx->b.uploader, start_offset, count * 2,
@@ -946,8 +988,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		} else if (ib.user_buffer && !ib.buffer) {
 			unsigned start, count, start_offset;
 
-			start = info->start;
-			count = info->count;
+			si_get_draw_start_count(sctx, info, &start, &count);
 			start_offset = start * ib.index_size;
 
 			u_upload_data(sctx->b.uploader, start_offset, count * ib.index_size,
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index e3f788ef6a7..3241725ca99 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -70,18 +70,27 @@
 #define R600_TEXEL_PITCH_ALIGNMENT_MASK        0x7
 
 #define PKT3_NOP                               0x10
+#define PKT3_SET_BASE                          0x11
+#define PKT3_CLEAR_STATE                       0x12
+#define PKT3_INDEX_BUFFER_SIZE                 0x13
 #define PKT3_DISPATCH_DIRECT                   0x15
 #define PKT3_DISPATCH_INDIRECT                 0x16
 #define PKT3_OCCLUSION_QUERY                   0x1F /* new for CIK */
 #define PKT3_SET_PREDICATION                   0x20
 #define PKT3_COND_EXEC                         0x22
 #define PKT3_PRED_EXEC                         0x23
+#define PKT3_DRAW_INDIRECT                     0x24
+#define PKT3_DRAW_INDEX_INDIRECT               0x25
+#define PKT3_INDEX_BASE                        0x26
 #define PKT3_DRAW_INDEX_2                      0x27
 #define PKT3_CONTEXT_CONTROL                   0x28
 #define PKT3_INDEX_TYPE                        0x2A
+#define PKT3_DRAW_INDIRECT_MULTI               0x2C
 #define PKT3_DRAW_INDEX_AUTO                   0x2D
 #define PKT3_DRAW_INDEX_IMMD                   0x2E /* not on CIK */
 #define PKT3_NUM_INSTANCES                     0x2F
+#define PKT3_DRAW_INDEX_MULTI_AUTO             0x30
+#define PKT3_INDIRECT_BUFFER                   0x32
 #define PKT3_STRMOUT_BUFFER_UPDATE             0x34
 #define PKT3_DRAW_INDEX_OFFSET_2               0x35
 #define PKT3_DRAW_PREAMBLE                     0x36 /* new on CIK, required on GFX7.2 and later */
@@ -99,12 +108,12 @@
 #define PKT3_WRITE_DATA_ENGINE_SEL_ME          0
 #define PKT3_WRITE_DATA_ENGINE_SEL_PFP         1
 #define PKT3_WRITE_DATA_ENGINE_SEL_CE          2
+#define PKT3_DRAW_INDEX_INDIRECT_MULTI         0x38
 #define PKT3_MEM_SEMAPHORE                     0x39
 #define PKT3_MPEG_INDEX                        0x3A /* not on CIK */
 #define PKT3_WAIT_REG_MEM                      0x3C
 #define WAIT_REG_MEM_EQUAL                     3
 #define PKT3_MEM_WRITE                         0x3D /* not on CIK */
-#define PKT3_INDIRECT_BUFFER                   0x32
 #define PKT3_COPY_DATA                         0x40
 #define COPY_DATA_SRC_SEL(x)                   ((x) & 0xf)
 #define COPY_DATA_REG                          0
-- 
cgit v1.2.3