diff options
-rw-r--r-- | src/gallium/drivers/radeonsi/si_descriptors.c | 23 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.c | 72 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.h | 2 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state.h | 3 |
4 files changed, 87 insertions, 13 deletions
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 0c1fca87181..da6efa83947 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -126,6 +126,7 @@ static void si_init_descriptors(struct si_descriptors *desc, desc->element_dw_size = element_dw_size; desc->num_elements = num_elements; desc->shader_userdata_offset = shader_userdata_index * 4; + desc->slot_index_to_bind_directly = -1; } static void si_release_descriptors(struct si_descriptors *desc) @@ -148,6 +149,20 @@ static bool si_upload_descriptors(struct si_context *sctx, if (!upload_size) return true; + /* If there is just one active descriptor, bind it directly. */ + if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly && + desc->num_active_slots == 1) { + uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly * + desc->element_dw_size]; + + /* The buffer is already in the buffer list. */ + r600_resource_reference(&desc->buffer, NULL); + desc->gpu_list = NULL; + desc->gpu_address = si_desc_extract_buffer_address(descriptor); + si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom); + return true; + } + uint32_t *ptr; int buffer_offset; u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size, @@ -2531,14 +2546,15 @@ void si_init_all_descriptors(struct si_context *sctx) bool gfx9_gs = false; unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS; unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS; + struct si_descriptors *desc; if (sctx->b.chip_class >= GFX9) { gfx9_tcs = i == PIPE_SHADER_TESS_CTRL; gfx9_gs = i == PIPE_SHADER_GEOMETRY; } - si_init_buffer_resources(&sctx->const_and_shader_buffers[i], - si_const_and_shader_buffer_descriptors(sctx, i), + desc = si_const_and_shader_buffer_descriptors(sctx, i); + si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc, num_buffer_slots, gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS : gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS : @@ -2547,8 +2563,9 @@ void si_init_all_descriptors(struct si_context *sctx) RADEON_USAGE_READ, RADEON_PRIO_SHADER_RW_BUFFER, RADEON_PRIO_CONST_BUFFER); + desc->slot_index_to_bind_directly = si_get_constbuf_slot(0); - struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i); + desc = si_sampler_and_image_descriptors(sctx, i); si_init_descriptors(desc, gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES : gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES : diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index f72e5af31fd..c3fe13deeaa 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1973,6 +1973,7 @@ static LLVMValueRef fetch_constant( unsigned swizzle) { struct si_shader_context *ctx = si_shader_context(bld_base); + struct si_shader_selector *sel = ctx->shader->selector; const struct tgsi_ind_register *ireg = ®->Indirect; unsigned buf, idx; @@ -1996,9 +1997,60 @@ static LLVMValueRef fetch_constant( return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi); } + idx = reg->Register.Index * 4 + swizzle; + if (reg->Register.Indirect) { + addr = si_get_indirect_index(ctx, ireg, 16, idx * 4); + } else { + addr = LLVMConstInt(ctx->i32, idx * 4, 0); + } + + /* Fast path when user data SGPRs point to constant buffer 0 directly. */ + if (sel->info.const_buffers_declared == 1 && + sel->info.shader_buffers_declared == 0) { + LLVMValueRef ptr = + LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); + + /* This enables use of s_load_dword and flat_load_dword for const buffer 0 + * loads, and up to x4 load opcode merging. However, it leads to horrible + * code reducing SIMD wave occupancy from 8 to 2 in many cases. + * + * Using s_buffer_load_dword (x1) seems to be the best option right now. + */ +#if 0 /* keep this codepath disabled */ + if (!reg->Register.Indirect) { + addr = LLVMBuildLShr(ctx->ac.builder, addr, LLVMConstInt(ctx->i32, 2, 0), ""); + LLVMValueRef result = ac_build_load_invariant(&ctx->ac, ptr, addr); + return bitcast(bld_base, type, result); + } +#endif + + /* Do the bounds checking with a descriptor, because + * doing computation and manual bounds checking of 64-bit + * addresses generates horrible VALU code with very high + * VGPR usage and very low SIMD occupancy. + */ + ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->i64, ""); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, ""); + + LLVMValueRef desc_elems[] = { + LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, ""), + LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, ""), + LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0), + LLVMConstInt(ctx->i32, + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0) + }; + LLVMValueRef desc = ac_build_gather_values(&ctx->ac, desc_elems, 4); + LLVMValueRef result = buffer_load_const(ctx, desc, addr); + return bitcast(bld_base, type, result); + } + assert(reg->Register.Dimension); buf = reg->Dimension.Index; - idx = reg->Register.Index * 4 + swizzle; if (reg->Dimension.Indirect) { LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); @@ -2012,12 +2064,6 @@ static LLVMValueRef fetch_constant( } else bufp = load_const_buffer_desc(ctx, buf); - if (reg->Register.Indirect) { - addr = si_get_indirect_index(ctx, ireg, 16, idx * 4); - } else { - addr = LLVMConstInt(ctx->i32, idx * 4, 0); - } - return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr)); } @@ -4255,10 +4301,18 @@ static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, struct si_function_info *fninfo, bool assign_params) { + LLVMTypeRef const_shader_buf_type; + + if (ctx->shader->selector->info.const_buffers_declared == 1 && + ctx->shader->selector->info.shader_buffers_declared == 0) + const_shader_buf_type = ctx->f32; + else + const_shader_buf_type = ctx->v4i32; + unsigned const_and_shader_buffers = add_arg(fninfo, ARG_SGPR, - si_const_array(ctx->v4i32, - SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS)); + si_const_array(const_shader_buf_type, 0)); + unsigned samplers_and_images = add_arg(fninfo, ARG_SGPR, si_const_array(ctx->v8i32, diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index ba80f550e49..ebe956e709e 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -161,7 +161,7 @@ enum { SI_SGPR_RW_BUFFERS_HI, SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES_HI, - SI_SGPR_CONST_AND_SHADER_BUFFERS, + SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */ SI_SGPR_CONST_AND_SHADER_BUFFERS_HI, SI_SGPR_SAMPLERS_AND_IMAGES, SI_SGPR_SAMPLERS_AND_IMAGES_HI, diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index eb1901b5a9e..7eb0aa3c925 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -279,6 +279,9 @@ struct si_descriptors { ubyte shader_userdata_offset; /* The size of one descriptor. */ ubyte element_dw_size; + /* If there is only one slot enabled, bind it directly instead of + * uploading descriptors. -1 if disabled. */ + signed char slot_index_to_bind_directly; }; struct si_buffer_resources { |