summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- src/gallium/drivers/radeonsi/si_descriptors.c 23
-rw-r--r-- src/gallium/drivers/radeonsi/si_shader.c 72
-rw-r--r-- src/gallium/drivers/radeonsi/si_shader.h 2
-rw-r--r-- src/gallium/drivers/radeonsi/si_state.h 3
4 files changed, 87 insertions, 13 deletions
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 0c1fca87181..da6efa83947 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -126,6 +126,7 @@ static void si_init_descriptors(struct si_descriptors *desc,
desc->element_dw_size = element_dw_size;
desc->num_elements = num_elements;
desc->shader_userdata_offset = shader_userdata_index * 4;
+ desc->slot_index_to_bind_directly = -1;
}
static void si_release_descriptors(struct si_descriptors *desc)
@@ -148,6 +149,20 @@ static bool si_upload_descriptors(struct si_context *sctx,
if (!upload_size)
return true;
+ /* If there is just one active descriptor, bind it directly. */
+ if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly &&
+ desc->num_active_slots == 1) {
+ uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly *
+ desc->element_dw_size];
+
+ /* The buffer is already in the buffer list. */
+ r600_resource_reference(&desc->buffer, NULL);
+ desc->gpu_list = NULL;
+ desc->gpu_address = si_desc_extract_buffer_address(descriptor);
+ si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
+ return true;
+ }
+
uint32_t *ptr;
int buffer_offset;
u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
@@ -2531,14 +2546,15 @@ void si_init_all_descriptors(struct si_context *sctx)
bool gfx9_gs = false;
unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS;
unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
+ struct si_descriptors *desc;
if (sctx->b.chip_class >= GFX9) {
gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
gfx9_gs = i == PIPE_SHADER_GEOMETRY;
}
- si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
- si_const_and_shader_buffer_descriptors(sctx, i),
+ desc = si_const_and_shader_buffer_descriptors(sctx, i);
+ si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc,
num_buffer_slots,
gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
@@ -2547,8 +2563,9 @@ void si_init_all_descriptors(struct si_context *sctx)
RADEON_USAGE_READ,
RADEON_PRIO_SHADER_RW_BUFFER,
RADEON_PRIO_CONST_BUFFER);
+ desc->slot_index_to_bind_directly = si_get_constbuf_slot(0);
- struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i);
+ desc = si_sampler_and_image_descriptors(sctx, i);
si_init_descriptors(desc,
gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index f72e5af31fd..c3fe13deeaa 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1973,6 +1973,7 @@ static LLVMValueRef fetch_constant(
unsigned swizzle)
{
struct si_shader_context *ctx = si_shader_context(bld_base);
+ struct si_shader_selector *sel = ctx->shader->selector;
const struct tgsi_ind_register *ireg = &reg->Indirect;
unsigned buf, idx;
@@ -1996,9 +1997,60 @@ static LLVMValueRef fetch_constant(
return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
}
+ idx = reg->Register.Index * 4 + swizzle;
+ if (reg->Register.Indirect) {
+ addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
+ } else {
+ addr = LLVMConstInt(ctx->i32, idx * 4, 0);
+ }
+
+ /* Fast path when user data SGPRs point to constant buffer 0 directly. */
+ if (sel->info.const_buffers_declared == 1 &&
+ sel->info.shader_buffers_declared == 0) {
+ LLVMValueRef ptr =
+ LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
+
+ /* This enables use of s_load_dword and flat_load_dword for const buffer 0
+ * loads, and up to x4 load opcode merging. However, it leads to horrible
+ * code reducing SIMD wave occupancy from 8 to 2 in many cases.
+ *
+ * Using s_buffer_load_dword (x1) seems to be the best option right now.
+ */
+#if 0 /* keep this codepath disabled */
+ if (!reg->Register.Indirect) {
+ addr = LLVMBuildLShr(ctx->ac.builder, addr, LLVMConstInt(ctx->i32, 2, 0), "");
+ LLVMValueRef result = ac_build_load_invariant(&ctx->ac, ptr, addr);
+ return bitcast(bld_base, type, result);
+ }
+#endif
+
+ /* Do the bounds checking with a descriptor, because
+ * doing computation and manual bounds checking of 64-bit
+ * addresses generates horrible VALU code with very high
+ * VGPR usage and very low SIMD occupancy.
+ */
+ ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->i64, "");
+ ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, "");
+
+ LLVMValueRef desc_elems[] = {
+ LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, ""),
+ LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, ""),
+ LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0),
+ LLVMConstInt(ctx->i32,
+ S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+ S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+ S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+ S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
+ };
+ LLVMValueRef desc = ac_build_gather_values(&ctx->ac, desc_elems, 4);
+ LLVMValueRef result = buffer_load_const(ctx, desc, addr);
+ return bitcast(bld_base, type, result);
+ }
+
assert(reg->Register.Dimension);
buf = reg->Dimension.Index;
- idx = reg->Register.Index * 4 + swizzle;
if (reg->Dimension.Indirect) {
LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
@@ -2012,12 +2064,6 @@ static LLVMValueRef fetch_constant(
} else
bufp = load_const_buffer_desc(ctx, buf);
- if (reg->Register.Indirect) {
- addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
- } else {
- addr = LLVMConstInt(ctx->i32, idx * 4, 0);
- }
-
return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr));
}
@@ -4255,10 +4301,18 @@ static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
struct si_function_info *fninfo,
bool assign_params)
{
+ LLVMTypeRef const_shader_buf_type;
+
+ if (ctx->shader->selector->info.const_buffers_declared == 1 &&
+ ctx->shader->selector->info.shader_buffers_declared == 0)
+ const_shader_buf_type = ctx->f32;
+ else
+ const_shader_buf_type = ctx->v4i32;
+
unsigned const_and_shader_buffers =
add_arg(fninfo, ARG_SGPR,
- si_const_array(ctx->v4i32,
- SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS));
+ si_const_array(const_shader_buf_type, 0));
+
unsigned samplers_and_images =
add_arg(fninfo, ARG_SGPR,
si_const_array(ctx->v8i32,
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index ba80f550e49..ebe956e709e 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -161,7 +161,7 @@ enum {
SI_SGPR_RW_BUFFERS_HI,
SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES_HI,
- SI_SGPR_CONST_AND_SHADER_BUFFERS,
+ SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
SI_SGPR_CONST_AND_SHADER_BUFFERS_HI,
SI_SGPR_SAMPLERS_AND_IMAGES,
SI_SGPR_SAMPLERS_AND_IMAGES_HI,
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index eb1901b5a9e..7eb0aa3c925 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -279,6 +279,9 @@ struct si_descriptors {
ubyte shader_userdata_offset;
/* The size of one descriptor. */
ubyte element_dw_size;
+ /* If there is only one slot enabled, bind it directly instead of
+ * uploading descriptors. -1 if disabled. */
+ signed char slot_index_to_bind_directly;
};
struct si_buffer_resources {