From 56017d81004072fd00979f3a987ff2e94e3b6315 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Tue, 27 Mar 2018 10:26:16 +1100 Subject: radeonsi: create load_const_buffer_desc_fast_path() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will be shared by the TGSI and NIR backends. For simplicity we leave the SI LLVM 5.0 and lower work around only in the TGSI backend. Reviewed-by: Marek Olšák --- src/gallium/drivers/radeonsi/si_shader.c | 88 ++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 39 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 00ebbb9b0f2..1661b54d056 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2319,6 +2319,49 @@ void si_tgsi_declare_compute_memory(struct si_shader_context *ctx, si_declare_compute_memory(ctx); } +static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx) +{ + LLVMValueRef ptr = + LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); + struct si_shader_selector *sel = ctx->shader->selector; + + /* Do the bounds checking with a descriptor, because + * doing computation and manual bounds checking of 64-bit + * addresses generates horrible VALU code with very high + * VGPR usage and very low SIMD occupancy. + */ + ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, ""); + + LLVMValueRef desc0, desc1; + if (HAVE_32BIT_POINTERS) { + desc0 = ptr; + desc1 = LLVMConstInt(ctx->i32, + S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); + } else { + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, ""); + desc0 = LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, ""); + desc1 = LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, ""); + /* Mask out all bits except BASE_ADDRESS_HI. */ + desc1 = LLVMBuildAnd(ctx->ac.builder, desc1, + LLVMConstInt(ctx->i32, ~C_008F04_BASE_ADDRESS_HI, 0), ""); + } + + LLVMValueRef desc_elems[] = { + desc0, + desc1, + LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0), + LLVMConstInt(ctx->i32, + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0) + }; + + return ac_build_gather_values(&ctx->ac, desc_elems, 4); +} + static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i) { LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn, @@ -2397,8 +2440,6 @@ static LLVMValueRef fetch_constant( /* Fast path when user data SGPRs point to constant buffer 0 directly. */ if (sel->info.const_buffers_declared == 1 && sel->info.shader_buffers_declared == 0) { - LLVMValueRef ptr = - LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); /* This enables use of s_load_dword and flat_load_dword for const buffer 0 * loads, and up to x4 load opcode merging. However, it leads to horrible @@ -2413,48 +2454,17 @@ static LLVMValueRef fetch_constant( * s_buffer_load_dword (that we have to prevent) is when we use use * a literal offset where we don't need bounds checking. */ - if (ctx->screen->info.chip_class == SI && - HAVE_LLVM < 0x0600 && - !reg->Register.Indirect) { + if (ctx->screen->info.chip_class == SI && HAVE_LLVM < 0x0600 && + !reg->Register.Indirect) { + LLVMValueRef ptr = + LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); + addr = LLVMBuildLShr(ctx->ac.builder, addr, LLVMConstInt(ctx->i32, 2, 0), ""); LLVMValueRef result = ac_build_load_invariant(&ctx->ac, ptr, addr); return bitcast(bld_base, type, result); } - /* Do the bounds checking with a descriptor, because - * doing computation and manual bounds checking of 64-bit - * addresses generates horrible VALU code with very high - * VGPR usage and very low SIMD occupancy. - */ - ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, ""); - - LLVMValueRef desc0, desc1; - if (HAVE_32BIT_POINTERS) { - desc0 = ptr; - desc1 = LLVMConstInt(ctx->i32, - S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); - } else { - ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, ""); - desc0 = LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, ""); - desc1 = LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, ""); - /* Mask out all bits except BASE_ADDRESS_HI. */ - desc1 = LLVMBuildAnd(ctx->ac.builder, desc1, - LLVMConstInt(ctx->i32, ~C_008F04_BASE_ADDRESS_HI, 0), ""); - } - - LLVMValueRef desc_elems[] = { - desc0, - desc1, - LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0), - LLVMConstInt(ctx->i32, - S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | - S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0) - }; - LLVMValueRef desc = ac_build_gather_values(&ctx->ac, desc_elems, 4); + LLVMValueRef desc = load_const_buffer_desc_fast_path(ctx); LLVMValueRef result = buffer_load_const(ctx, desc, addr); return bitcast(bld_base, type, result); } -- cgit v1.2.3