diff options
author | Marek Olšák <[email protected]> | 2018-09-22 22:02:32 -0400 |
---|---|---|
committer | Marek Olšák <[email protected]> | 2018-10-16 17:23:25 -0400 |
commit | 0b40fbc8796b8e204e7af45b3d39b67d20fb3da7 (patch) | |
tree | e53e6f95f9a4b86339d14c1ce2ea07b355d94d00 /src/gallium/drivers/radeonsi/si_shader.c | |
parent | bfc795670ec82af5767cf360806e32322664604e (diff) |
radeonsi: use faster integer division for instance divisors
We know the divisors when we upload them, so instead we can precompute
and upload division factors derived from each divisor.
This fast division consists of add, mul_hi, and two shifts,
and we have to load 4 dwords intead of 1.
This probably won't affect any apps.
Diffstat (limited to 'src/gallium/drivers/radeonsi/si_shader.c')
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.c | 52 |
1 files changed, 24 insertions, 28 deletions
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index acd4d34f899..19522cc97b1 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -428,20 +428,6 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) } } -static LLVMValueRef get_instance_index_for_fetch( - struct si_shader_context *ctx, - unsigned param_start_instance, LLVMValueRef divisor) -{ - LLVMValueRef result = ctx->abi.instance_id; - - /* The division must be done before START_INSTANCE is added. */ - if (divisor != ctx->i32_1) - result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, ""); - - return LLVMBuildAdd(ctx->ac.builder, result, - LLVMGetParam(ctx->main_fn, param_start_instance), ""); -} - /* Bitcast <4 x float> to <2 x double>, extract the component, and convert * to float. */ static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx, @@ -7302,22 +7288,32 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx, key->vs_prolog.states.instance_divisor_is_one & (1u << i); bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); - LLVMValueRef index; - - if (divisor_is_one || divisor_is_fetched) { - LLVMValueRef divisor = ctx->i32_1; - - if (divisor_is_fetched) { - divisor = buffer_load_const(ctx, instance_divisor_constbuf, - LLVMConstInt(ctx->i32, i * 4, 0)); - divisor = ac_to_integer(&ctx->ac, divisor); + LLVMValueRef index = NULL; + + if (divisor_is_one) { + index = ctx->abi.instance_id; + } else if (divisor_is_fetched) { + LLVMValueRef udiv_factors[4]; + + for (unsigned j = 0; j < 4; j++) { + udiv_factors[j] = + buffer_load_const(ctx, instance_divisor_constbuf, + LLVMConstInt(ctx->i32, i*16 + j*4, 0)); + udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); } + /* The faster NUW version doesn't work when InstanceID == UINT_MAX. + * Such InstanceID might not be achievable in a reasonable time though. + */ + index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, + udiv_factors[0], udiv_factors[1], + udiv_factors[2], udiv_factors[3]); + } - /* InstanceID / Divisor + StartInstance */ - index = get_instance_index_for_fetch(ctx, - user_sgpr_base + - SI_SGPR_START_INSTANCE, - divisor); + if (divisor_is_one || divisor_is_fetched) { + /* Add StartInstance. */ + index = LLVMBuildAdd(ctx->ac.builder, index, + LLVMGetParam(ctx->main_fn, user_sgpr_base + + SI_SGPR_START_INSTANCE), ""); } else { /* VertexID + BaseVertex */ index = LLVMBuildAdd(ctx->ac.builder, |