diff options
author | Samuel Pitoiset <[email protected]> | 2019-08-23 17:53:05 +0200 |
---|---|---|
committer | Dylan Baker <[email protected]> | 2019-12-04 13:43:32 -0800 |
commit | 5c98b3657791d0786abc60ff55f5aa755d8f82ce (patch) | |
tree | 886448a004662f06c70cd9a4749b24c86f97cda1 /src | |
parent | a3869c14c0d5a1731e07c5dceb2e8052793e208e (diff) |
radv/gfx10: fix implementation of exclusive scans
This implementation is loosely based on the one in ROCm-Device-Libs:
https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/master/ockl/src/wfredscan.cl
This fixes dEQP-VK.subgroups.arithmetic.*.subgroupexclusive* on GFX10.
Fixes: 227c29a80de ("amd/common/gfx10: implement scan & reduce operations")
Signed-off-by: Samuel Pitoiset <[email protected]>
Reviewed-by: Bas Nieuwenhuizen <[email protected]>
(cherry picked from commit c9aa843961d2c3cb34e7cb2dc843b93d723e0692)
Conflicts resolved by Dylan Baker
Diffstat (limited to 'src')
-rw-r--r-- | src/amd/common/ac_llvm_build.c | 83 |
1 file changed, 58 insertions, 25 deletions
diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 5abae00d8f6..07f356a5ee2 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -4218,8 +4218,43 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu { LLVMValueRef result, tmp; - if (ctx->chip_class >= GFX10) { - result = inclusive ? src : identity; + if (inclusive) { + result = src; + } else if (ctx->chip_class >= GFX10) { + /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */ + LLVMValueRef active, tmp1, tmp2; + LLVMValueRef tid = ac_get_thread_id(ctx); + + tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); + + tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false); + + if (maxprefix > 32) { + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, + LLVMConstInt(ctx->i32, 32, false), ""); + + tmp2 = LLVMBuildSelect(ctx->builder, active, + ac_build_readlane(ctx, src, + LLVMConstInt(ctx->i32, 31, false)), + tmp2, ""); + + active = LLVMBuildOr(ctx->builder, active, + LLVMBuildICmp(ctx->builder, LLVMIntEQ, + LLVMBuildAnd(ctx->builder, tid, + LLVMConstInt(ctx->i32, 0x1f, false), ""), + LLVMConstInt(ctx->i32, 0x10, false), ""), ""); + src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + } else if (maxprefix > 16) { + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, + LLVMConstInt(ctx->i32, 16, false), ""); + + src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + } + + result = src; + } else if (ctx->chip_class >= GFX8) { + src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false); + result = src; } else { if (!inclusive) src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false); @@ -4249,33 +4284,31 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu return result; if (ctx->chip_class >= GFX10) { - /* dpp_row_bcast{15,31} are not supported on gfx10. 
*/ - LLVMBuilderRef builder = ctx->builder; LLVMValueRef tid = ac_get_thread_id(ctx); - LLVMValueRef cc; - /* TODO-GFX10: Can we get better code-gen by putting this into - * a branch so that LLVM generates EXEC mask manipulations? */ - if (inclusive) - tmp = result; - else - tmp = ac_build_alu_op(ctx, result, src, op); - tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false); - tmp = ac_build_alu_op(ctx, result, tmp, op); - cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), ""); - cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, ""); - result = LLVMBuildSelect(builder, cc, tmp, result, ""); + LLVMValueRef active; + + tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false); + + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, + LLVMConstInt(ctx->i32, 16, false), ""), + ctx->i32_0, ""); + + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 32) return result; - if (inclusive) - tmp = result; - else - tmp = ac_build_alu_op(ctx, result, src, op); - tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false)); - tmp = ac_build_alu_op(ctx, result, tmp, op); - cc = LLVMBuildICmp(builder, LLVMIntUGE, tid, - LLVMConstInt(ctx->i32, 32, false), ""); - result = LLVMBuildSelect(builder, cc, tmp, result, ""); + tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); + + active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, + LLVMConstInt(ctx->i32, 32, false), ""); + + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + + result = ac_build_alu_op(ctx, result, tmp, op); return result; } |