summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Marek Olšák <[email protected]> 2017-04-06 00:33:45 +0200
committer: Marek Olšák <[email protected]> 2017-04-28 21:47:35 +0200
commit: 0af00f179ec3303c2bf2bccbfe80703b96b4500d (patch)
tree: 2c7591918e25a3595b170b425bb1f4340a4c7fbb /src
parent: 0d6d25475d7c3ddc04349afee8e11b67d5ecf726 (diff)
radeonsi/gfx9: add support for monolithic merged LS-HS
Reviewed-by: Nicolai Hähnle <[email protected]>
Diffstat (limited to 'src')
-rw-r--r-- src/gallium/drivers/radeonsi/si_shader.c 143
-rw-r--r-- src/gallium/drivers/radeonsi/si_shader.h 2
2 files changed, 128 insertions, 17 deletions
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 126f7dc7e59..3d23d72d213 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -7463,20 +7463,23 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx,
static void si_build_wrapper_function(struct si_shader_context *ctx,
LLVMValueRef *parts,
unsigned num_parts,
- unsigned main_part)
+ unsigned main_part,
+ unsigned next_shader_first_part)
{
struct gallivm_state *gallivm = &ctx->gallivm;
LLVMBuilderRef builder = ctx->gallivm.builder;
/* PS epilog has one arg per color component */
LLVMTypeRef param_types[48];
- LLVMValueRef out[48];
+ LLVMValueRef initial[48], out[48];
LLVMTypeRef function_type;
unsigned num_params;
- unsigned num_out;
+ unsigned num_out, initial_num_out;
MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
+ MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
unsigned num_sgprs, num_vgprs;
unsigned last_sgpr_param;
unsigned gprs;
+ struct lp_build_if_state if_state;
for (unsigned i = 0; i < num_parts; ++i) {
lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
@@ -7528,6 +7531,13 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
+ if (is_merged_shader(ctx->shader)) {
+ LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
+ lp_build_intrinsic(ctx->gallivm.builder,
+ "llvm.amdgcn.init.exec", ctx->voidt,
+ &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
+ }
+
/* Record the arguments of the function as if they were an output of
* a previous part.
*/
@@ -7564,6 +7574,10 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
num_out_sgpr = num_out;
}
+ memcpy(initial, out, sizeof(out));
+ initial_num_out = num_out;
+ initial_num_out_sgpr = num_out_sgpr;
+
/* Now chain the parts. */
for (unsigned part = 0; part < num_parts; ++part) {
LLVMValueRef in[48];
@@ -7574,6 +7588,24 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
num_params = LLVMCountParams(parts[part]);
assert(num_params <= ARRAY_SIZE(param_types));
+ /* Merged shaders are executed conditionally depending
+ * on the number of enabled threads passed in the input SGPRs. */
+ if (is_merged_shader(ctx->shader) &&
+ (part == 0 || part == next_shader_first_part)) {
+ LLVMValueRef ena, count = initial[3];
+
+ /* The thread count for the 2nd shader is at bit-offset 8. */
+ if (part == next_shader_first_part) {
+ count = LLVMBuildLShr(builder, count,
+ LLVMConstInt(ctx->i32, 8, 0), "");
+ }
+ count = LLVMBuildAnd(builder, count,
+ LLVMConstInt(ctx->i32, 0x7f, 0), "");
+ ena = LLVMBuildICmp(builder, LLVMIntULT,
+ ac_get_thread_id(&ctx->ac), count, "");
+ lp_build_if(&if_state, &ctx->gallivm, ena);
+ }
+
/* Derive arguments for the next part from outputs of the
* previous one.
*/
@@ -7621,9 +7653,33 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
}
ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
- ret_type = LLVMTypeOf(ret);
+
+ if (is_merged_shader(ctx->shader) &&
+ (part + 1 == next_shader_first_part ||
+ part + 1 == num_parts)) {
+ lp_build_endif(&if_state);
+
+ if (part + 1 == next_shader_first_part) {
+ /* A barrier is required between 2 merged shaders. */
+ si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL);
+
+ /* The second half of the merged shader should use
+ * the inputs from the toplevel (wrapper) function,
+ * not the return value from the last call.
+ *
+ * That's because the last call was executed condi-
+ * tionally, so we can't consume it in the main
+ * block.
+ */
+ memcpy(out, initial, sizeof(initial));
+ num_out = initial_num_out;
+ num_out_sgpr = initial_num_out_sgpr;
+ }
+ continue;
+ }
/* Extract the returned GPRs. */
+ ret_type = LLVMTypeOf(ret);
num_out = 0;
num_out_sgpr = 0;
@@ -7711,19 +7767,70 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
}
si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog,
- need_prolog ? 1 : 0);
+ need_prolog ? 1 : 0, 0);
} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
- LLVMValueRef parts[2];
- union si_shader_part_key epilog_key;
+ if (sscreen->b.chip_class >= GFX9) {
+ struct si_shader_selector *ls = shader->key.part.tcs.ls;
+ LLVMValueRef parts[4];
+
+ /* TCS main part */
+ parts[2] = ctx.main_fn;
+
+ /* TCS epilog */
+ union si_shader_part_key tcs_epilog_key;
+ memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
+ tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+ si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
+ parts[3] = ctx.main_fn;
+
+ /* VS prolog */
+ if (ls->vs_needs_prolog) {
+ union si_shader_part_key vs_prolog_key;
+ si_get_vs_prolog_key(&ls->info,
+ shader->info.num_input_sgprs,
+ &shader->key.part.tcs.ls_prolog,
+ shader, &vs_prolog_key);
+ vs_prolog_key.vs_prolog.is_monolithic = true;
+ si_build_vs_prolog_function(&ctx, &vs_prolog_key);
+ parts[0] = ctx.main_fn;
+ }
- parts[0] = ctx.main_fn;
+ /* VS as LS main part */
+ struct si_shader shader_ls = {};
+ shader_ls.selector = ls;
+ shader_ls.key.as_ls = 1;
+ shader_ls.key.mono = shader->key.mono;
+ shader_ls.key.opt = shader->key.opt;
+ si_llvm_context_set_tgsi(&ctx, &shader_ls);
- memset(&epilog_key, 0, sizeof(epilog_key));
- epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
- si_build_tcs_epilog_function(&ctx, &epilog_key);
- parts[1] = ctx.main_fn;
+ if (!si_compile_tgsi_main(&ctx, true)) {
+ si_llvm_dispose(&ctx);
+ return -1;
+ }
+ shader->info.uses_instanceid |= ls->info.uses_instanceid;
+ parts[1] = ctx.main_fn;
+
+ /* Reset the shader context. */
+ ctx.shader = shader;
+ ctx.type = PIPE_SHADER_TESS_CTRL;
- si_build_wrapper_function(&ctx, parts, 2, 0);
+ si_build_wrapper_function(&ctx,
+ parts + !ls->vs_needs_prolog,
+ 4 - !ls->vs_needs_prolog, 0,
+ ls->vs_needs_prolog ? 2 : 1);
+ } else {
+ LLVMValueRef parts[2];
+ union si_shader_part_key epilog_key;
+
+ parts[0] = ctx.main_fn;
+
+ memset(&epilog_key, 0, sizeof(epilog_key));
+ epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+ si_build_tcs_epilog_function(&ctx, &epilog_key);
+ parts[1] = ctx.main_fn;
+
+ si_build_wrapper_function(&ctx, parts, 2, 0, 0);
+ }
} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
!shader->key.as_es) {
LLVMValueRef parts[2];
@@ -7735,7 +7842,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
si_build_vs_epilog_function(&ctx, &epilog_key);
parts[1] = ctx.main_fn;
- si_build_wrapper_function(&ctx, parts, 2, 0);
+ si_build_wrapper_function(&ctx, parts, 2, 0, 0);
} else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
LLVMValueRef parts[2];
union si_shader_part_key prolog_key;
@@ -7747,7 +7854,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
si_build_gs_prolog_function(&ctx, &prolog_key);
parts[0] = ctx.main_fn;
- si_build_wrapper_function(&ctx, parts, 2, 1);
+ si_build_wrapper_function(&ctx, parts, 2, 1, 0);
} else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
LLVMValueRef parts[3];
union si_shader_part_key prolog_key;
@@ -7768,7 +7875,8 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
si_build_ps_epilog_function(&ctx, &epilog_key);
parts[need_prolog ? 2 : 1] = ctx.main_fn;
- si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, need_prolog ? 1 : 0);
+ si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
+ need_prolog ? 1 : 0, 0);
}
/* Dump LLVM IR before any optimization passes */
@@ -8028,7 +8136,8 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
num_params, last_sgpr);
func = ctx->main_fn;
- if (key->vs_prolog.num_merged_next_stage_vgprs)
+ if (key->vs_prolog.num_merged_next_stage_vgprs &&
+ !key->vs_prolog.is_monolithic)
si_init_exec_from_input(ctx, 3, 0);
/* Copy inputs to outputs. This should be no-op, as the registers match,
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 2d660ac6cc1..299cbab54c9 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -355,6 +355,8 @@ union si_shader_part_key {
/* For merged stages such as LS-HS, HS input VGPRs are first. */
unsigned num_merged_next_stage_vgprs:3;
unsigned last_input:4;
+ /* Prologs for monolithic shaders shouldn't set EXEC. */
+ unsigned is_monolithic:1;
} vs_prolog;
struct {
struct si_vs_epilog_bits states;