radeonsi: tell LLVM not to remove s_barrier instructions

LLVM 5.0 removes s_barrier instructions if the max-work-group-size attribute is not set. What a surprise. Reviewed-by: Nicolai Hähnle <[email protected]>
author: Marek Olšák <[email protected]> 2017-04-23 00:46:55 +0200
committer: Marek Olšák <[email protected]> 2017-04-28 21:47:35 +0200
commit: 55445ff1891724c78e6573d2f8c721e14c0449fc (patch)
tree: 8aeb6fd6f1887dd0fd39b641642894d436510e5f /src/gallium/drivers/radeonsi
parent: 0490074cab0393b1128d1f316948c7bdd6d45e57 (diff)
1 files changed, 33 insertions, 12 deletions
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index a330b3043c5..c0e190c04d7 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5683,7 +5683,7 @@ static void si_create_function(struct si_shader_context *ctx,
 			       const char *name,
 			       LLVMTypeRef *returns, unsigned num_returns,
 			       LLVMTypeRef *params, unsigned num_params,
-			       int last_sgpr)
+			       int last_sgpr, unsigned max_workgroup_size)
 {
 	int i;
 
@@ -5710,6 +5710,10 @@ static void si_create_function(struct si_shader_context *ctx,
 			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
 	}
 
+	if (max_workgroup_size) {
+		si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
+				      max_workgroup_size);
+	}
 	LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
 					   "no-signed-zeros-fp-math",
 					   "true");
@@ -5791,6 +5795,22 @@ static void declare_lds_as_pointer(struct si_shader_context *ctx)
 
 static unsigned si_get_max_workgroup_size(struct si_shader *shader)
 {
+	switch (shader->selector->type) {
+	case PIPE_SHADER_TESS_CTRL:
+		/* Return this so that LLVM doesn't remove s_barrier
+		 * instructions on chips where we use s_barrier. */
+		return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
+
+	case PIPE_SHADER_GEOMETRY:
+		return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
+
+	case PIPE_SHADER_COMPUTE:
+		break; /* see below */
+
+	default:
+		return 0;
+	}
+
 	const unsigned *properties = shader->selector->info.properties;
 	unsigned max_work_group_size =
 	               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
@@ -6181,7 +6201,8 @@ static void create_function(struct si_shader_context *ctx)
 	assert(num_params <= ARRAY_SIZE(params));
 
 	si_create_function(ctx, "main", returns, num_returns, params,
-			   num_params, last_sgpr);
+			   num_params, last_sgpr,
+			   si_get_max_workgroup_size(shader));
 
 	/* Reserve register locations for VGPR inputs the PS prolog may need. */
 	if (ctx->type == PIPE_SHADER_FRAGMENT &&
@@ -6196,10 +6217,6 @@ static void create_function(struct si_shader_context *ctx)
 				      S_0286D0_LINEAR_CENTROID_ENA(1) |
 				      S_0286D0_FRONT_FACE_ENA(1) |
 				      S_0286D0_POS_FIXED_PT_ENA(1));
-	} else if (ctx->type == PIPE_SHADER_COMPUTE) {
-		si_llvm_add_attribute(ctx->main_fn,
-				      "amdgpu-max-work-group-size",
-				      si_get_max_workgroup_size(shader));
 	}
 
 	shader->info.num_input_sgprs = 0;
@@ -7573,7 +7590,7 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx,
 
 	/* Create the function. */
 	si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
-			   params, num_sgprs + num_vgprs, num_sgprs - 1);
+			   params, num_sgprs + num_vgprs, num_sgprs - 1, 0);
 	func = ctx->main_fn;
 
 	/* Set the full EXEC mask for the prolog, because we are only fiddling
@@ -7733,7 +7750,9 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
 		gprs += size;
 	}
 
-	si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
+	si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params,
+			   last_sgpr_param,
+			   si_get_max_workgroup_size(ctx->shader));
 
 	if (is_merged_shader(ctx->shader))
 		si_init_exec_full_mask(ctx);
@@ -8371,7 +8390,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 
 	/* Create the function. */
 	si_create_function(ctx, "vs_prolog", returns, num_returns, params,
-			   num_params, last_sgpr);
+			   num_params, last_sgpr, 0);
 	func = ctx->main_fn;
 
 	if (key->vs_prolog.num_merged_next_stage_vgprs &&
@@ -8515,7 +8534,8 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
 	params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
 
 	/* Create the function. */
-	si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
+	si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr,
+			   ctx->screen->b.chip_class >= CIK ? 128 : 64);
 	declare_lds_as_pointer(ctx);
 	func = ctx->main_fn;
 
@@ -8636,7 +8656,7 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
 
 	/* Create the function. */
 	si_create_function(ctx, "ps_prolog", params, num_returns, params,
-			   num_params, last_sgpr);
+			   num_params, last_sgpr, 0);
 	func = ctx->main_fn;
 
 	/* Copy inputs to outputs. This should be no-op, as the registers match,
@@ -8878,7 +8898,8 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx,
 		params[i] = ctx->f32;
 
 	/* Create the function. */
-	si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
+	si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params,
+			   last_sgpr, 0);
 	/* Disable elimination of unused inputs. */
 	si_llvm_add_attribute(ctx->main_fn,
 				  "InitialPSInputAddr", 0xffffff);
author	Marek Olšák <[email protected]>	2017-04-23 00:46:55 +0200
committer	Marek Olšák <[email protected]>	2017-04-28 21:47:35 +0200
commit	55445ff1891724c78e6573d2f8c721e14c0449fc (patch)
tree	8aeb6fd6f1887dd0fd39b641642894d436510e5f /src/gallium/drivers/radeonsi
parent	0490074cab0393b1128d1f316948c7bdd6d45e57 (diff)