From 45c5c444518b7e83d9accd9f44702fa49282a3b8 Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle <nicolai.haehnle@amd.com>
Date: Tue, 29 Aug 2017 23:11:38 +0200
Subject: radeonsi/gfx9: proper workaround for LS/HS VGPR initialization bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the HS wave is empty, the hardware writes the LS VGPRs starting at
v0 instead of v2. Work around this by shifting them back into place when
necessary. For simplicity, this is always done in the LS prolog.

According to the hardware team, this will be fixed in future chips,
so take that into account already.

Note that this is not a bug fix, as the bug was already worked
around by commit 166823bfd26 ("radeonsi/gfx9: add a temporary workaround
for a tessellation driver bug"). This change merely replaces that
workaround with one that should be better.

v2: add workaround code to shader only when necessary
v3: clarify the prefer_mono comment

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_pipe.h          |  1 +
 src/gallium/drivers/radeonsi/si_shader.c        | 71 ++++++++++++++++++-------
 src/gallium/drivers/radeonsi/si_shader.h        |  1 +
 src/gallium/drivers/radeonsi/si_state_draw.c    | 27 ++++++++--
 src/gallium/drivers/radeonsi/si_state_shaders.c |  9 ++++
 5 files changed, 85 insertions(+), 24 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index dde0c115dc5..8db7028c9a1 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -389,6 +389,7 @@ struct si_context {
 
 	/* Emitted draw state. */
 	bool			gs_tri_strip_adj_fix:1;
+	bool			ls_vgpr_fix:1;
 	int			last_index_size;
 	int			last_base_vertex;
 	int			last_start_instance;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 0e89ccac09d..db8297ddc4a 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5444,6 +5444,8 @@ static void si_dump_shader_key_vs(const struct si_shader_key *key,
 		prefix, prolog->instance_divisor_is_one);
 	fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
 		prefix, prolog->instance_divisor_is_fetched);
+	fprintf(f, "  %s.ls_vgpr_fix = %u\n",
+		prefix, prolog->ls_vgpr_fix);
 
 	fprintf(f, "  mono.vs.fix_fetch = {");
 	for (int i = 0; i < SI_MAX_ATTRIBS; i++)
@@ -5636,6 +5638,14 @@ static void si_init_exec_from_input(struct si_shader_context *ctx,
 			   ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
 }
 
+static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
+			       const struct si_vs_prolog_bits *key)
+{
+	/* Needed when the selector wants a prolog or the key sets ls_vgpr_fix:
+	 * the Vega10/Raven VGPR init fixup is always done in the VS prolog. */
+	return sel->vs_needs_prolog || key->ls_vgpr_fix;
+}
+
 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
 				 bool is_monolithic)
 {
@@ -5712,7 +5722,7 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
 		    (shader->key.as_es || shader->key.as_ls) &&
 		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
 		     (ctx->type == PIPE_SHADER_VERTEX &&
-		      !sel->vs_needs_prolog))) {
+		      !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
 			si_init_exec_from_input(ctx,
 						ctx->param_merged_wave_info, 0);
 		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
@@ -6364,6 +6374,8 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 		if (sscreen->b.chip_class >= GFX9) {
 			struct si_shader_selector *ls = shader->key.part.tcs.ls;
 			LLVMValueRef parts[4];
+			bool vs_needs_prolog =
+				si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
 
 			/* TCS main part */
 			parts[2] = ctx.main_fn;
@@ -6376,7 +6388,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 			parts[3] = ctx.main_fn;
 
 			/* VS prolog */
-			if (ls->vs_needs_prolog) {
+			if (vs_needs_prolog) {
 				union si_shader_part_key vs_prolog_key;
 				si_get_vs_prolog_key(&ls->info,
 						     shader->info.num_input_sgprs,
@@ -6407,9 +6419,9 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 			ctx.type = PIPE_SHADER_TESS_CTRL;
 
 			si_build_wrapper_function(&ctx,
-						  parts + !ls->vs_needs_prolog,
-						  4 - !ls->vs_needs_prolog, 0,
-						  ls->vs_needs_prolog ? 2 : 1);
+						  parts + !vs_needs_prolog,
+						  4 - !vs_needs_prolog, 0,
+						  vs_needs_prolog ? 2 : 1);
 		} else {
 			LLVMValueRef parts[2];
 			union si_shader_part_key epilog_key;
@@ -6746,9 +6758,9 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 	LLVMTypeRef *returns;
 	LLVMValueRef ret, func;
 	int num_returns, i;
-	unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
-				 key->vs_prolog.num_merged_next_stage_vgprs;
+	unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
 	unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
+	LLVMValueRef input_vgprs[9];
 	unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
 				      num_input_vgprs;
 	unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
@@ -6768,13 +6780,10 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 
 	/* Preloaded VGPRs (outputs must be floats) */
 	for (i = 0; i < num_input_vgprs; i++) {
-		add_arg(&fninfo, ARG_VGPR, ctx->i32);
+		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
 		returns[num_returns++] = ctx->f32;
 	}
 
-	fninfo.assign[first_vs_vgpr] = &ctx->abi.vertex_id;
-	fninfo.assign[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)] = &ctx->abi.instance_id;
-
 	/* Vertex load indices. */
 	for (i = 0; i <= key->vs_prolog.last_input; i++)
 		returns[num_returns++] = ctx->f32;
@@ -6783,9 +6792,33 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 	si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
 	func = ctx->main_fn;
 
-	if (key->vs_prolog.num_merged_next_stage_vgprs &&
-	    !key->vs_prolog.is_monolithic)
-		si_init_exec_from_input(ctx, 3, 0);
+	if (key->vs_prolog.num_merged_next_stage_vgprs) {
+		if (!key->vs_prolog.is_monolithic)
+			si_init_exec_from_input(ctx, 3, 0);
+
+		if (key->vs_prolog.as_ls &&
+		    (ctx->screen->b.family == CHIP_VEGA10 ||
+		     ctx->screen->b.family == CHIP_RAVEN)) {
+			/* If there are no HS threads, SPI loads the LS VGPRs
+			 * starting at VGPR 0. Shift them back to where they
+			 * belong.
+			 */
+			LLVMValueRef has_hs_threads =
+				LLVMBuildICmp(gallivm->builder, LLVMIntNE,
+				    unpack_param(ctx, 3, 8, 8),
+				    ctx->i32_0, "");
+
+			for (i = 4; i > 0; --i) {
+				input_vgprs[i + 1] =
+					LLVMBuildSelect(gallivm->builder, has_hs_threads,
+						        input_vgprs[i + 1],
+						        input_vgprs[i - 1], "");
+			}
+		}
+	}
+
+	ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
+	ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];
 
 	/* Copy inputs to outputs. This should be no-op, as the registers match,
 	 * but it will prevent the compiler from overwriting them unintentionally.
@@ -6795,10 +6828,11 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 		LLVMValueRef p = LLVMGetParam(func, i);
 		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
 	}
-	for (; i < fninfo.num_params; i++) {
-		LLVMValueRef p = LLVMGetParam(func, i);
+	for (i = 0; i < num_input_vgprs; i++) {
+		LLVMValueRef p = input_vgprs[i];
 		p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
-		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+		ret = LLVMBuildInsertValue(gallivm->builder, ret, p,
+					   key->vs_prolog.num_input_sgprs + i, "");
 	}
 
 	/* Compute vertex load indices from instance divisors. */
@@ -6859,8 +6893,7 @@ static bool si_get_vs_prolog(struct si_screen *sscreen,
 {
 	struct si_shader_selector *vs = main_part->selector;
 
-	/* The prolog is a no-op if there are no inputs. */
-	if (!vs->vs_needs_prolog)
+	if (!si_vs_needs_prolog(vs, key))
 		return true;
 
 	/* Get the prolog. */
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 0c0fa10f40f..ee6b0c167f9 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -398,6 +398,7 @@ struct si_vs_prolog_bits {
 	 */
 	uint16_t	instance_divisor_is_one;     /* bitmask of inputs */
 	uint16_t	instance_divisor_is_fetched; /* bitmask of inputs */
+	unsigned	ls_vgpr_fix:1;
 };
 
 /* Common TCS bits between the shader key and the epilog key. */
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 7ee6cf88e88..051dfea8f7c 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -195,11 +195,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	 */
 	*num_patches = MIN2(*num_patches, 40);
 
-	if (sctx->b.chip_class == SI ||
-	    /* TODO: fix GFX9 where a threadgroup contains more than 1 wave and
-	     * LS vertices per patch > HS vertices per patch. Piglit: 16in-1out */
-	    (sctx->b.chip_class == GFX9 &&
-	     num_tcs_input_cp > num_tcs_output_cp)) {
+	if (sctx->b.chip_class == SI) {
 		/* SI bug workaround, related to power management. Limit LS-HS
 		 * threadgroups to only one wave.
 		 */
@@ -1264,6 +1260,27 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		sctx->do_update_shaders = true;
 	}
 
+	if (sctx->tes_shader.cso &&
+	    (sctx->b.family == CHIP_VEGA10 || sctx->b.family == CHIP_RAVEN)) {
+		/* Determine whether the LS VGPR fix should be applied.
+		 *
+		 * It is only required when num input CPs > num output CPs,
+		 * which cannot happen with the fixed function TCS. We should
+		 * also update this bit when switching from TCS to fixed
+		 * function TCS.
+		 */
+		struct si_shader_selector *tcs = sctx->tcs_shader.cso;
+		bool ls_vgpr_fix =
+			tcs &&
+			info->vertices_per_patch >
+			tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
+
+		if (ls_vgpr_fix != sctx->ls_vgpr_fix) {
+			sctx->ls_vgpr_fix = ls_vgpr_fix;
+			sctx->do_update_shaders = true;
+		}
+	}
+
 	if (sctx->gs_shader.cso) {
 		/* Determine whether the GS triangle strip adjacency fix should
 		 * be applied. Rotate every other triangle if
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 4f04bbdfaff..d8791a2a62e 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1283,6 +1283,15 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 			si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
 						  key, &key->part.tcs.ls_prolog);
 			key->part.tcs.ls = sctx->vs_shader.cso;
+
+			/* When the LS VGPR fix is needed, monolithic shaders
+			 * can:
+			 *  - avoid initializing EXEC in both the LS prolog
+			 *    and the LS main part when !vs_needs_prolog
+			 *  - remove the fixup for unused input VGPRs
+			 */
+			key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix;
+			key->opt.prefer_mono = sctx->ls_vgpr_fix;
 		}
 
 		key->part.tcs.epilog.prim_mode =
-- 
cgit v1.2.3