summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/gallium/drivers/radeonsi/si_pipe.h1
-rw-r--r--src/gallium/drivers/radeonsi/si_shader.c71
-rw-r--r--src/gallium/drivers/radeonsi/si_shader.h1
-rw-r--r--src/gallium/drivers/radeonsi/si_state_draw.c27
-rw-r--r--src/gallium/drivers/radeonsi/si_state_shaders.c9
5 files changed, 85 insertions, 24 deletions
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index dde0c115dc5..8db7028c9a1 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -389,6 +389,7 @@ struct si_context {
/* Emitted draw state. */
bool gs_tri_strip_adj_fix:1;
+ bool ls_vgpr_fix:1;
int last_index_size;
int last_base_vertex;
int last_start_instance;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 0e89ccac09d..db8297ddc4a 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5444,6 +5444,8 @@ static void si_dump_shader_key_vs(const struct si_shader_key *key,
prefix, prolog->instance_divisor_is_one);
fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
prefix, prolog->instance_divisor_is_fetched);
+ fprintf(f, " %s.ls_vgpr_fix = %u\n",
+ prefix, prolog->ls_vgpr_fix);
fprintf(f, " mono.vs.fix_fetch = {");
for (int i = 0; i < SI_MAX_ATTRIBS; i++)
@@ -5636,6 +5638,14 @@ static void si_init_exec_from_input(struct si_shader_context *ctx,
ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
}
+static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
+ const struct si_vs_prolog_bits *key)
+{
+ /* VGPR initialization fixup for Vega10 and Raven is always done in the
+ * VS prolog. */
+ return sel->vs_needs_prolog || key->ls_vgpr_fix;
+}
+
static bool si_compile_tgsi_main(struct si_shader_context *ctx,
bool is_monolithic)
{
@@ -5712,7 +5722,7 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
(shader->key.as_es || shader->key.as_ls) &&
(ctx->type == PIPE_SHADER_TESS_EVAL ||
(ctx->type == PIPE_SHADER_VERTEX &&
- !sel->vs_needs_prolog))) {
+ !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
si_init_exec_from_input(ctx,
ctx->param_merged_wave_info, 0);
} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
@@ -6364,6 +6374,8 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
if (sscreen->b.chip_class >= GFX9) {
struct si_shader_selector *ls = shader->key.part.tcs.ls;
LLVMValueRef parts[4];
+ bool vs_needs_prolog =
+ si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
/* TCS main part */
parts[2] = ctx.main_fn;
@@ -6376,7 +6388,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
parts[3] = ctx.main_fn;
/* VS prolog */
- if (ls->vs_needs_prolog) {
+ if (vs_needs_prolog) {
union si_shader_part_key vs_prolog_key;
si_get_vs_prolog_key(&ls->info,
shader->info.num_input_sgprs,
@@ -6407,9 +6419,9 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
ctx.type = PIPE_SHADER_TESS_CTRL;
si_build_wrapper_function(&ctx,
- parts + !ls->vs_needs_prolog,
- 4 - !ls->vs_needs_prolog, 0,
- ls->vs_needs_prolog ? 2 : 1);
+ parts + !vs_needs_prolog,
+ 4 - !vs_needs_prolog, 0,
+ vs_needs_prolog ? 2 : 1);
} else {
LLVMValueRef parts[2];
union si_shader_part_key epilog_key;
@@ -6746,9 +6758,9 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
LLVMTypeRef *returns;
LLVMValueRef ret, func;
int num_returns, i;
- unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
- key->vs_prolog.num_merged_next_stage_vgprs;
+ unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
+ LLVMValueRef input_vgprs[9];
unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
num_input_vgprs;
unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
@@ -6768,13 +6780,10 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
/* Preloaded VGPRs (outputs must be floats) */
for (i = 0; i < num_input_vgprs; i++) {
- add_arg(&fninfo, ARG_VGPR, ctx->i32);
+ add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
returns[num_returns++] = ctx->f32;
}
- fninfo.assign[first_vs_vgpr] = &ctx->abi.vertex_id;
- fninfo.assign[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)] = &ctx->abi.instance_id;
-
/* Vertex load indices. */
for (i = 0; i <= key->vs_prolog.last_input; i++)
returns[num_returns++] = ctx->f32;
@@ -6783,9 +6792,33 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
func = ctx->main_fn;
- if (key->vs_prolog.num_merged_next_stage_vgprs &&
- !key->vs_prolog.is_monolithic)
- si_init_exec_from_input(ctx, 3, 0);
+ if (key->vs_prolog.num_merged_next_stage_vgprs) {
+ if (!key->vs_prolog.is_monolithic)
+ si_init_exec_from_input(ctx, 3, 0);
+
+ if (key->vs_prolog.as_ls &&
+ (ctx->screen->b.family == CHIP_VEGA10 ||
+ ctx->screen->b.family == CHIP_RAVEN)) {
+ /* If there are no HS threads, SPI loads the LS VGPRs
+ * starting at VGPR 0. Shift them back to where they
+ * belong.
+ */
+ LLVMValueRef has_hs_threads =
+ LLVMBuildICmp(gallivm->builder, LLVMIntNE,
+ unpack_param(ctx, 3, 8, 8),
+ ctx->i32_0, "");
+
+ for (i = 4; i > 0; --i) {
+ input_vgprs[i + 1] =
+ LLVMBuildSelect(gallivm->builder, has_hs_threads,
+ input_vgprs[i + 1],
+ input_vgprs[i - 1], "");
+ }
+ }
+ }
+
+ ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
+ ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];
/* Copy inputs to outputs. This should be no-op, as the registers match,
* but it will prevent the compiler from overwriting them unintentionally.
@@ -6795,10 +6828,11 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
LLVMValueRef p = LLVMGetParam(func, i);
ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
}
- for (; i < fninfo.num_params; i++) {
- LLVMValueRef p = LLVMGetParam(func, i);
+ for (i = 0; i < num_input_vgprs; i++) {
+ LLVMValueRef p = input_vgprs[i];
p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
- ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+ ret = LLVMBuildInsertValue(gallivm->builder, ret, p,
+ key->vs_prolog.num_input_sgprs + i, "");
}
/* Compute vertex load indices from instance divisors. */
@@ -6859,8 +6893,7 @@ static bool si_get_vs_prolog(struct si_screen *sscreen,
{
struct si_shader_selector *vs = main_part->selector;
- /* The prolog is a no-op if there are no inputs. */
- if (!vs->vs_needs_prolog)
+ if (!si_vs_needs_prolog(vs, key))
return true;
/* Get the prolog. */
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 0c0fa10f40f..ee6b0c167f9 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -398,6 +398,7 @@ struct si_vs_prolog_bits {
*/
uint16_t instance_divisor_is_one; /* bitmask of inputs */
uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
+ unsigned ls_vgpr_fix:1;
};
/* Common TCS bits between the shader key and the epilog key. */
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 7ee6cf88e88..051dfea8f7c 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -195,11 +195,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
*/
*num_patches = MIN2(*num_patches, 40);
- if (sctx->b.chip_class == SI ||
- /* TODO: fix GFX9 where a threadgroup contains more than 1 wave and
- * LS vertices per patch > HS vertices per patch. Piglit: 16in-1out */
- (sctx->b.chip_class == GFX9 &&
- num_tcs_input_cp > num_tcs_output_cp)) {
+ if (sctx->b.chip_class == SI) {
/* SI bug workaround, related to power management. Limit LS-HS
* threadgroups to only one wave.
*/
@@ -1264,6 +1260,27 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
sctx->do_update_shaders = true;
}
+ if (sctx->tes_shader.cso &&
+ (sctx->b.family == CHIP_VEGA10 || sctx->b.family == CHIP_RAVEN)) {
+ /* Determine whether the LS VGPR fix should be applied.
+ *
+ * It is only required when num input CPs > num output CPs,
+ * which cannot happen with the fixed function TCS. We should
+ * also update this bit when switching from TCS to fixed
+ * function TCS.
+ */
+ struct si_shader_selector *tcs = sctx->tcs_shader.cso;
+ bool ls_vgpr_fix =
+ tcs &&
+ info->vertices_per_patch >
+ tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
+
+ if (ls_vgpr_fix != sctx->ls_vgpr_fix) {
+ sctx->ls_vgpr_fix = ls_vgpr_fix;
+ sctx->do_update_shaders = true;
+ }
+ }
+
if (sctx->gs_shader.cso) {
/* Determine whether the GS triangle strip adjacency fix should
* be applied. Rotate every other triangle if
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 4f04bbdfaff..d8791a2a62e 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1283,6 +1283,15 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
key, &key->part.tcs.ls_prolog);
key->part.tcs.ls = sctx->vs_shader.cso;
+
+ /* When the LS VGPR fix is needed, monolithic shaders
+ * can:
+ * - avoid initializing EXEC in both the LS prolog
+ * and the LS main part when !vs_needs_prolog
+ * - remove the fixup for unused input VGPRs
+ */
+ key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix;
+ key->opt.prefer_mono = sctx->ls_vgpr_fix;
}
key->part.tcs.epilog.prim_mode =