From 4accb02d7a3722b3e1eb12252201846353f002b7 Mon Sep 17 00:00:00 2001 From: Marek Olšák Date: Thu, 30 Jun 2016 02:16:16 +0200 Subject: radeonsi: enable the barycentric optimization in all cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Handle the bc_optimize SGPR bit if both CENTER and CENTROID are enabled. This should increase the PS launch rate for big primitives with MSAA. Based on discussion with SPI guys. Reviewed-by: Nicolai Hähnle --- src/gallium/drivers/radeonsi/si_shader.c | 118 +++++++++++++++++++++++- src/gallium/drivers/radeonsi/si_shader.h | 7 +- src/gallium/drivers/radeonsi/si_state_shaders.c | 18 ++-- 3 files changed, 125 insertions(+), 18 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index a59c28e75bf..abd58855da9 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1435,6 +1435,56 @@ static void interp_fs_input(struct si_shader_context *ctx, } } +/* LLVMGetParam with bc_optimize resolved. */ +static LLVMValueRef get_interp_param(struct si_shader_context *ctx, + int interp_param_idx) +{ + LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder; + LLVMValueRef main_fn = ctx->radeon_bld.main_fn; + LLVMValueRef param = NULL; + + /* Handle PRIM_MASK[31] (bc_optimize). */ + if (ctx->is_monolithic && + ((ctx->shader->key.ps.prolog.bc_optimize_for_persp && + interp_param_idx == SI_PARAM_PERSP_CENTROID) || + (ctx->shader->key.ps.prolog.bc_optimize_for_linear && + interp_param_idx == SI_PARAM_LINEAR_CENTROID))) { + /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER; + * The hw doesn't compute CENTROID if the whole wave only + * contains fully-covered quads. + */ + LLVMValueRef bc_optimize = + LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK); + bc_optimize = LLVMBuildLShr(builder, + bc_optimize, + LLVMConstInt(ctx->i32, 31, 0), ""); + bc_optimize = LLVMBuildTrunc(builder, bc_optimize, ctx->i1, ""); + + if (ctx->shader->key.ps.prolog.bc_optimize_for_persp && + interp_param_idx == SI_PARAM_PERSP_CENTROID) { + param = LLVMBuildSelect(builder, bc_optimize, + LLVMGetParam(main_fn, + SI_PARAM_PERSP_CENTER), + LLVMGetParam(main_fn, + SI_PARAM_PERSP_CENTROID), + ""); + } + if (ctx->shader->key.ps.prolog.bc_optimize_for_linear && + interp_param_idx == SI_PARAM_LINEAR_CENTROID) { + param = LLVMBuildSelect(builder, bc_optimize, + LLVMGetParam(main_fn, + SI_PARAM_LINEAR_CENTER), + LLVMGetParam(main_fn, + SI_PARAM_LINEAR_CENTROID), + ""); + } + } + + if (!param) + param = LLVMGetParam(main_fn, interp_param_idx); + return param; +} + static void declare_input_fs( struct radeon_llvm_context *radeon_bld, unsigned input_index, @@ -1475,7 +1525,7 @@ static void declare_input_fs( else if (interp_param_idx) { interp_param_idx = select_interp_param(ctx, interp_param_idx); - interp_param = LLVMGetParam(main_fn, interp_param_idx); + interp_param = get_interp_param(ctx, interp_param_idx); } interp_fs_input(ctx, input_index, decl->Semantic.Name, @@ -5041,7 +5091,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, if (interp_param_idx == -1) return; else if (interp_param_idx) - interp_param = LLVMGetParam(ctx->radeon_bld.main_fn, interp_param_idx); + interp_param = get_interp_param(ctx, interp_param_idx); else interp_param = NULL; @@ -6398,6 +6448,8 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) fprintf(f, " prolog.force_linear_sample_interp = %u\n", key->ps.prolog.force_linear_sample_interp); fprintf(f, " prolog.force_persp_center_interp = %u\n", key->ps.prolog.force_persp_center_interp); fprintf(f, " prolog.force_linear_center_interp = %u\n", key->ps.prolog.force_linear_center_interp); + fprintf(f, " prolog.bc_optimize_for_persp = %u\n", key->ps.prolog.bc_optimize_for_persp); + fprintf(f, " prolog.bc_optimize_for_linear = %u\n", key->ps.prolog.bc_optimize_for_linear); fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format); fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8); fprintf(f, " epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf); @@ -7192,6 +7244,55 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen, si_llvm_emit_polygon_stipple(&ctx, list, pos); } + if (key->ps_prolog.states.bc_optimize_for_persp || + key->ps_prolog.states.bc_optimize_for_linear) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef center[2], centroid[2], tmp, bc_optimize; + + /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER; + * The hw doesn't compute CENTROID if the whole wave only + * contains fully-covered quads. + * + * PRIM_MASK is after user SGPRs. + */ + bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); + bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize, + LLVMConstInt(ctx.i32, 31, 0), ""); + bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize, + ctx.i1, ""); + + if (key->ps_prolog.states.bc_optimize_for_persp) { + /* Read PERSP_CENTER. */ + for (i = 0; i < 2; i++) + center[i] = LLVMGetParam(func, base + 2 + i); + /* Read PERSP_CENTROID. */ + for (i = 0; i < 2; i++) + centroid[i] = LLVMGetParam(func, base + 4 + i); + /* Select PERSP_CENTROID. */ + for (i = 0; i < 2; i++) { + tmp = LLVMBuildSelect(gallivm->builder, bc_optimize, + center[i], centroid[i], ""); + ret = LLVMBuildInsertValue(gallivm->builder, ret, + tmp, base + 4 + i, ""); + } + } + if (key->ps_prolog.states.bc_optimize_for_linear) { + /* Read LINEAR_CENTER. */ + for (i = 0; i < 2; i++) + center[i] = LLVMGetParam(func, base + 8 + i); + /* Read LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) + centroid[i] = LLVMGetParam(func, base + 10 + i); + /* Select LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) { + tmp = LLVMBuildSelect(gallivm->builder, bc_optimize, + center[i], centroid[i], ""); + ret = LLVMBuildInsertValue(gallivm->builder, ret, + tmp, base + 10 + i, ""); + } + } + } + /* Interpolate colors. */ for (i = 0; i < 2; i++) { unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; @@ -7208,8 +7309,11 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen, unsigned interp_vgpr = key->ps_prolog.num_input_sgprs + key->ps_prolog.color_interp_vgpr_index[i]; - interp[0] = LLVMGetParam(func, interp_vgpr); - interp[1] = LLVMGetParam(func, interp_vgpr + 1); + /* Get the (i,j) updated by bc_optimize handling. */ + interp[0] = LLVMBuildExtractValue(gallivm->builder, ret, + interp_vgpr, ""); + interp[1] = LLVMBuildExtractValue(gallivm->builder, ret, + interp_vgpr + 1, ""); interp_ij = lp_build_gather_values(gallivm, interp, 2); interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij, ctx.v2i32, ""); @@ -7466,7 +7570,9 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen, prolog_key.ps_prolog.states.force_persp_sample_interp || prolog_key.ps_prolog.states.force_linear_sample_interp || prolog_key.ps_prolog.states.force_persp_center_interp || - prolog_key.ps_prolog.states.force_linear_center_interp); + prolog_key.ps_prolog.states.force_linear_center_interp || + prolog_key.ps_prolog.states.bc_optimize_for_persp || + prolog_key.ps_prolog.states.bc_optimize_for_linear); if (info->colors_read) { unsigned *color = shader->selector->color_attr_index; @@ -7557,6 +7663,8 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen, prolog_key.ps_prolog.states.force_linear_sample_interp || prolog_key.ps_prolog.states.force_persp_center_interp || prolog_key.ps_prolog.states.force_linear_center_interp || + prolog_key.ps_prolog.states.bc_optimize_for_persp || + prolog_key.ps_prolog.states.bc_optimize_for_linear || prolog_key.ps_prolog.states.poly_stipple) { shader->prolog = si_get_shader_part(sscreen, &sscreen->ps_prologs, diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 064773605fb..3b7b3e155b3 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -317,11 +317,8 @@ struct si_ps_prolog_bits { unsigned force_linear_sample_interp:1; unsigned force_persp_center_interp:1; unsigned force_linear_center_interp:1; - /* TODO: - * - add force_center_interp_bc_optimize to force center interpolation - * based on the bc_optimize SGPR bit if MSAA is enabled, centroid is - * present and sample isn't present. - */ + unsigned bc_optimize_for_persp:1; + unsigned bc_optimize_for_linear:1; }; /* Common PS bits between the shader key and the epilog key. */ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index d679825914d..dc4f187bc65 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -664,7 +664,6 @@ static void si_shader_ps(struct si_shader *shader) unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask; unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1); uint64_t va; - bool has_centroid; unsigned input_ena = shader->config.spi_ps_input_ena; /* we need to enable at least one of them, otherwise we hang the GPU */ @@ -729,11 +728,7 @@ static void si_shader_ps(struct si_shader *shader) shader->config.spi_ps_input_addr); /* Set interpolation controls. */ - has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena) || - G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena); - - spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) | - S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid); + spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)); /* Set registers. */ si_pm4_set_reg(pm4, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl); @@ -946,8 +941,15 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, key->ps.prolog.force_linear_sample_interp = sel->info.uses_linear_center || sel->info.uses_linear_centroid; - } else if (!rs->multisample_enable || - sctx->framebuffer.nr_samples <= 1) { + } else if (rs->multisample_enable && + sctx->framebuffer.nr_samples > 1) { + key->ps.prolog.bc_optimize_for_persp = + sel->info.uses_persp_center && + sel->info.uses_persp_centroid; + key->ps.prolog.bc_optimize_for_linear = + sel->info.uses_linear_center && + sel->info.uses_linear_centroid; + } else { /* Make sure SPI doesn't compute more than 1 pair * of (i,j), which is the optimization here. */ key->ps.prolog.force_persp_center_interp = -- cgit v1.2.3