summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/radeonsi
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/radeonsi')
-rw-r--r--src/gallium/drivers/radeonsi/si_compute.c31
-rw-r--r--src/gallium/drivers/radeonsi/si_pipe.c35
-rw-r--r--src/gallium/drivers/radeonsi/si_pipe.h26
-rw-r--r--src/gallium/drivers/radeonsi/si_shader.c1784
-rw-r--r--src/gallium/drivers/radeonsi/si_shader.h155
-rw-r--r--src/gallium/drivers/radeonsi/si_state.c2
-rw-r--r--src/gallium/drivers/radeonsi/si_state.h2
-rw-r--r--src/gallium/drivers/radeonsi/si_state_shaders.c383
8 files changed, 2152 insertions, 266 deletions
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 7370a113d3d..9f5f4c682bc 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -196,9 +196,7 @@ static unsigned compute_num_waves_for_scratch(
}
static void si_launch_grid(
- struct pipe_context *ctx,
- const uint *block_layout, const uint *grid_layout,
- uint32_t pc, const void *input)
+ struct pipe_context *ctx, const struct pipe_grid_info *info)
{
struct si_context *sctx = (struct si_context*)ctx;
struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
@@ -232,7 +230,7 @@ static void si_launch_grid(
pm4->compute_pkt = true;
/* Read the config information */
- si_shader_binary_read_config(&shader->binary, &shader->config, pc);
+ si_shader_binary_read_config(&shader->binary, &shader->config, info->pc);
/* Upload the kernel arguments */
@@ -242,15 +240,16 @@ static void si_launch_grid(
kernel_args = sctx->b.ws->buffer_map(input_buffer->buf,
sctx->b.gfx.cs, PIPE_TRANSFER_WRITE);
for (i = 0; i < 3; i++) {
- kernel_args[i] = grid_layout[i];
- kernel_args[i + 3] = grid_layout[i] * block_layout[i];
- kernel_args[i + 6] = block_layout[i];
+ kernel_args[i] = info->grid[i];
+ kernel_args[i + 3] = info->grid[i] * info->block[i];
+ kernel_args[i + 6] = info->block[i];
}
num_waves_for_scratch = compute_num_waves_for_scratch(
- &sctx->screen->b.info, block_layout, grid_layout);
+ &sctx->screen->b.info, info->block, info->grid);
- memcpy(kernel_args + (num_work_size_bytes / 4), input, program->input_size);
+ memcpy(kernel_args + (num_work_size_bytes / 4), info->input,
+ program->input_size);
if (shader->config.scratch_bytes_per_wave > 0) {
@@ -291,11 +290,11 @@ static void si_launch_grid(
si_pm4_set_reg(pm4, R_00B818_COMPUTE_START_Z, 0);
si_pm4_set_reg(pm4, R_00B81C_COMPUTE_NUM_THREAD_X,
- S_00B81C_NUM_THREAD_FULL(block_layout[0]));
+ S_00B81C_NUM_THREAD_FULL(info->block[0]));
si_pm4_set_reg(pm4, R_00B820_COMPUTE_NUM_THREAD_Y,
- S_00B820_NUM_THREAD_FULL(block_layout[1]));
+ S_00B820_NUM_THREAD_FULL(info->block[1]));
si_pm4_set_reg(pm4, R_00B824_COMPUTE_NUM_THREAD_Z,
- S_00B824_NUM_THREAD_FULL(block_layout[2]));
+ S_00B824_NUM_THREAD_FULL(info->block[2]));
/* Global buffers */
for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) {
@@ -323,7 +322,7 @@ static void si_launch_grid(
}
shader_va = shader->bo->gpu_address;
- shader_va += pc;
+ shader_va += info->pc;
radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
@@ -375,9 +374,9 @@ static void si_launch_grid(
;
si_pm4_cmd_begin(pm4, PKT3_DISPATCH_DIRECT);
- si_pm4_cmd_add(pm4, grid_layout[0]); /* Thread groups DIM_X */
- si_pm4_cmd_add(pm4, grid_layout[1]); /* Thread groups DIM_Y */
- si_pm4_cmd_add(pm4, grid_layout[2]); /* Thread gropus DIM_Z */
+ si_pm4_cmd_add(pm4, info->grid[0]); /* Thread groups DIM_X */
+ si_pm4_cmd_add(pm4, info->grid[1]); /* Thread groups DIM_Y */
+ si_pm4_cmd_add(pm4, info->grid[2]); /* Thread gropus DIM_Z */
si_pm4_cmd_add(pm4, 1); /* DISPATCH_INITIATOR */
si_pm4_cmd_end(pm4, false);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index e9d69d2db38..37fd4a25d59 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -22,6 +22,7 @@
*/
#include "si_pipe.h"
+#include "si_shader.h"
#include "si_public.h"
#include "sid.h"
@@ -448,6 +449,10 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
switch (param) {
case PIPE_SHADER_CAP_PREFERRED_IR:
return PIPE_SHADER_IR_NATIVE;
+
+ case PIPE_SHADER_CAP_SUPPORTED_IRS:
+ return 0;
+
case PIPE_SHADER_CAP_DOUBLES:
return HAVE_LLVM >= 0x0307;
@@ -511,6 +516,8 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
return 16;
case PIPE_SHADER_CAP_PREFERRED_IR:
return PIPE_SHADER_IR_TGSI;
+ case PIPE_SHADER_CAP_SUPPORTED_IRS:
+ return 0;
case PIPE_SHADER_CAP_DOUBLES:
return HAVE_LLVM >= 0x0307;
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
@@ -522,6 +529,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
return 32;
case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+ case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
return 0;
}
return 0;
@@ -530,6 +538,14 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
static void si_destroy_screen(struct pipe_screen* pscreen)
{
struct si_screen *sscreen = (struct si_screen *)pscreen;
+ struct si_shader_part *parts[] = {
+ sscreen->vs_prologs,
+ sscreen->vs_epilogs,
+ sscreen->tcs_epilogs,
+ sscreen->ps_prologs,
+ sscreen->ps_epilogs
+ };
+ unsigned i;
if (!sscreen)
return;
@@ -537,6 +553,18 @@ static void si_destroy_screen(struct pipe_screen* pscreen)
if (!sscreen->b.ws->unref(sscreen->b.ws))
return;
+ /* Free shader parts. */
+ for (i = 0; i < ARRAY_SIZE(parts); i++) {
+ while (parts[i]) {
+ struct si_shader_part *part = parts[i];
+
+ parts[i] = part->next;
+ radeon_shader_binary_clean(&part->binary);
+ FREE(part);
+ }
+ }
+ pipe_mutex_destroy(sscreen->shader_parts_mutex);
+ si_destroy_shader_cache(sscreen);
r600_destroy_common_screen(&sscreen->b);
}
@@ -584,7 +612,8 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
sscreen->b.b.resource_create = r600_resource_create_common;
if (!r600_common_screen_init(&sscreen->b, ws) ||
- !si_init_gs_info(sscreen)) {
+ !si_init_gs_info(sscreen) ||
+ !si_init_shader_cache(sscreen)) {
FREE(sscreen);
return NULL;
}
@@ -594,6 +623,10 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
sscreen->b.has_cp_dma = true;
sscreen->b.has_streamout = true;
+ pipe_mutex_init(sscreen->shader_parts_mutex);
+ sscreen->use_monolithic_shaders =
+ HAVE_LLVM < 0x0308 ||
+ (sscreen->b.debug_flags & DBG_MONOLITHIC_SHADERS) != 0;
if (debug_get_bool_option("RADEON_DUMP_SHADERS", FALSE))
sscreen->b.debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | DBG_CS;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index b5790d6b564..ef860a58b83 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -80,10 +80,36 @@
#define SI_MAX_BORDER_COLORS 4096
struct si_compute;
+struct hash_table;
struct si_screen {
struct r600_common_screen b;
unsigned gs_table_depth;
+
+ /* Whether shaders are monolithic (1-part) or separate (3-part). */
+ bool use_monolithic_shaders;
+
+ pipe_mutex shader_parts_mutex;
+ struct si_shader_part *vs_prologs;
+ struct si_shader_part *vs_epilogs;
+ struct si_shader_part *tcs_epilogs;
+ struct si_shader_part *ps_prologs;
+ struct si_shader_part *ps_epilogs;
+
+ /* Shader cache in memory.
+ *
+ * Design & limitations:
+ * - The shader cache is per screen (= per process), never saved to
+ * disk, and skips redundant shader compilations from TGSI to bytecode.
+ * - It can only be used with one-variant-per-shader support, in which
+ * case only the main (typically middle) part of shaders is cached.
+ * - Only VS, TCS, TES, PS are cached, out of which only the hw VS
+ * variants of VS and TES are cached, so LS and ES aren't.
+ * - GS and CS aren't cached, but it's certainly possible to cache
+ * those as well.
+ */
+ pipe_mutex shader_cache_mutex;
+ struct hash_table *shader_cache;
};
struct si_blend_color {
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index baa1090e2fb..57458ae1381 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -70,6 +70,12 @@ struct si_shader_context
unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
bool is_gs_copy_shader;
+
+ /* Whether to generate the optimized shader variant compiled as a whole
+ * (without a prolog and epilog)
+ */
+ bool is_monolithic;
+
int param_streamout_config;
int param_streamout_write_index;
int param_streamout_offset[4];
@@ -77,6 +83,7 @@ struct si_shader_context
int param_rel_auto_id;
int param_vs_prim_id;
int param_instance_id;
+ int param_vertex_index0;
int param_tes_u;
int param_tes_v;
int param_tes_rel_patch_id;
@@ -96,14 +103,17 @@ struct si_shader_context
LLVMValueRef esgs_ring;
LLVMValueRef gsvs_ring[4];
LLVMValueRef gs_next_vertex[4];
+ LLVMValueRef return_value;
LLVMTypeRef voidt;
LLVMTypeRef i1;
LLVMTypeRef i8;
LLVMTypeRef i32;
+ LLVMTypeRef i64;
LLVMTypeRef i128;
LLVMTypeRef f32;
LLVMTypeRef v16i8;
+ LLVMTypeRef v2i32;
LLVMTypeRef v4i32;
LLVMTypeRef v4f32;
LLVMTypeRef v8i32;
@@ -118,9 +128,17 @@ static struct si_shader_context *si_shader_context(
static void si_init_shader_ctx(struct si_shader_context *ctx,
struct si_screen *sscreen,
struct si_shader *shader,
- LLVMTargetMachineRef tm,
- struct tgsi_shader_info *info);
+ LLVMTargetMachineRef tm);
+/* Ideally pass the sample mask input to the PS epilog as v13, which
+ * is its usual location, so that the shader doesn't have to add v_mov.
+ */
+#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
+
+/* The VS location of the PrimitiveID input is the same in the epilog,
+ * so that the main shader part doesn't have to move it.
+ */
+#define VS_EPILOG_PRIMID_LOC 2
#define PERSPECTIVE_BASE 0
#define LINEAR_BASE 9
@@ -196,6 +214,10 @@ static LLVMValueRef unpack_param(struct si_shader_context *ctx,
LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
param);
+ if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
+ value = bitcast(&ctx->radeon_bld.soa.bld_base,
+ TGSI_TYPE_UNSIGNED, value);
+
if (rshift)
value = LLVMBuildLShr(gallivm->builder, value,
lp_build_const_int32(gallivm, rshift), "");
@@ -375,7 +397,7 @@ static LLVMValueRef build_indexed_load_const(
static LLVMValueRef get_instance_index_for_fetch(
struct radeon_llvm_context *radeon_bld,
- unsigned divisor)
+ unsigned param_start_instance, unsigned divisor)
{
struct si_shader_context *ctx =
si_shader_context(&radeon_bld->soa.bld_base);
@@ -389,8 +411,8 @@ static LLVMValueRef get_instance_index_for_fetch(
result = LLVMBuildUDiv(gallivm->builder, result,
lp_build_const_int32(gallivm, divisor), "");
- return LLVMBuildAdd(gallivm->builder, result, LLVMGetParam(
- radeon_bld->main_fn, SI_PARAM_START_INSTANCE), "");
+ return LLVMBuildAdd(gallivm->builder, result,
+ LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
}
static void declare_input_vs(
@@ -402,7 +424,8 @@ static void declare_input_vs(
struct gallivm_state *gallivm = base->gallivm;
struct si_shader_context *ctx =
si_shader_context(&radeon_bld->soa.bld_base);
- unsigned divisor = ctx->shader->key.vs.instance_divisors[input_index];
+ unsigned divisor =
+ ctx->shader->key.vs.prolog.instance_divisors[input_index];
unsigned chan;
@@ -424,10 +447,16 @@ static void declare_input_vs(
/* Build the attribute offset */
attribute_offset = lp_build_const_int32(gallivm, 0);
- if (divisor) {
+ if (!ctx->is_monolithic) {
+ buffer_index = LLVMGetParam(radeon_bld->main_fn,
+ ctx->param_vertex_index0 +
+ input_index);
+ } else if (divisor) {
/* Build index from instance ID, start instance and divisor */
- ctx->shader->uses_instanceid = true;
- buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld, divisor);
+ ctx->shader->info.uses_instanceid = true;
+ buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
+ SI_PARAM_START_INSTANCE,
+ divisor);
} else {
/* Load the buffer index for vertices. */
LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
@@ -853,7 +882,8 @@ static int lookup_interp_param_index(unsigned interpolate, unsigned location)
static unsigned select_interp_param(struct si_shader_context *ctx,
unsigned param)
{
- if (!ctx->shader->key.ps.force_persample_interp)
+ if (!ctx->shader->key.ps.prolog.force_persample_interp ||
+ !ctx->is_monolithic)
return param;
/* If the shader doesn't use center/centroid, just return the parameter.
@@ -923,7 +953,7 @@ static void interp_fs_input(struct si_shader_context *ctx,
intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
if (semantic_name == TGSI_SEMANTIC_COLOR &&
- ctx->shader->key.ps.color_two_side) {
+ ctx->shader->key.ps.prolog.color_two_side) {
LLVMValueRef args[4];
LLVMValueRef is_face_positive;
LLVMValueRef back_attr_number;
@@ -997,6 +1027,7 @@ static void declare_input_fs(
unsigned input_index,
const struct tgsi_full_declaration *decl)
{
+ struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
struct si_shader_context *ctx =
si_shader_context(&radeon_bld->soa.bld_base);
struct si_shader *shader = ctx->shader;
@@ -1004,6 +1035,26 @@ static void declare_input_fs(
LLVMValueRef interp_param = NULL;
int interp_param_idx;
+ /* Get colors from input VGPRs (set by the prolog). */
+ if (!ctx->is_monolithic &&
+ decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
+ unsigned i = decl->Semantic.Index;
+ unsigned colors_read = shader->selector->info.colors_read;
+ unsigned mask = colors_read >> (i * 4);
+ unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
+ (i ? util_bitcount(colors_read & 0xf) : 0);
+
+ radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
+ mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
+ radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
+ mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
+ radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
+ mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
+ radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
+ mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
+ return;
+ }
+
interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
decl->Interp.Location);
if (interp_param_idx == -1)
@@ -1330,12 +1381,12 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
const union si_shader_key *key = &ctx->shader->key;
- unsigned col_formats = key->ps.spi_shader_col_format;
+ unsigned col_formats = key->ps.epilog.spi_shader_col_format;
int cbuf = target - V_008DFC_SQ_EXP_MRT;
assert(cbuf >= 0 && cbuf < 8);
spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
- is_int8 = (key->ps.color_is_int8 >> cbuf) & 0x1;
+ is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
}
args[4] = uint->zero; /* COMPR flag */
@@ -1488,13 +1539,13 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
struct si_shader_context *ctx = si_shader_context(bld_base);
struct gallivm_state *gallivm = bld_base->base.gallivm;
- if (ctx->shader->key.ps.alpha_func != PIPE_FUNC_NEVER) {
+ if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
SI_PARAM_ALPHA_REF);
LLVMValueRef alpha_pass =
lp_build_cmp(&bld_base->base,
- ctx->shader->key.ps.alpha_func,
+ ctx->shader->key.ps.epilog.alpha_func,
alpha, alpha_ref);
LLVMValueRef arg =
lp_build_select(&bld_base->base,
@@ -1511,7 +1562,8 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
}
static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
- LLVMValueRef alpha)
+ LLVMValueRef alpha,
+ unsigned samplemask_param)
{
struct si_shader_context *ctx = si_shader_context(bld_base);
struct gallivm_state *gallivm = bld_base->base.gallivm;
@@ -1519,7 +1571,7 @@ static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *
/* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
- SI_PARAM_SAMPLE_COVERAGE);
+ samplemask_param);
coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
@@ -1841,7 +1893,8 @@ handle_semantic:
case TGSI_SEMANTIC_COLOR:
case TGSI_SEMANTIC_BCOLOR:
target = V_008DFC_SQ_EXP_PARAM + param_count;
- shader->vs_output_param_offset[i] = param_count;
+ assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
+ shader->info.vs_output_param_offset[i] = param_count;
param_count++;
break;
case TGSI_SEMANTIC_CLIPDIST:
@@ -1855,7 +1908,8 @@ handle_semantic:
case TGSI_SEMANTIC_TEXCOORD:
case TGSI_SEMANTIC_GENERIC:
target = V_008DFC_SQ_EXP_PARAM + param_count;
- shader->vs_output_param_offset[i] = param_count;
+ assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
+ shader->info.vs_output_param_offset[i] = param_count;
param_count++;
break;
default:
@@ -1883,7 +1937,7 @@ handle_semantic:
}
}
- shader->nr_param_exports = param_count;
+ shader->info.nr_param_exports = param_count;
/* We need to add the position output manually if it's missing. */
if (!pos_args[0][0]) {
@@ -1945,7 +1999,7 @@ handle_semantic:
for (i = 0; i < 4; i++)
if (pos_args[i][0])
- shader->nr_pos_exports++;
+ shader->info.nr_pos_exports++;
pos_idx = 0;
for (i = 0; i < 4; i++) {
@@ -1955,7 +2009,7 @@ handle_semantic:
/* Specify the target we are exporting */
pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);
- if (pos_idx == shader->nr_pos_exports)
+ if (pos_idx == shader->info.nr_pos_exports)
/* Specify that this is the last export */
pos_args[i][2] = uint->one;
@@ -1989,7 +2043,7 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
invocation_id, bld_base->uint_bld.zero, ""));
/* Determine the layout of one tess factor element in the buffer. */
- switch (shader->key.tcs.prim_mode) {
+ switch (shader->key.tcs.epilog.prim_mode) {
case PIPE_PRIM_LINES:
stride = 2; /* 2 dwords, 1 vec2 store */
outer_comps = 2;
@@ -2061,14 +2115,51 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
struct si_shader_context *ctx = si_shader_context(bld_base);
- LLVMValueRef invocation_id;
+ LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
+ rel_patch_id = get_rel_patch_id(ctx);
invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
+ tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
- si_write_tess_factors(bld_base,
- get_rel_patch_id(ctx),
- invocation_id,
- get_tcs_out_current_patch_data_offset(ctx));
+ if (!ctx->is_monolithic) {
+ /* Return epilog parameters from this function. */
+ LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+ LLVMValueRef ret = ctx->return_value;
+ LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
+ unsigned vgpr;
+
+ /* RW_BUFFERS pointer */
+ rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
+ SI_PARAM_RW_BUFFERS);
+ rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
+ rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
+ rw0 = LLVMBuildExtractElement(builder, rw_buffers,
+ bld_base->uint_bld.zero, "");
+ rw1 = LLVMBuildExtractElement(builder, rw_buffers,
+ bld_base->uint_bld.one, "");
+ ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
+ ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
+
+ /* Tess factor buffer soffset is after user SGPRs. */
+ tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
+ SI_PARAM_TESS_FACTOR_OFFSET);
+ ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
+ SI_TCS_NUM_USER_SGPR, "");
+
+ /* VGPRs */
+ rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
+ invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
+ tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
+
+ vgpr = SI_TCS_NUM_USER_SGPR + 1;
+ ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
+ ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
+ ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
+ ctx->return_value = ret;
+ return;
+ }
+
+ si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
}
static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
@@ -2214,16 +2305,26 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
"");
}
- /* Export PrimitiveID when PS needs it. */
- if (si_vs_exports_prim_id(ctx->shader)) {
- outputs[i].name = TGSI_SEMANTIC_PRIMID;
- outputs[i].sid = 0;
- outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
- get_primitive_id(bld_base, 0));
- outputs[i].values[1] = bld_base->base.undef;
- outputs[i].values[2] = bld_base->base.undef;
- outputs[i].values[3] = bld_base->base.undef;
- i++;
+ if (ctx->is_monolithic) {
+ /* Export PrimitiveID when PS needs it. */
+ if (si_vs_exports_prim_id(ctx->shader)) {
+ outputs[i].name = TGSI_SEMANTIC_PRIMID;
+ outputs[i].sid = 0;
+ outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+ get_primitive_id(bld_base, 0));
+ outputs[i].values[1] = bld_base->base.undef;
+ outputs[i].values[2] = bld_base->base.undef;
+ outputs[i].values[3] = bld_base->base.undef;
+ i++;
+ }
+ } else {
+ /* Return the primitive ID from the LLVM function. */
+ ctx->return_value =
+ LLVMBuildInsertValue(gallivm->builder,
+ ctx->return_value,
+ bitcast(bld_base, TGSI_TYPE_FLOAT,
+ get_primitive_id(bld_base, 0)),
+ VS_EPILOG_PRIMID_LOC, "");
}
si_llvm_export_vs(bld_base, outputs, i);
@@ -2284,6 +2385,7 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
LLVMValueRef *color, unsigned index,
+ unsigned samplemask_param,
bool is_last)
{
struct si_shader_context *ctx = si_shader_context(bld_base);
@@ -2291,30 +2393,31 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
int i;
/* Clamp color */
- if (ctx->shader->key.ps.clamp_color)
+ if (ctx->shader->key.ps.epilog.clamp_color)
for (i = 0; i < 4; i++)
color[i] = radeon_llvm_saturate(bld_base, color[i]);
/* Alpha to one */
- if (ctx->shader->key.ps.alpha_to_one)
+ if (ctx->shader->key.ps.epilog.alpha_to_one)
color[3] = base->one;
/* Alpha test */
if (index == 0 &&
- ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
+ ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
si_alpha_test(bld_base, color[3]);
/* Line & polygon smoothing */
- if (ctx->shader->key.ps.poly_line_smoothing)
- color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]);
+ if (ctx->shader->key.ps.epilog.poly_line_smoothing)
+ color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
+ samplemask_param);
/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
- if (ctx->shader->key.ps.last_cbuf > 0) {
+ if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
LLVMValueRef args[8][9];
int c, last = -1;
/* Get the export arguments, also find out what the last one is. */
- for (c = 0; c <= ctx->shader->key.ps.last_cbuf; c++) {
+ for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
si_llvm_init_export_args(bld_base, color,
V_008DFC_SQ_EXP_MRT + c, args[c]);
if (args[c][0] != bld_base->uint_bld.zero)
@@ -2322,7 +2425,7 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
}
/* Emit all exports. */
- for (c = 0; c <= ctx->shader->key.ps.last_cbuf; c++) {
+ for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
if (is_last && last == c) {
args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
args[c][2] = bld_base->uint_bld.one; /* DONE bit */
@@ -2385,11 +2488,11 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
* Otherwise, find the last color export.
*/
if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
- unsigned spi_format = shader->key.ps.spi_shader_col_format;
+ unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;
/* Don't export NULL and return if alpha-test is enabled. */
- if (shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS &&
- shader->key.ps.alpha_func != PIPE_FUNC_NEVER &&
+ if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
+ shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
(spi_format & 0xf) == 0)
spi_format |= V_028714_SPI_SHADER_32_AR;
@@ -2400,10 +2503,10 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
continue;
/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
- if (shader->key.ps.last_cbuf > 0) {
+ if (shader->key.ps.epilog.last_cbuf > 0) {
/* Just set this if any of the colorbuffers are enabled. */
if (spi_format &
- ((1llu << (4 * (shader->key.ps.last_cbuf + 1))) - 1))
+ ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
last_color_export = i;
continue;
}
@@ -2445,6 +2548,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
ctx->radeon_bld.soa.outputs[i][j], "");
si_export_mrt_color(bld_base, color, semantic_index,
+ SI_PARAM_SAMPLE_COVERAGE,
last_color_export == i);
break;
default:
@@ -2458,6 +2562,100 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
si_export_mrt_z(bld_base, depth, stencil, samplemask);
}
+/**
+ * Return PS outputs in this order:
+ *
+ * v[0:3] = color0.xyzw
+ * v[4:7] = color1.xyzw
+ * ...
+ * vN+0 = Depth
+ * vN+1 = Stencil
+ * vN+2 = SampleMask
+ * vN+3 = SampleMaskIn (used for OpenGL smoothing)
+ *
+ * The alpha-ref SGPR is returned via its original location.
+ */
+static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
+{
+ struct si_shader_context *ctx = si_shader_context(bld_base);
+ struct si_shader *shader = ctx->shader;
+ struct lp_build_context *base = &bld_base->base;
+ struct tgsi_shader_info *info = &shader->selector->info;
+ LLVMBuilderRef builder = base->gallivm->builder;
+ unsigned i, j, first_vgpr, vgpr;
+
+ LLVMValueRef color[8][4] = {};
+ LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+ LLVMValueRef ret;
+
+ /* Read the output values. */
+ for (i = 0; i < info->num_outputs; i++) {
+ unsigned semantic_name = info->output_semantic_name[i];
+ unsigned semantic_index = info->output_semantic_index[i];
+
+ switch (semantic_name) {
+ case TGSI_SEMANTIC_COLOR:
+ assert(semantic_index < 8);
+ for (j = 0; j < 4; j++) {
+ LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
+ LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
+ color[semantic_index][j] = result;
+ }
+ break;
+ case TGSI_SEMANTIC_POSITION:
+ depth = LLVMBuildLoad(builder,
+ ctx->radeon_bld.soa.outputs[i][2], "");
+ break;
+ case TGSI_SEMANTIC_STENCIL:
+ stencil = LLVMBuildLoad(builder,
+ ctx->radeon_bld.soa.outputs[i][1], "");
+ break;
+ case TGSI_SEMANTIC_SAMPLEMASK:
+ samplemask = LLVMBuildLoad(builder,
+ ctx->radeon_bld.soa.outputs[i][0], "");
+ break;
+ default:
+ fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
+ semantic_name);
+ }
+ }
+
+ /* Fill the return structure. */
+ ret = ctx->return_value;
+
+ /* Set SGPRs. */
+ ret = LLVMBuildInsertValue(builder, ret,
+ bitcast(bld_base, TGSI_TYPE_SIGNED,
+ LLVMGetParam(ctx->radeon_bld.main_fn,
+ SI_PARAM_ALPHA_REF)),
+ SI_SGPR_ALPHA_REF, "");
+
+ /* Set VGPRs */
+ first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
+ for (i = 0; i < ARRAY_SIZE(color); i++) {
+ if (!color[i][0])
+ continue;
+
+ for (j = 0; j < 4; j++)
+ ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
+ }
+ if (depth)
+ ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
+ if (stencil)
+ ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
+ if (samplemask)
+ ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
+
+ /* Add the input sample mask for smoothing at the end. */
+ if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
+ vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
+ ret = LLVMBuildInsertValue(builder, ret,
+ LLVMGetParam(ctx->radeon_bld.main_fn,
+ SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
+
+ ctx->return_value = ret;
+}
+
static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
struct lp_build_tgsi_context *bld_base,
struct lp_build_emit_data *emit_data);
@@ -2536,13 +2734,12 @@ static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
/**
* Load an image view, fmask view. or sampler state descriptor.
*/
-static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
- LLVMValueRef index, enum desc_type type)
+static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
+ LLVMValueRef list, LLVMValueRef index,
+ enum desc_type type)
{
struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
- SI_PARAM_SAMPLERS);
switch (type) {
case DESC_IMAGE:
@@ -2558,12 +2755,21 @@ static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
/* The sampler state is at [12:15]. */
index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
- ptr = LLVMBuildPointerCast(builder, ptr,
- const_array(ctx->v4i32, 0), "");
+ list = LLVMBuildPointerCast(builder, list,
+ const_array(ctx->v4i32, 0), "");
break;
}
- return build_indexed_load_const(ctx, ptr, index);
+ return build_indexed_load_const(ctx, list, index);
+}
+
+static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
+ LLVMValueRef index, enum desc_type type)
+{
+ LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
+ SI_PARAM_SAMPLERS);
+
+ return get_sampler_desc_custom(ctx, list, index, type);
}
static void tex_fetch_ptrs(
@@ -3546,6 +3752,30 @@ static const struct lp_build_tgsi_action interp_action = {
.emit = build_interp_intrinsic,
};
+static void si_create_function(struct si_shader_context *ctx,
+ LLVMTypeRef *returns, unsigned num_returns,
+ LLVMTypeRef *params, unsigned num_params,
+ int last_array_pointer, int last_sgpr)
+{
+ int i;
+
+ radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
+ params, num_params);
+ radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
+ ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);
+
+ for (i = 0; i <= last_sgpr; ++i) {
+ LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
+
+ /* We tell llvm that array inputs are passed by value to allow Sinking pass
+ * to move load. Inputs are constant so this is fine. */
+ if (i <= last_array_pointer)
+ LLVMAddAttribute(P, LLVMByValAttribute);
+ else
+ LLVMAddAttribute(P, LLVMInRegAttribute);
+ }
+}
+
static void create_meta_data(struct si_shader_context *ctx)
{
struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
@@ -3579,15 +3809,57 @@ static void declare_streamout_params(struct si_shader_context *ctx,
}
}
+static unsigned llvm_get_type_size(LLVMTypeRef type)
+{
+ LLVMTypeKind kind = LLVMGetTypeKind(type);
+
+ switch (kind) {
+ case LLVMIntegerTypeKind:
+ return LLVMGetIntTypeWidth(type) / 8;
+ case LLVMFloatTypeKind:
+ return 4;
+ case LLVMPointerTypeKind:
+ return 8;
+ case LLVMVectorTypeKind:
+ return LLVMGetVectorSize(type) *
+ llvm_get_type_size(LLVMGetElementType(type));
+ default:
+ assert(0);
+ return 0;
+ }
+}
+
+static void declare_tess_lds(struct si_shader_context *ctx)
+{
+ struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+ LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
+
+ /* This is the upper bound, maximum is 32 inputs times 32 vertices */
+ unsigned vertex_data_dw_size = 32*32*4;
+ unsigned patch_data_dw_size = 32*4;
+ /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
+ unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size;
+ unsigned lds_dwords = patch_dw_size;
+
+ /* The actual size is computed outside of the shader to reduce
+ * the number of shader variants. */
+ ctx->lds =
+ LLVMAddGlobalInAddressSpace(gallivm->module,
+ LLVMArrayType(i32, lds_dwords),
+ "tess_lds",
+ LOCAL_ADDR_SPACE);
+}
+
static void create_function(struct si_shader_context *ctx)
{
struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
struct gallivm_state *gallivm = bld_base->base.gallivm;
struct si_shader *shader = ctx->shader;
- LLVMTypeRef params[SI_NUM_PARAMS], v2i32, v3i32;
- unsigned i, last_array_pointer, last_sgpr, num_params;
+ LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
+ LLVMTypeRef returns[16+32*4];
+ unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs;
+ unsigned num_returns = 0;
- v2i32 = LLVMVectorType(ctx->i32, 2);
v3i32 = LLVMVectorType(ctx->i32, 3);
params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
@@ -3630,6 +3902,20 @@ static void create_function(struct si_shader_context *ctx)
params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
params[ctx->param_instance_id = num_params++] = ctx->i32;
+
+ if (!ctx->is_monolithic &&
+ !ctx->is_gs_copy_shader) {
+ /* Vertex load indices. */
+ ctx->param_vertex_index0 = num_params;
+
+ for (i = 0; i < shader->selector->info.num_inputs; i++)
+ params[num_params++] = ctx->i32;
+
+ /* PrimitiveID output. */
+ if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
+ for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
+ returns[num_returns++] = ctx->f32;
+ }
break;
case TGSI_PROCESSOR_TESS_CTRL:
@@ -3643,6 +3929,15 @@ static void create_function(struct si_shader_context *ctx)
params[SI_PARAM_PATCH_ID] = ctx->i32;
params[SI_PARAM_REL_IDS] = ctx->i32;
num_params = SI_PARAM_REL_IDS+1;
+
+ if (!ctx->is_monolithic) {
+ /* PARAM_TESS_FACTOR_OFFSET is after user SGPRs. */
+ for (i = 0; i <= SI_TCS_NUM_USER_SGPR; i++)
+ returns[num_returns++] = ctx->i32; /* SGPRs */
+
+ for (i = 0; i < 3; i++)
+ returns[num_returns++] = ctx->f32; /* VGPRs */
+ }
break;
case TGSI_PROCESSOR_TESS_EVAL:
@@ -3663,6 +3958,11 @@ static void create_function(struct si_shader_context *ctx)
params[ctx->param_tes_v = num_params++] = ctx->f32;
params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
+
+ /* PrimitiveID output. */
+ if (!ctx->is_monolithic && !shader->key.tes.as_es)
+ for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
+ returns[num_returns++] = ctx->f32;
break;
case TGSI_PROCESSOR_GEOMETRY:
@@ -3686,13 +3986,13 @@ static void create_function(struct si_shader_context *ctx)
params[SI_PARAM_ALPHA_REF] = ctx->f32;
params[SI_PARAM_PRIM_MASK] = ctx->i32;
last_sgpr = SI_PARAM_PRIM_MASK;
- params[SI_PARAM_PERSP_SAMPLE] = v2i32;
- params[SI_PARAM_PERSP_CENTER] = v2i32;
- params[SI_PARAM_PERSP_CENTROID] = v2i32;
+ params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
+ params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
+ params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
- params[SI_PARAM_LINEAR_SAMPLE] = v2i32;
- params[SI_PARAM_LINEAR_CENTER] = v2i32;
- params[SI_PARAM_LINEAR_CENTROID] = v2i32;
+ params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
+ params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
+ params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
@@ -3701,8 +4001,39 @@ static void create_function(struct si_shader_context *ctx)
params[SI_PARAM_FRONT_FACE] = ctx->i32;
params[SI_PARAM_ANCILLARY] = ctx->i32;
params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
- params[SI_PARAM_POS_FIXED_PT] = ctx->f32;
+ params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
num_params = SI_PARAM_POS_FIXED_PT+1;
+
+ if (!ctx->is_monolithic) {
+ /* Color inputs from the prolog. */
+ if (shader->selector->info.colors_read) {
+ unsigned num_color_elements =
+ util_bitcount(shader->selector->info.colors_read);
+
+ assert(num_params + num_color_elements <= ARRAY_SIZE(params));
+ for (i = 0; i < num_color_elements; i++)
+ params[num_params++] = ctx->f32;
+ }
+
+ /* Outputs for the epilog. */
+ num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
+ num_returns =
+ num_return_sgprs +
+ util_bitcount(shader->selector->info.colors_written) * 4 +
+ shader->selector->info.writes_z +
+ shader->selector->info.writes_stencil +
+ shader->selector->info.writes_samplemask +
+ 1 /* SampleMaskIn */;
+
+ num_returns = MAX2(num_returns,
+ num_return_sgprs +
+ PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+ for (i = 0; i < num_return_sgprs; i++)
+ returns[i] = ctx->i32;
+ for (; i < num_returns; i++)
+ returns[i] = ctx->f32;
+ }
break;
default:
@@ -3711,20 +4042,38 @@ static void create_function(struct si_shader_context *ctx)
}
assert(num_params <= Elements(params));
- radeon_llvm_create_func(&ctx->radeon_bld, params, num_params);
- radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
-
- for (i = 0; i <= last_sgpr; ++i) {
- LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
- /* We tell llvm that array inputs are passed by value to allow Sinking pass
- * to move load. Inputs are constant so this is fine. */
- if (i <= last_array_pointer)
- LLVMAddAttribute(P, LLVMByValAttribute);
- else
- LLVMAddAttribute(P, LLVMInRegAttribute);
+ si_create_function(ctx, returns, num_returns, params,
+ num_params, last_array_pointer, last_sgpr);
+
+ /* Reserve register locations for VGPR inputs the PS prolog may need. */
+ if (ctx->type == TGSI_PROCESSOR_FRAGMENT &&
+ !ctx->is_monolithic) {
+ radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
+ "InitialPSInputAddr",
+ S_0286D0_PERSP_SAMPLE_ENA(1) |
+ S_0286D0_PERSP_CENTER_ENA(1) |
+ S_0286D0_PERSP_CENTROID_ENA(1) |
+ S_0286D0_LINEAR_SAMPLE_ENA(1) |
+ S_0286D0_LINEAR_CENTER_ENA(1) |
+ S_0286D0_LINEAR_CENTROID_ENA(1) |
+ S_0286D0_FRONT_FACE_ENA(1) |
+ S_0286D0_POS_FIXED_PT_ENA(1));
}
+ shader->info.num_input_sgprs = 0;
+ shader->info.num_input_vgprs = 0;
+
+ for (i = 0; i <= last_sgpr; ++i)
+ shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
+
+ /* Unused fragment shader inputs are eliminated by the compiler,
+ * so we don't know yet how many there will be.
+ */
+ if (ctx->type != TGSI_PROCESSOR_FRAGMENT)
+ for (; i < num_params; ++i)
+ shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
+
if (bld_base->info &&
(bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
@@ -3740,22 +4089,8 @@ static void create_function(struct si_shader_context *ctx)
if ((ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) ||
ctx->type == TGSI_PROCESSOR_TESS_CTRL ||
- ctx->type == TGSI_PROCESSOR_TESS_EVAL) {
- /* This is the upper bound, maximum is 32 inputs times 32 vertices */
- unsigned vertex_data_dw_size = 32*32*4;
- unsigned patch_data_dw_size = 32*4;
- /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
- unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size;
- unsigned lds_dwords = patch_dw_size;
-
- /* The actual size is computed outside of the shader to reduce
- * the number of shader variants. */
- ctx->lds =
- LLVMAddGlobalInAddressSpace(gallivm->module,
- LLVMArrayType(ctx->i32, lds_dwords),
- "tess_lds",
- LOCAL_ADDR_SPACE);
- }
+ ctx->type == TGSI_PROCESSOR_TESS_EVAL)
+ declare_tess_lds(ctx);
}
static void preload_constants(struct si_shader_context *ctx)
@@ -3887,6 +4222,49 @@ static void preload_ring_buffers(struct si_shader_context *ctx)
}
}
+static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
+ LLVMValueRef param_sampler_views,
+ unsigned param_pos_fixed_pt)
+{
+ struct lp_build_tgsi_context *bld_base =
+ &ctx->radeon_bld.soa.bld_base;
+ struct gallivm_state *gallivm = bld_base->base.gallivm;
+ struct lp_build_emit_data result = {};
+ struct tgsi_full_instruction inst = {};
+ LLVMValueRef desc, sampler_index, address[2], pix;
+
+ /* Use the fixed-point gl_FragCoord input.
+ * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
+ * per coordinate to get the repeating effect.
+ */
+ address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
+ address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
+
+ /* Load the sampler view descriptor. */
+ sampler_index = lp_build_const_int32(gallivm, SI_POLY_STIPPLE_SAMPLER);
+ desc = get_sampler_desc_custom(ctx, param_sampler_views,
+ sampler_index, DESC_IMAGE);
+
+ /* Load the texel. */
+ inst.Instruction.Opcode = TGSI_OPCODE_TXF;
+ inst.Texture.Texture = TGSI_TEXTURE_2D_MSAA; /* = use load, not load_mip */
+ result.inst = &inst;
+ set_tex_fetch_args(ctx, &result, TGSI_OPCODE_TXF,
+ inst.Texture.Texture,
+ desc, NULL, address, ARRAY_SIZE(address), 0xf);
+ build_tex_intrinsic(&tex_action, bld_base, &result);
+
+ /* Kill the thread accordingly. */
+ pix = LLVMBuildExtractElement(gallivm->builder, result.output[0],
+ lp_build_const_int32(gallivm, 3), "");
+ pix = bitcast(bld_base, TGSI_TYPE_FLOAT, pix);
+ pix = LLVMBuildFNeg(gallivm->builder, pix, "");
+
+ lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
+ LLVMVoidTypeInContext(gallivm->context),
+ &pix, 1, 0);
+}
+
void si_shader_binary_read_config(struct radeon_shader_binary *binary,
struct si_shader_config *conf,
unsigned symbol_offset)
@@ -3972,41 +4350,70 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx,
}
}
+static unsigned si_get_shader_binary_size(struct si_shader *shader)
+{
+ unsigned size = shader->binary.code_size;
+
+ if (shader->prolog)
+ size += shader->prolog->binary.code_size;
+ if (shader->epilog)
+ size += shader->epilog->binary.code_size;
+ return size;
+}
+
int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
{
- const struct radeon_shader_binary *binary = &shader->binary;
- unsigned code_size = binary->code_size + binary->rodata_size;
+ const struct radeon_shader_binary *prolog =
+ shader->prolog ? &shader->prolog->binary : NULL;
+ const struct radeon_shader_binary *epilog =
+ shader->epilog ? &shader->epilog->binary : NULL;
+ const struct radeon_shader_binary *mainb = &shader->binary;
+ unsigned bo_size = si_get_shader_binary_size(shader) +
+ (!epilog ? mainb->rodata_size : 0);
unsigned char *ptr;
+ assert(!prolog || !prolog->rodata_size);
+ assert((!prolog && !epilog) || !mainb->rodata_size);
+ assert(!epilog || !epilog->rodata_size);
+
r600_resource_reference(&shader->bo, NULL);
shader->bo = si_resource_create_custom(&sscreen->b.b,
PIPE_USAGE_IMMUTABLE,
- code_size);
+ bo_size);
if (!shader->bo)
return -ENOMEM;
+ /* Upload. */
ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
PIPE_TRANSFER_READ_WRITE);
- util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size);
- if (binary->rodata_size > 0) {
- ptr += binary->code_size;
- util_memcpy_cpu_to_le32(ptr, binary->rodata,
- binary->rodata_size);
+
+ if (prolog) {
+ util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
+ ptr += prolog->code_size;
}
+ util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
+ ptr += mainb->code_size;
+
+ if (epilog)
+ util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
+ else if (mainb->rodata_size > 0)
+ util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
+
sscreen->b.ws->buffer_unmap(shader->bo->buf);
return 0;
}
static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
- struct pipe_debug_callback *debug)
+ struct pipe_debug_callback *debug,
+ const char *name)
{
char *line, *p;
unsigned i, count;
if (binary->disasm_string) {
- fprintf(stderr, "\nShader Disassembly:\n\n");
- fprintf(stderr, "%s\n", binary->disasm_string);
+ fprintf(stderr, "Shader %s disassembly:\n", name);
+ fprintf(stderr, "%s", binary->disasm_string);
if (debug && debug->debug_message) {
/* Very long debug messages are cut off, so send the
@@ -4036,7 +4443,7 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
"Shader Disassembly End");
}
} else {
- fprintf(stderr, "SI CODE:\n");
+ fprintf(stderr, "Shader %s binary:\n", name);
for (i = 0; i < binary->code_size; i += 4) {
fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i,
binary->code[i + 3], binary->code[i + 2],
@@ -4115,16 +4522,60 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
max_simd_waves);
}
+static const char *si_get_shader_name(struct si_shader *shader,
+ unsigned processor)
+{
+ switch (processor) {
+ case TGSI_PROCESSOR_VERTEX:
+ if (shader->key.vs.as_es)
+ return "Vertex Shader as ES";
+ else if (shader->key.vs.as_ls)
+ return "Vertex Shader as LS";
+ else
+ return "Vertex Shader as VS";
+ case TGSI_PROCESSOR_TESS_CTRL:
+ return "Tessellation Control Shader";
+ case TGSI_PROCESSOR_TESS_EVAL:
+ if (shader->key.tes.as_es)
+ return "Tessellation Evaluation Shader as ES";
+ else
+ return "Tessellation Evaluation Shader as VS";
+ case TGSI_PROCESSOR_GEOMETRY:
+ if (shader->gs_copy_shader == NULL)
+ return "GS Copy Shader as VS";
+ else
+ return "Geometry Shader";
+ case TGSI_PROCESSOR_FRAGMENT:
+ return "Pixel Shader";
+ case TGSI_PROCESSOR_COMPUTE:
+ return "Compute Shader";
+ default:
+ return "Unknown Shader";
+ }
+}
+
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
struct pipe_debug_callback *debug, unsigned processor)
{
- if (r600_can_dump_shader(&sscreen->b, processor))
- if (!(sscreen->b.debug_flags & DBG_NO_ASM))
- si_shader_dump_disassembly(&shader->binary, debug);
+ if (r600_can_dump_shader(&sscreen->b, processor) &&
+ !(sscreen->b.debug_flags & DBG_NO_ASM)) {
+ fprintf(stderr, "\n%s:\n", si_get_shader_name(shader, processor));
+
+ if (shader->prolog)
+ si_shader_dump_disassembly(&shader->prolog->binary,
+ debug, "prolog");
+
+ si_shader_dump_disassembly(&shader->binary, debug, "main");
+
+ if (shader->epilog)
+ si_shader_dump_disassembly(&shader->epilog->binary,
+ debug, "epilog");
+ fprintf(stderr, "\n");
+ }
si_shader_dump_stats(sscreen, &shader->config,
shader->selector ? shader->selector->info.num_inputs : 0,
- shader->binary.code_size, debug, processor);
+ si_get_shader_binary_size(shader), debug, processor);
}
int si_compile_llvm(struct si_screen *sscreen,
@@ -4177,6 +4628,19 @@ int si_compile_llvm(struct si_screen *sscreen,
FREE(binary->global_symbol_offsets);
binary->config = NULL;
binary->global_symbol_offsets = NULL;
+
+ /* Some shaders can't have rodata because their binaries can be
+ * concatenated.
+ */
+ if (binary->rodata_size &&
+ (processor == TGSI_PROCESSOR_VERTEX ||
+ processor == TGSI_PROCESSOR_TESS_CTRL ||
+ processor == TGSI_PROCESSOR_TESS_EVAL ||
+ processor == TGSI_PROCESSOR_FRAGMENT)) {
+ fprintf(stderr, "radeonsi: The shader can't have rodata.");
+ return -EINVAL;
+ }
+
return r;
}
@@ -4196,7 +4660,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
- si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm, gsinfo);
+ si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
ctx->type = TGSI_PROCESSOR_VERTEX;
ctx->is_gs_copy_shader = true;
@@ -4241,7 +4705,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
- LLVMBuildRetVoid(bld_base->base.gallivm->builder);
+ LLVMBuildRet(gallivm->builder, ctx->return_value);
/* Dump LLVM IR before any optimization passes */
if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
@@ -4278,35 +4742,38 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
switch (shader) {
case PIPE_SHADER_VERTEX:
fprintf(f, " instance_divisors = {");
- for (i = 0; i < Elements(key->vs.instance_divisors); i++)
+ for (i = 0; i < Elements(key->vs.prolog.instance_divisors); i++)
fprintf(f, !i ? "%u" : ", %u",
- key->vs.instance_divisors[i]);
+ key->vs.prolog.instance_divisors[i]);
fprintf(f, "}\n");
fprintf(f, " as_es = %u\n", key->vs.as_es);
fprintf(f, " as_ls = %u\n", key->vs.as_ls);
- fprintf(f, " export_prim_id = %u\n", key->vs.export_prim_id);
+ fprintf(f, " export_prim_id = %u\n", key->vs.epilog.export_prim_id);
break;
case PIPE_SHADER_TESS_CTRL:
- fprintf(f, " prim_mode = %u\n", key->tcs.prim_mode);
+ fprintf(f, " prim_mode = %u\n", key->tcs.epilog.prim_mode);
break;
case PIPE_SHADER_TESS_EVAL:
fprintf(f, " as_es = %u\n", key->tes.as_es);
- fprintf(f, " export_prim_id = %u\n", key->tes.export_prim_id);
+ fprintf(f, " export_prim_id = %u\n", key->tes.epilog.export_prim_id);
break;
case PIPE_SHADER_GEOMETRY:
break;
case PIPE_SHADER_FRAGMENT:
- fprintf(f, " spi_shader_col_format = 0x%x\n", key->ps.spi_shader_col_format);
- fprintf(f, " last_cbuf = %u\n", key->ps.last_cbuf);
- fprintf(f, " color_two_side = %u\n", key->ps.color_two_side);
- fprintf(f, " alpha_func = %u\n", key->ps.alpha_func);
- fprintf(f, " alpha_to_one = %u\n", key->ps.alpha_to_one);
- fprintf(f, " poly_stipple = %u\n", key->ps.poly_stipple);
- fprintf(f, " clamp_color = %u\n", key->ps.clamp_color);
+ fprintf(f, " prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
+ fprintf(f, " prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
+ fprintf(f, " prolog.force_persample_interp = %u\n", key->ps.prolog.force_persample_interp);
+ fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
+ fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
+ fprintf(f, " epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
+ fprintf(f, " epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
+ fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
+ fprintf(f, " epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
+ fprintf(f, " epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
break;
default:
@@ -4317,13 +4784,12 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
static void si_init_shader_ctx(struct si_shader_context *ctx,
struct si_screen *sscreen,
struct si_shader *shader,
- LLVMTargetMachineRef tm,
- struct tgsi_shader_info *info)
+ LLVMTargetMachineRef tm)
{
struct lp_build_tgsi_context *bld_base;
memset(ctx, 0, sizeof(*ctx));
- radeon_llvm_context_init(&ctx->radeon_bld);
+ radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
ctx->tm = tm;
ctx->screen = sscreen;
if (shader && shader->selector)
@@ -4336,15 +4802,18 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
- ctx->i128 = LLVMInt128TypeInContext(ctx->radeon_bld.gallivm.context);
+ ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
+ ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
+ ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
bld_base = &ctx->radeon_bld.soa.bld_base;
- bld_base->info = info;
+ if (shader && shader->selector)
+ bld_base->info = &shader->selector->info;
bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
@@ -4380,40 +4849,31 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
}
-int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
- struct si_shader *shader,
- struct pipe_debug_callback *debug)
+int si_compile_tgsi_shader(struct si_screen *sscreen,
+ LLVMTargetMachineRef tm,
+ struct si_shader *shader,
+ bool is_monolithic,
+ struct pipe_debug_callback *debug)
{
struct si_shader_selector *sel = shader->selector;
- struct tgsi_token *tokens = sel->tokens;
struct si_shader_context ctx;
struct lp_build_tgsi_context *bld_base;
- struct tgsi_shader_info stipple_shader_info;
LLVMModuleRef mod;
int r = 0;
- bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT &&
- shader->key.ps.poly_stipple;
-
- if (poly_stipple) {
- tokens = util_pstipple_create_fragment_shader(tokens, NULL,
- SI_POLY_STIPPLE_SAMPLER,
- TGSI_FILE_SYSTEM_VALUE);
- tgsi_scan_shader(tokens, &stipple_shader_info);
- }
/* Dump TGSI code before doing TGSI->LLVM conversion in case the
* conversion fails. */
if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
!(sscreen->b.debug_flags & DBG_NO_TGSI)) {
si_dump_shader_key(sel->type, &shader->key, stderr);
- tgsi_dump(tokens, 0);
+ tgsi_dump(sel->tokens, 0);
si_dump_streamout(&sel->so);
}
- si_init_shader_ctx(&ctx, sscreen, shader, tm,
- poly_stipple ? &stipple_shader_info : &sel->info);
+ si_init_shader_ctx(&ctx, sscreen, shader, tm);
+ ctx.is_monolithic = is_monolithic;
- shader->uses_instanceid = sel->info.uses_instanceid;
+ shader->info.uses_instanceid = sel->info.uses_instanceid;
bld_base = &ctx.radeon_bld.soa.bld_base;
ctx.radeon_bld.load_system_value = declare_system_value;
@@ -4447,7 +4907,10 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
break;
case TGSI_PROCESSOR_FRAGMENT:
ctx.radeon_bld.load_input = declare_input_fs;
- bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
+ if (is_monolithic)
+ bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
+ else
+ bld_base->emit_epilogue = si_llvm_return_fs_outputs;
break;
default:
assert(!"Unsupported shader type");
@@ -4461,6 +4924,14 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
preload_streamout_buffers(&ctx);
preload_ring_buffers(&ctx);
+ if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
+ shader->key.ps.prolog.poly_stipple) {
+ LLVMValueRef views = LLVMGetParam(ctx.radeon_bld.main_fn,
+ SI_PARAM_SAMPLERS);
+ si_llvm_emit_polygon_stipple(&ctx, views,
+ SI_PARAM_POS_FIXED_PT);
+ }
+
if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
int i;
for (i = 0; i < 4; i++) {
@@ -4470,12 +4941,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
}
}
- if (!lp_build_tgsi_llvm(bld_base, tokens)) {
+ if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
goto out;
}
- LLVMBuildRetVoid(bld_base->base.gallivm->builder);
+ LLVMBuildRet(bld_base->base.gallivm->builder, ctx.return_value);
mod = bld_base->base.gallivm->module;
/* Dump LLVM IR before any optimization passes */
@@ -4492,16 +4963,49 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
goto out;
}
- si_shader_dump(sscreen, shader, debug, ctx.type);
+ radeon_llvm_dispose(&ctx.radeon_bld);
- r = si_shader_binary_upload(sscreen, shader);
- if (r) {
- fprintf(stderr, "LLVM failed to upload shader\n");
- goto out;
+ /* Calculate the number of fragment input VGPRs. */
+ if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
+ shader->info.num_input_vgprs = 0;
+ shader->info.face_vgpr_index = -1;
+
+ if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 2;
+ if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 2;
+ if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 2;
+ if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 3;
+ if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 2;
+ if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 2;
+ if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 2;
+ if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 1;
+ if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 1;
+ if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 1;
+ if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 1;
+ if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 1;
+ if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
+ shader->info.face_vgpr_index = shader->info.num_input_vgprs;
+ shader->info.num_input_vgprs += 1;
+ }
+ if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 1;
+ if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 1;
+ if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
+ shader->info.num_input_vgprs += 1;
}
- radeon_llvm_dispose(&ctx.radeon_bld);
-
if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
shader->gs_copy_shader->selector = shader->selector;
@@ -4517,11 +5021,968 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
out:
for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
FREE(ctx.constants[i]);
- if (poly_stipple)
- tgsi_free_tokens(tokens);
return r;
}
+/**
+ * Create, compile and return a shader part (prolog or epilog).
+ *
+ * \param sscreen screen
+ * \param list list of shader parts of the same category
+ * \param key shader part key
+ * \param tm LLVM target machine
+ * \param debug debug callback
+ * \param compile the callback responsible for compilation
+ * \return non-NULL on success
+ */
+static struct si_shader_part *
+si_get_shader_part(struct si_screen *sscreen,
+ struct si_shader_part **list,
+ union si_shader_part_key *key,
+ LLVMTargetMachineRef tm,
+ struct pipe_debug_callback *debug,
+ bool (*compile)(struct si_screen *,
+ LLVMTargetMachineRef,
+ struct pipe_debug_callback *,
+ struct si_shader_part *))
+{
+ struct si_shader_part *result;
+
+ pipe_mutex_lock(sscreen->shader_parts_mutex);
+
+ /* Find existing. */
+ for (result = *list; result; result = result->next) {
+ if (memcmp(&result->key, key, sizeof(*key)) == 0) {
+ pipe_mutex_unlock(sscreen->shader_parts_mutex);
+ return result;
+ }
+ }
+
+ /* Compile a new one. */
+ result = CALLOC_STRUCT(si_shader_part);
+ result->key = *key;
+ if (!compile(sscreen, tm, debug, result)) {
+ FREE(result);
+ pipe_mutex_unlock(sscreen->shader_parts_mutex);
+ return NULL;
+ }
+
+ result->next = *list;
+ *list = result;
+ pipe_mutex_unlock(sscreen->shader_parts_mutex);
+ return result;
+}
+
+/**
+ * Create a vertex shader prolog.
+ *
+ * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
+ * All inputs are returned unmodified. The vertex load indices are
+ * stored after them, which will used by the API VS for fetching inputs.
+ *
+ * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
+ * input_v0,
+ * input_v1,
+ * input_v2,
+ * input_v3,
+ * (VertexID + BaseVertex),
+ * (InstanceID + StartInstance),
+ * (InstanceID / 2 + StartInstance)
+ */
+static bool si_compile_vs_prolog(struct si_screen *sscreen,
+ LLVMTargetMachineRef tm,
+ struct pipe_debug_callback *debug,
+ struct si_shader_part *out)
+{
+ union si_shader_part_key *key = &out->key;
+ struct si_shader shader = {};
+ struct si_shader_context ctx;
+ struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+ LLVMTypeRef *params, *returns;
+ LLVMValueRef ret, func;
+ int last_sgpr, num_params, num_returns, i;
+ bool status = true;
+
+ si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+ ctx.type = TGSI_PROCESSOR_VERTEX;
+ ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
+ ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
+
+ /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
+ params = alloca((key->vs_prolog.num_input_sgprs + 4) *
+ sizeof(LLVMTypeRef));
+ returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
+ key->vs_prolog.last_input + 1) *
+ sizeof(LLVMTypeRef));
+ num_params = 0;
+ num_returns = 0;
+
+ /* Declare input and output SGPRs. */
+ num_params = 0;
+ for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+ params[num_params++] = ctx.i32;
+ returns[num_returns++] = ctx.i32;
+ }
+ last_sgpr = num_params - 1;
+
+ /* 4 preloaded VGPRs (outputs must be floats) */
+ for (i = 0; i < 4; i++) {
+ params[num_params++] = ctx.i32;
+ returns[num_returns++] = ctx.f32;
+ }
+
+ /* Vertex load indices. */
+ for (i = 0; i <= key->vs_prolog.last_input; i++)
+ returns[num_returns++] = ctx.f32;
+
+ /* Create the function. */
+ si_create_function(&ctx, returns, num_returns, params,
+ num_params, -1, last_sgpr);
+ func = ctx.radeon_bld.main_fn;
+
+ /* Copy inputs to outputs. This should be no-op, as the registers match,
+ * but it will prevent the compiler from overwriting them unintentionally.
+ */
+ ret = ctx.return_value;
+ for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+ LLVMValueRef p = LLVMGetParam(func, i);
+ ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+ }
+ for (i = num_params - 4; i < num_params; i++) {
+ LLVMValueRef p = LLVMGetParam(func, i);
+ p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
+ ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+ }
+
+ /* Compute vertex load indices from instance divisors. */
+ for (i = 0; i <= key->vs_prolog.last_input; i++) {
+ unsigned divisor = key->vs_prolog.states.instance_divisors[i];
+ LLVMValueRef index;
+
+ if (divisor) {
+ /* InstanceID / Divisor + StartInstance */
+ index = get_instance_index_for_fetch(&ctx.radeon_bld,
+ SI_SGPR_START_INSTANCE,
+ divisor);
+ } else {
+ /* VertexID + BaseVertex */
+ index = LLVMBuildAdd(gallivm->builder,
+ LLVMGetParam(func, ctx.param_vertex_id),
+ LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
+ }
+
+ index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
+ ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
+ num_params++, "");
+ }
+
+ /* Compile. */
+ LLVMBuildRet(gallivm->builder, ret);
+ radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+ if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+ gallivm->module, debug, ctx.type,
+ "Vertex Shader Prolog"))
+ status = false;
+
+ radeon_llvm_dispose(&ctx.radeon_bld);
+ return status;
+}
+
+/**
+ * Compile the vertex shader epilog. This is also used by the tessellation
+ * evaluation shader compiled as VS.
+ *
+ * The input is PrimitiveID.
+ *
+ * If PrimitiveID is required by the pixel shader, export it.
+ * Otherwise, do nothing.
+ */
+static bool si_compile_vs_epilog(struct si_screen *sscreen,
+ LLVMTargetMachineRef tm,
+ struct pipe_debug_callback *debug,
+ struct si_shader_part *out)
+{
+ union si_shader_part_key *key = &out->key;
+ struct si_shader_context ctx;
+ struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+ struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
+ LLVMTypeRef params[5];
+ int num_params, i;
+ bool status = true;
+
+ si_init_shader_ctx(&ctx, sscreen, NULL, tm);
+ ctx.type = TGSI_PROCESSOR_VERTEX;
+
+ /* Declare input VGPRs. */
+ num_params = key->vs_epilog.states.export_prim_id ?
+ (VS_EPILOG_PRIMID_LOC + 1) : 0;
+ assert(num_params <= ARRAY_SIZE(params));
+
+ for (i = 0; i < num_params; i++)
+ params[i] = ctx.f32;
+
+ /* Create the function. */
+ si_create_function(&ctx, NULL, 0, params, num_params,
+ -1, -1);
+
+ /* Emit exports. */
+ if (key->vs_epilog.states.export_prim_id) {
+ struct lp_build_context *base = &bld_base->base;
+ struct lp_build_context *uint = &bld_base->uint_bld;
+ LLVMValueRef args[9];
+
+ args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
+ args[1] = uint->zero; /* whether the EXEC mask is valid */
+ args[2] = uint->zero; /* DONE bit */
+ args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
+ key->vs_epilog.prim_id_param_offset);
+ args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
+ args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
+ VS_EPILOG_PRIMID_LOC); /* X */
+ args[6] = uint->undef; /* Y */
+ args[7] = uint->undef; /* Z */
+ args[8] = uint->undef; /* W */
+
+ lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+ LLVMVoidTypeInContext(base->gallivm->context),
+ args, 9, 0);
+ }
+
+ /* Compile. */
+ LLVMBuildRet(gallivm->builder, ctx.return_value);
+ radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+ if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+ gallivm->module, debug, ctx.type,
+ "Vertex Shader Epilog"))
+ status = false;
+
+ radeon_llvm_dispose(&ctx.radeon_bld);
+ return status;
+}
+
+/**
+ * Create & compile a vertex shader epilog. This a helper used by VS and TES.
+ */
+static bool si_get_vs_epilog(struct si_screen *sscreen,
+ LLVMTargetMachineRef tm,
+ struct si_shader *shader,
+ struct pipe_debug_callback *debug,
+ struct si_vs_epilog_bits *states)
+{
+ union si_shader_part_key epilog_key;
+
+ memset(&epilog_key, 0, sizeof(epilog_key));
+ epilog_key.vs_epilog.states = *states;
+
+ /* Set up the PrimitiveID output. */
+ if (shader->key.vs.epilog.export_prim_id) {
+ unsigned index = shader->selector->info.num_outputs;
+ unsigned offset = shader->info.nr_param_exports++;
+
+ epilog_key.vs_epilog.prim_id_param_offset = offset;
+ assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
+ shader->info.vs_output_param_offset[index] = offset;
+ }
+
+ shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
+ &epilog_key, tm, debug,
+ si_compile_vs_epilog);
+ return shader->epilog != NULL;
+}
+
+/**
+ * Select and compile (or reuse) vertex shader parts (prolog & epilog).
+ */
+static bool si_shader_select_vs_parts(struct si_screen *sscreen,
+ LLVMTargetMachineRef tm,
+ struct si_shader *shader,
+ struct pipe_debug_callback *debug)
+{
+ struct tgsi_shader_info *info = &shader->selector->info;
+ union si_shader_part_key prolog_key;
+ unsigned i;
+
+ /* Get the prolog. */
+ memset(&prolog_key, 0, sizeof(prolog_key));
+ prolog_key.vs_prolog.states = shader->key.vs.prolog;
+ prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
+ prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
+
+ /* The prolog is a no-op if there are no inputs. */
+ if (info->num_inputs) {
+ shader->prolog =
+ si_get_shader_part(sscreen, &sscreen->vs_prologs,
+ &prolog_key, tm, debug,
+ si_compile_vs_prolog);
+ if (!shader->prolog)
+ return false;
+ }
+
+ /* Get the epilog. */
+ if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
+ !si_get_vs_epilog(sscreen, tm, shader, debug,
+ &shader->key.vs.epilog))
+ return false;
+
+ /* Set the instanceID flag. */
+ for (i = 0; i < info->num_inputs; i++)
+ if (prolog_key.vs_prolog.states.instance_divisors[i])
+ shader->info.uses_instanceid = true;
+
+ return true;
+}
+
+/**
+ * Select and compile (or reuse) TES parts (epilog).
+ */
+static bool si_shader_select_tes_parts(struct si_screen *sscreen,
+ LLVMTargetMachineRef tm,
+ struct si_shader *shader,
+ struct pipe_debug_callback *debug)
+{
+ if (shader->key.tes.as_es)
+ return true;
+
+ /* TES compiled as VS. */
+ return si_get_vs_epilog(sscreen, tm, shader, debug,
+ &shader->key.tes.epilog);
+}
+
+/**
+ * Compile the TCS epilog. This writes tesselation factors to memory based on
+ * the output primitive type of the tesselator (determined by TES).
+ */
+static bool si_compile_tcs_epilog(struct si_screen *sscreen,
+ LLVMTargetMachineRef tm,
+ struct pipe_debug_callback *debug,
+ struct si_shader_part *out)
+{
+ union si_shader_part_key *key = &out->key;
+ struct si_shader shader = {};
+ struct si_shader_context ctx;
+ struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+ struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
+ LLVMTypeRef params[16];
+ LLVMValueRef func;
+ int last_array_pointer, last_sgpr, num_params;
+ bool status = true;
+
+ si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+ ctx.type = TGSI_PROCESSOR_TESS_CTRL;
+ shader.key.tcs.epilog = key->tcs_epilog.states;
+
+ /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
+ params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
+ last_array_pointer = SI_PARAM_RW_BUFFERS;
+ params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
+ params[SI_PARAM_SAMPLERS] = ctx.i64;
+ params[SI_PARAM_UNUSED] = ctx.i64;
+ params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
+ params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
+ params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
+ params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
+ last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
+ num_params = last_sgpr + 1;
+
+ params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
+ params[num_params++] = ctx.i32; /* invocation ID within the patch */
+ params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */
+
+ /* Create the function. */
+ si_create_function(&ctx, NULL, 0, params, num_params,
+ last_array_pointer, last_sgpr);
+ declare_tess_lds(&ctx);
+ func = ctx.radeon_bld.main_fn;
+
+ si_write_tess_factors(bld_base,
+ LLVMGetParam(func, last_sgpr + 1),
+ LLVMGetParam(func, last_sgpr + 2),
+ LLVMGetParam(func, last_sgpr + 3));
+
+ /* Compile. */
+ LLVMBuildRet(gallivm->builder, ctx.return_value);
+ radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+ if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+ gallivm->module, debug, ctx.type,
+ "Tessellation Control Shader Epilog"))
+ status = false;
+
+ radeon_llvm_dispose(&ctx.radeon_bld);
+ return status;
+}
+
+/**
+ * Select and compile (or reuse) TCS parts (epilog).
+ */
+static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
+ LLVMTargetMachineRef tm,
+ struct si_shader *shader,
+ struct pipe_debug_callback *debug)
+{
+ union si_shader_part_key epilog_key;
+
+ /* Get the epilog. */
+ memset(&epilog_key, 0, sizeof(epilog_key));
+ epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
+
+ shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
+ &epilog_key, tm, debug,
+ si_compile_tcs_epilog);
+ return shader->epilog != NULL;
+}
+
+/**
+ * Compile the pixel shader prolog. This handles:
+ * - two-side color selection and interpolation
+ * - overriding interpolation parameters for the API PS
+ * - polygon stippling
+ *
+ * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
+ * overriden by other states. (e.g. per-sample interpolation)
+ * Interpolated colors are stored after the preloaded VGPRs.
+ */
+static bool si_compile_ps_prolog(struct si_screen *sscreen,
+ LLVMTargetMachineRef tm,
+ struct pipe_debug_callback *debug,
+ struct si_shader_part *out)
+{
+ union si_shader_part_key *key = &out->key;
+ struct si_shader shader = {};
+ struct si_shader_context ctx;
+ struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+ LLVMTypeRef *params;
+ LLVMValueRef ret, func;
+ int last_sgpr, num_params, num_returns, i, num_color_channels;
+ bool status = true;
+
+ si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+ ctx.type = TGSI_PROCESSOR_FRAGMENT;
+ shader.key.ps.prolog = key->ps_prolog.states;
+
+ /* Number of inputs + 8 color elements. */
+ params = alloca((key->ps_prolog.num_input_sgprs +
+ key->ps_prolog.num_input_vgprs + 8) *
+ sizeof(LLVMTypeRef));
+
+ /* Declare inputs. */
+ num_params = 0;
+ for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
+ params[num_params++] = ctx.i32;
+ last_sgpr = num_params - 1;
+
+ for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
+ params[num_params++] = ctx.f32;
+
+ /* Declare outputs (same as inputs + add colors if needed) */
+ num_returns = num_params;
+ num_color_channels = util_bitcount(key->ps_prolog.colors_read);
+ for (i = 0; i < num_color_channels; i++)
+ params[num_returns++] = ctx.f32;
+
+ /* Create the function. */
+ si_create_function(&ctx, params, num_returns, params,
+ num_params, -1, last_sgpr);
+ func = ctx.radeon_bld.main_fn;
+
+ /* Copy inputs to outputs. This should be no-op, as the registers match,
+ * but it will prevent the compiler from overwriting them unintentionally.
+ */
+ ret = ctx.return_value;
+ for (i = 0; i < num_params; i++) {
+ LLVMValueRef p = LLVMGetParam(func, i);
+ ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+ }
+
+ /* Polygon stippling. */
+ if (key->ps_prolog.states.poly_stipple) {
+ /* POS_FIXED_PT is always last. */
+ unsigned pos = key->ps_prolog.num_input_sgprs +
+ key->ps_prolog.num_input_vgprs - 1;
+ LLVMValueRef ptr[2], views;
+
+ /* Get the pointer to sampler views. */
+ ptr[0] = LLVMGetParam(func, SI_SGPR_SAMPLERS);
+ ptr[1] = LLVMGetParam(func, SI_SGPR_SAMPLERS+1);
+ views = lp_build_gather_values(gallivm, ptr, 2);
+ views = LLVMBuildBitCast(gallivm->builder, views, ctx.i64, "");
+ views = LLVMBuildIntToPtr(gallivm->builder, views,
+ const_array(ctx.v8i32, SI_NUM_SAMPLERS), "");
+
+ si_llvm_emit_polygon_stipple(&ctx, views, pos);
+ }
+
+ /* Interpolate colors. */
+ for (i = 0; i < 2; i++) {
+ unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
+ unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
+ key->ps_prolog.face_vgpr_index;
+ LLVMValueRef interp[2], color[4];
+ LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
+
+ if (!writemask)
+ continue;
+
+ /* If the interpolation qualifier is not CONSTANT (-1). */
+ if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
+ unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
+ key->ps_prolog.color_interp_vgpr_index[i];
+
+ interp[0] = LLVMGetParam(func, interp_vgpr);
+ interp[1] = LLVMGetParam(func, interp_vgpr + 1);
+ interp_ij = lp_build_gather_values(gallivm, interp, 2);
+ interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
+ ctx.v2i32, "");
+ }
+
+ /* Use the absolute location of the input. */
+ prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
+
+ if (key->ps_prolog.states.color_two_side) {
+ face = LLVMGetParam(func, face_vgpr);
+ face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
+ }
+
+ interp_fs_input(&ctx,
+ key->ps_prolog.color_attr_index[i],
+ TGSI_SEMANTIC_COLOR, i,
+ key->ps_prolog.num_interp_inputs,
+ key->ps_prolog.colors_read, interp_ij,
+ prim_mask, face, color);
+
+ while (writemask) {
+ unsigned chan = u_bit_scan(&writemask);
+ ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
+ num_params++, "");
+ }
+ }
+
+ /* Force per-sample interpolation. */
+ if (key->ps_prolog.states.force_persample_interp) {
+ unsigned i, base = key->ps_prolog.num_input_sgprs;
+ LLVMValueRef persp_sample[2], linear_sample[2];
+
+ /* Read PERSP_SAMPLE. */
+ for (i = 0; i < 2; i++)
+ persp_sample[i] = LLVMGetParam(func, base + i);
+ /* Overwrite PERSP_CENTER. */
+ for (i = 0; i < 2; i++)
+ ret = LLVMBuildInsertValue(gallivm->builder, ret,
+ persp_sample[i], base + 2 + i, "");
+ /* Overwrite PERSP_CENTROID. */
+ for (i = 0; i < 2; i++)
+ ret = LLVMBuildInsertValue(gallivm->builder, ret,
+ persp_sample[i], base + 4 + i, "");
+ /* Read LINEAR_SAMPLE. */
+ for (i = 0; i < 2; i++)
+ linear_sample[i] = LLVMGetParam(func, base + 6 + i);
+ /* Overwrite LINEAR_CENTER. */
+ for (i = 0; i < 2; i++)
+ ret = LLVMBuildInsertValue(gallivm->builder, ret,
+ linear_sample[i], base + 8 + i, "");
+ /* Overwrite LINEAR_CENTROID. */
+ for (i = 0; i < 2; i++)
+ ret = LLVMBuildInsertValue(gallivm->builder, ret,
+ linear_sample[i], base + 10 + i, "");
+ }
+
+ /* Compile. */
+ LLVMBuildRet(gallivm->builder, ret);
+ radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+ if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+ gallivm->module, debug, ctx.type,
+ "Fragment Shader Prolog"))
+ status = false;
+
+ radeon_llvm_dispose(&ctx.radeon_bld);
+ return status;
+}
+
+/**
+ * Compile the pixel shader epilog. This handles everything that must be
+ * emulated for pixel shader exports. (alpha-test, format conversions, etc)
+ */
+static bool si_compile_ps_epilog(struct si_screen *sscreen,
+ LLVMTargetMachineRef tm,
+ struct pipe_debug_callback *debug,
+ struct si_shader_part *out)
+{
+ union si_shader_part_key *key = &out->key;
+ struct si_shader shader = {};
+ struct si_shader_context ctx;
+ struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+ struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
+ LLVMTypeRef params[16+8*4+3];
+ LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+ int last_array_pointer, last_sgpr, num_params, i;
+ bool status = true;
+
+ si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+ ctx.type = TGSI_PROCESSOR_FRAGMENT;
+ shader.key.ps.epilog = key->ps_epilog.states;
+
+ /* Declare input SGPRs. */
+ params[SI_PARAM_RW_BUFFERS] = ctx.i64;
+ params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
+ params[SI_PARAM_SAMPLERS] = ctx.i64;
+ params[SI_PARAM_UNUSED] = ctx.i64;
+ params[SI_PARAM_ALPHA_REF] = ctx.f32;
+ last_array_pointer = -1;
+ last_sgpr = SI_PARAM_ALPHA_REF;
+
+ /* Declare input VGPRs. */
+ num_params = (last_sgpr + 1) +
+ util_bitcount(key->ps_epilog.colors_written) * 4 +
+ key->ps_epilog.writes_z +
+ key->ps_epilog.writes_stencil +
+ key->ps_epilog.writes_samplemask;
+
+ num_params = MAX2(num_params,
+ last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+ assert(num_params <= ARRAY_SIZE(params));
+
+ for (i = last_sgpr + 1; i < num_params; i++)
+ params[i] = ctx.f32;
+
+ /* Create the function. */
+ si_create_function(&ctx, NULL, 0, params, num_params,
+ last_array_pointer, last_sgpr);
+ /* Disable elimination of unused inputs. */
+ radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
+ "InitialPSInputAddr", 0xffffff);
+
+ /* Process colors. */
+ unsigned vgpr = last_sgpr + 1;
+ unsigned colors_written = key->ps_epilog.colors_written;
+ int last_color_export = -1;
+
+ /* Find the last color export. */
+ if (!key->ps_epilog.writes_z &&
+ !key->ps_epilog.writes_stencil &&
+ !key->ps_epilog.writes_samplemask) {
+ unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
+
+ /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+ if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
+ /* Just set this if any of the colorbuffers are enabled. */
+ if (spi_format &
+ ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
+ last_color_export = 0;
+ } else {
+ for (i = 0; i < 8; i++)
+ if (colors_written & (1 << i) &&
+ (spi_format >> (i * 4)) & 0xf)
+ last_color_export = i;
+ }
+ }
+
+ while (colors_written) {
+ LLVMValueRef color[4];
+ int mrt = u_bit_scan(&colors_written);
+
+ for (i = 0; i < 4; i++)
+ color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+
+ si_export_mrt_color(bld_base, color, mrt,
+ num_params - 1,
+ mrt == last_color_export);
+ }
+
+ /* Process depth, stencil, samplemask. */
+ if (key->ps_epilog.writes_z)
+ depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+ if (key->ps_epilog.writes_stencil)
+ stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+ if (key->ps_epilog.writes_samplemask)
+ samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+
+ if (depth || stencil || samplemask)
+ si_export_mrt_z(bld_base, depth, stencil, samplemask);
+ else if (last_color_export == -1)
+ si_export_null(bld_base);
+
+ /* Compile. */
+ LLVMBuildRetVoid(gallivm->builder);
+ radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+ if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+ gallivm->module, debug, ctx.type,
+ "Fragment Shader Epilog"))
+ status = false;
+
+ radeon_llvm_dispose(&ctx.radeon_bld);
+ return status;
+}
+
+/**
+ * Select and compile (or reuse) pixel shader parts (prolog & epilog).
+ */
+static bool si_shader_select_ps_parts(struct si_screen *sscreen,
+ LLVMTargetMachineRef tm,
+ struct si_shader *shader,
+ struct pipe_debug_callback *debug)
+{
+ struct tgsi_shader_info *info = &shader->selector->info;
+ union si_shader_part_key prolog_key;
+ union si_shader_part_key epilog_key;
+ unsigned i;
+
+ /* Get the prolog. */
+ memset(&prolog_key, 0, sizeof(prolog_key));
+ prolog_key.ps_prolog.states = shader->key.ps.prolog;
+ prolog_key.ps_prolog.colors_read = info->colors_read;
+ prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
+ prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
+
+ if (info->colors_read) {
+ unsigned *color = shader->selector->color_attr_index;
+
+ if (shader->key.ps.prolog.color_two_side) {
+ /* BCOLORs are stored after the last input. */
+ prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
+ prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
+ shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
+ }
+
+ for (i = 0; i < 2; i++) {
+ unsigned location = info->input_interpolate_loc[color[i]];
+
+ if (!(info->colors_read & (0xf << i*4)))
+ continue;
+
+ prolog_key.ps_prolog.color_attr_index[i] = color[i];
+
+ /* Force per-sample interpolation for the colors here. */
+ if (shader->key.ps.prolog.force_persample_interp)
+ location = TGSI_INTERPOLATE_LOC_SAMPLE;
+
+ switch (info->input_interpolate[color[i]]) {
+ case TGSI_INTERPOLATE_CONSTANT:
+ prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
+ break;
+ case TGSI_INTERPOLATE_PERSPECTIVE:
+ case TGSI_INTERPOLATE_COLOR:
+ switch (location) {
+ case TGSI_INTERPOLATE_LOC_SAMPLE:
+ prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
+ shader->config.spi_ps_input_ena |=
+ S_0286CC_PERSP_SAMPLE_ENA(1);
+ break;
+ case TGSI_INTERPOLATE_LOC_CENTER:
+ prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
+ shader->config.spi_ps_input_ena |=
+ S_0286CC_PERSP_CENTER_ENA(1);
+ break;
+ case TGSI_INTERPOLATE_LOC_CENTROID:
+ prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
+ shader->config.spi_ps_input_ena |=
+ S_0286CC_PERSP_CENTROID_ENA(1);
+ break;
+ default:
+ assert(0);
+ }
+ break;
+ case TGSI_INTERPOLATE_LINEAR:
+ switch (location) {
+ case TGSI_INTERPOLATE_LOC_SAMPLE:
+ prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
+ shader->config.spi_ps_input_ena |=
+ S_0286CC_LINEAR_SAMPLE_ENA(1);
+ break;
+ case TGSI_INTERPOLATE_LOC_CENTER:
+ prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
+ shader->config.spi_ps_input_ena |=
+ S_0286CC_LINEAR_CENTER_ENA(1);
+ break;
+ case TGSI_INTERPOLATE_LOC_CENTROID:
+ prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
+ shader->config.spi_ps_input_ena |=
+ S_0286CC_LINEAR_CENTROID_ENA(1);
+ break;
+ default:
+ assert(0);
+ }
+ break;
+ default:
+ assert(0);
+ }
+ }
+ }
+
+ /* The prolog is a no-op if these aren't set. */
+ if (prolog_key.ps_prolog.colors_read ||
+ prolog_key.ps_prolog.states.force_persample_interp ||
+ prolog_key.ps_prolog.states.poly_stipple) {
+ shader->prolog =
+ si_get_shader_part(sscreen, &sscreen->ps_prologs,
+ &prolog_key, tm, debug,
+ si_compile_ps_prolog);
+ if (!shader->prolog)
+ return false;
+ }
+
+ /* Get the epilog. */
+ memset(&epilog_key, 0, sizeof(epilog_key));
+ epilog_key.ps_epilog.colors_written = info->colors_written;
+ epilog_key.ps_epilog.writes_z = info->writes_z;
+ epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
+ epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
+ epilog_key.ps_epilog.states = shader->key.ps.epilog;
+
+ shader->epilog =
+ si_get_shader_part(sscreen, &sscreen->ps_epilogs,
+ &epilog_key, tm, debug,
+ si_compile_ps_epilog);
+ if (!shader->epilog)
+ return false;
+
+ /* Enable POS_FIXED_PT if polygon stippling is enabled. */
+ if (shader->key.ps.prolog.poly_stipple) {
+ shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
+ assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
+ }
+
+ /* Set up the enable bits for per-sample shading if needed. */
+ if (shader->key.ps.prolog.force_persample_interp) {
+ if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
+ G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
+ shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
+ shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
+ shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
+ }
+ if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
+ G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
+ shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
+ shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
+ shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
+ }
+ }
+
+ /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
+ if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
+ !(shader->config.spi_ps_input_ena & 0xf)) {
+ shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
+ assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
+ }
+
+ /* At least one pair of interpolation weights must be enabled. */
+ if (!(shader->config.spi_ps_input_ena & 0x7f)) {
+ shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
+ assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
+ }
+
+ /* The sample mask input is always enabled, because the API shader always
+ * passes it through to the epilog. Disable it here if it's unused.
+ */
+ if (!shader->key.ps.epilog.poly_line_smoothing &&
+ !shader->selector->info.reads_samplemask)
+ shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
+
+ return true;
+}
+
+int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
+ struct si_shader *shader,
+ struct pipe_debug_callback *debug)
+{
+ struct si_shader *mainp = shader->selector->main_shader_part;
+ int r;
+
+ /* LS and ES are always compiled on demand. */
+ if (!mainp ||
+ (shader->selector->type == PIPE_SHADER_VERTEX &&
+ (shader->key.vs.as_es || shader->key.vs.as_ls)) ||
+ (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
+ shader->key.tes.as_es)) {
+ /* Monolithic shader (compiled as a whole, has many variants,
+ * may take a long time to compile).
+ */
+ r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
+ if (r)
+ return r;
+ } else {
+ /* The shader consists of 2-3 parts:
+ *
+ * - the middle part is the user shader, it has 1 variant only
+ * and it was compiled during the creation of the shader
+ * selector
+ * - the prolog part is inserted at the beginning
+ * - the epilog part is inserted at the end
+ *
+ * The prolog and epilog have many (but simple) variants.
+ */
+
+ /* Copy the compiled TGSI shader data over. */
+ shader->is_binary_shared = true;
+ shader->binary = mainp->binary;
+ shader->config = mainp->config;
+ shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
+ shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
+ shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
+ memcpy(shader->info.vs_output_param_offset,
+ mainp->info.vs_output_param_offset,
+ sizeof(mainp->info.vs_output_param_offset));
+ shader->info.uses_instanceid = mainp->info.uses_instanceid;
+ shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
+ shader->info.nr_param_exports = mainp->info.nr_param_exports;
+
+ /* Select prologs and/or epilogs. */
+ switch (shader->selector->type) {
+ case PIPE_SHADER_VERTEX:
+ if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
+ return -1;
+ break;
+ case PIPE_SHADER_TESS_CTRL:
+ if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
+ return -1;
+ break;
+ case PIPE_SHADER_TESS_EVAL:
+ if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
+ return -1;
+ break;
+ case PIPE_SHADER_FRAGMENT:
+ if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
+ return -1;
+
+ /* Make sure we have at least as many VGPRs as there
+ * are allocated inputs.
+ */
+ shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+ shader->info.num_input_vgprs);
+ break;
+ }
+
+ /* Update SGPR and VGPR counts. */
+ if (shader->prolog) {
+ shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
+ shader->prolog->config.num_sgprs);
+ shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+ shader->prolog->config.num_vgprs);
+ }
+ if (shader->epilog) {
+ shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
+ shader->epilog->config.num_sgprs);
+ shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+ shader->epilog->config.num_vgprs);
+ }
+ }
+
+ si_shader_dump(sscreen, shader, debug, shader->selector->info.processor);
+
+ /* Upload. */
+ r = si_shader_binary_upload(sscreen, shader);
+ if (r) {
+ fprintf(stderr, "LLVM failed to upload shader\n");
+ return r;
+ }
+
+ return 0;
+}
+
void si_shader_destroy(struct si_shader *shader)
{
if (shader->gs_copy_shader) {
@@ -4534,5 +5995,6 @@ void si_shader_destroy(struct si_shader *shader)
r600_resource_reference(&shader->bo, NULL);
- radeon_shader_binary_clean(&shader->binary);
+ if (!shader->is_binary_shared)
+ radeon_shader_binary_clean(&shader->binary);
}
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index dc75e0330e4..ff5c24d8918 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -75,6 +75,8 @@
struct radeon_shader_binary;
struct radeon_shader_reloc;
+#define SI_MAX_VS_OUTPUTS 40
+
#define SI_SGPR_RW_BUFFERS 0 /* rings (& stream-out, VS only) */
#define SI_SGPR_CONST_BUFFERS 2
#define SI_SGPR_SAMPLERS 4 /* images & sampler states interleaved */
@@ -169,7 +171,7 @@ struct radeon_shader_reloc;
#define SI_PARAM_SAMPLE_COVERAGE 20
#define SI_PARAM_POS_FIXED_PT 21
-#define SI_NUM_PARAMS (SI_PARAM_POS_FIXED_PT + 1)
+#define SI_NUM_PARAMS (SI_PARAM_POS_FIXED_PT + 9) /* +8 for COLOR[0..1] */
struct si_shader;
@@ -181,6 +183,11 @@ struct si_shader_selector {
struct si_shader *first_variant; /* immutable after the first variant */
struct si_shader *last_variant; /* mutable */
+ /* The compiled TGSI shader expecting a prolog and/or epilog (not
+ * uploaded to a buffer).
+ */
+ struct si_shader *main_shader_part;
+
struct tgsi_token *tokens;
struct pipe_stream_output_info so;
struct tgsi_shader_info info;
@@ -199,6 +206,7 @@ struct si_shader_selector {
unsigned max_gsvs_emit_size;
/* PS parameters. */
+ unsigned color_attr_index[2];
unsigned db_shader_control;
/* Set 0xf or 0x0 (4 bits) per each written output.
* ANDed with spi_shader_col_format.
@@ -221,37 +229,103 @@ struct si_shader_selector {
* With both: LS | HS | ES | GS | VS | PS
*/
+/* Common VS bits between the shader key and the prolog key. */
+struct si_vs_prolog_bits {
+ unsigned instance_divisors[SI_NUM_VERTEX_BUFFERS];
+};
+
+/* Common VS bits between the shader key and the epilog key. */
+struct si_vs_epilog_bits {
+ unsigned export_prim_id:1; /* when PS needs it and GS is disabled */
+ /* TODO:
+ * - skip clipdist, culldist (including clipvertex code) exports based
+ * on which clip_plane_enable bits are set
+ * - skip layer, viewport, clipdist, and culldist parameter exports
+ * if PS doesn't read them
+ */
+};
+
+/* Common TCS bits between the shader key and the epilog key. */
+struct si_tcs_epilog_bits {
+ unsigned prim_mode:3;
+};
+
+/* Common PS bits between the shader key and the prolog key. */
+struct si_ps_prolog_bits {
+ unsigned color_two_side:1;
+ /* TODO: add a flatshade bit that skips interpolation for colors */
+ unsigned poly_stipple:1;
+ unsigned force_persample_interp:1;
+ /* TODO:
+ * - add force_center_interp if MSAA is disabled and centroid or
+ * sample are present
+ * - add force_center_interp_bc_optimize to force center interpolation
+ * based on the bc_optimize SGPR bit if MSAA is enabled, centroid is
+ * present and sample isn't present.
+ */
+};
+
+/* Common PS bits between the shader key and the epilog key. */
+struct si_ps_epilog_bits {
+ unsigned spi_shader_col_format;
+ unsigned color_is_int8:8;
+ unsigned last_cbuf:3;
+ unsigned alpha_func:3;
+ unsigned alpha_to_one:1;
+ unsigned poly_line_smoothing:1;
+ unsigned clamp_color:1;
+};
+
+union si_shader_part_key {
+ struct {
+ struct si_vs_prolog_bits states;
+ unsigned num_input_sgprs:5;
+ unsigned last_input:4;
+ } vs_prolog;
+ struct {
+ struct si_vs_epilog_bits states;
+ unsigned prim_id_param_offset:5;
+ } vs_epilog;
+ struct {
+ struct si_tcs_epilog_bits states;
+ } tcs_epilog;
+ struct {
+ struct si_ps_prolog_bits states;
+ unsigned num_input_sgprs:5;
+ unsigned num_input_vgprs:5;
+ /* Color interpolation and two-side color selection. */
+ unsigned colors_read:8; /* color input components read */
+ unsigned num_interp_inputs:5; /* BCOLOR is at this location */
+ unsigned face_vgpr_index:5;
+ char color_attr_index[2];
+ char color_interp_vgpr_index[2]; /* -1 == constant */
+ } ps_prolog;
+ struct {
+ struct si_ps_epilog_bits states;
+ unsigned colors_written:8;
+ unsigned writes_z:1;
+ unsigned writes_stencil:1;
+ unsigned writes_samplemask:1;
+ } ps_epilog;
+};
+
union si_shader_key {
struct {
- unsigned spi_shader_col_format;
- unsigned color_is_int8:8;
- unsigned last_cbuf:3;
- unsigned color_two_side:1;
- unsigned alpha_func:3;
- unsigned alpha_to_one:1;
- unsigned poly_stipple:1;
- unsigned poly_line_smoothing:1;
- unsigned clamp_color:1;
- unsigned force_persample_interp:1;
+ struct si_ps_prolog_bits prolog;
+ struct si_ps_epilog_bits epilog;
} ps;
struct {
- unsigned instance_divisors[SI_NUM_VERTEX_BUFFERS];
- /* Mask of "get_unique_index" bits - which outputs are read
- * by the next stage (needed by ES).
- * This describes how outputs are laid out in memory. */
+ struct si_vs_prolog_bits prolog;
+ struct si_vs_epilog_bits epilog;
unsigned as_es:1; /* export shader */
unsigned as_ls:1; /* local shader */
- unsigned export_prim_id:1; /* when PS needs it and GS is disabled */
} vs;
struct {
- unsigned prim_mode:3;
+ struct si_tcs_epilog_bits epilog;
} tcs; /* tessellation control shader */
struct {
- /* Mask of "get_unique_index" bits - which outputs are read
- * by the next stage (needed by ES).
- * This describes how outputs are laid out in memory. */
+ struct si_vs_epilog_bits epilog; /* same as VS */
unsigned as_es:1; /* export shader */
- unsigned export_prim_id:1; /* when PS needs it and GS is disabled */
} tes; /* tessellation evaluation shader */
};
@@ -267,22 +341,42 @@ struct si_shader_config {
unsigned rsrc2;
};
+/* GCN-specific shader info. */
+struct si_shader_info {
+ ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS];
+ ubyte num_input_sgprs;
+ ubyte num_input_vgprs;
+ char face_vgpr_index;
+ bool uses_instanceid;
+ ubyte nr_pos_exports;
+ ubyte nr_param_exports;
+};
+
struct si_shader {
struct si_shader_selector *selector;
struct si_shader *next_variant;
+ struct si_shader_part *prolog;
+ struct si_shader_part *epilog;
+
struct si_shader *gs_copy_shader;
struct si_pm4_state *pm4;
struct r600_resource *bo;
struct r600_resource *scratch_bo;
union si_shader_key key;
+ bool is_binary_shared;
+
+ /* The following data is all that's needed for binary shaders. */
struct radeon_shader_binary binary;
struct si_shader_config config;
+ struct si_shader_info info;
+};
- unsigned vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS];
- bool uses_instanceid;
- unsigned nr_pos_exports;
- unsigned nr_param_exports;
+struct si_shader_part {
+ struct si_shader_part *next;
+ union si_shader_part_key key;
+ struct radeon_shader_binary binary;
+ struct si_shader_config config;
};
static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
@@ -310,14 +404,19 @@ static inline struct si_shader* si_get_vs_state(struct si_context *sctx)
static inline bool si_vs_exports_prim_id(struct si_shader *shader)
{
if (shader->selector->type == PIPE_SHADER_VERTEX)
- return shader->key.vs.export_prim_id;
+ return shader->key.vs.epilog.export_prim_id;
else if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
- return shader->key.tes.export_prim_id;
+ return shader->key.tes.epilog.export_prim_id;
else
return false;
}
-/* radeonsi_shader.c */
+/* si_shader.c */
+int si_compile_tgsi_shader(struct si_screen *sscreen,
+ LLVMTargetMachineRef tm,
+ struct si_shader *shader,
+ bool is_monolithic,
+ struct pipe_debug_callback *debug);
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
struct si_shader *shader,
struct pipe_debug_callback *debug);
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index bf780777b50..2dfdbeb8d8f 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -277,7 +277,7 @@ static void si_emit_cb_render_state(struct si_context *sctx, struct r600_atom *a
if (sctx->b.family == CHIP_STONEY) {
unsigned spi_shader_col_format =
sctx->ps_shader.cso ?
- sctx->ps_shader.current->key.ps.spi_shader_col_format : 0;
+ sctx->ps_shader.current->key.ps.epilog.spi_shader_col_format : 0;
unsigned sx_ps_downconvert = 0;
unsigned sx_blend_opt_epsilon = 0;
unsigned sx_blend_opt_control = 0;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index f64c4d45f1b..40792cbc1d5 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -280,6 +280,8 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
/* si_state_shader.c */
bool si_update_shaders(struct si_context *sctx);
void si_init_shader_functions(struct si_context *sctx);
+bool si_init_shader_cache(struct si_screen *sscreen);
+void si_destroy_shader_cache(struct si_screen *sscreen);
/* si_state_draw.c */
void si_emit_cache_flush(struct si_context *sctx, struct r600_atom *atom);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 77a4e47c809..a6753a7a528 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -32,10 +32,221 @@
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_ureg.h"
+#include "util/hash_table.h"
+#include "util/u_hash.h"
#include "util/u_memory.h"
#include "util/u_prim.h"
#include "util/u_simple_shaders.h"
+/* SHADER_CACHE */
+
+/**
+ * Return the TGSI binary in a buffer. The first 4 bytes contain its size as
+ * integer.
+ */
+static void *si_get_tgsi_binary(struct si_shader_selector *sel)
+{
+ unsigned tgsi_size = tgsi_num_tokens(sel->tokens) *
+ sizeof(struct tgsi_token);
+ unsigned size = 4 + tgsi_size + sizeof(sel->so);
+ char *result = (char*)MALLOC(size);
+
+ if (!result)
+ return NULL;
+
+ *((uint32_t*)result) = size;
+ memcpy(result + 4, sel->tokens, tgsi_size);
+ memcpy(result + 4 + tgsi_size, &sel->so, sizeof(sel->so));
+ return result;
+}
+
+/** Copy "data" to "ptr" and return the next dword following copied data. */
+static uint32_t *write_data(uint32_t *ptr, const void *data, unsigned size)
+{
+ memcpy(ptr, data, size);
+ ptr += DIV_ROUND_UP(size, 4);
+ return ptr;
+}
+
+/** Read data from "ptr". Return the next dword following the data. */
+static uint32_t *read_data(uint32_t *ptr, void *data, unsigned size)
+{
+ memcpy(data, ptr, size);
+ ptr += DIV_ROUND_UP(size, 4);
+ return ptr;
+}
+
+/**
+ * Write the size as uint followed by the data. Return the next dword
+ * following the copied data.
+ */
+static uint32_t *write_chunk(uint32_t *ptr, const void *data, unsigned size)
+{
+ *ptr++ = size;
+ return write_data(ptr, data, size);
+}
+
+/**
+ * Read the size as uint followed by the data. Return both via parameters.
+ * Return the next dword following the data.
+ */
+static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size)
+{
+ *size = *ptr++;
+ assert(*data == NULL);
+ *data = malloc(*size);
+ return read_data(ptr, *data, *size);
+}
+
+/**
+ * Return the shader binary in a buffer. The first 4 bytes contain its size
+ * as integer.
+ */
+static void *si_get_shader_binary(struct si_shader *shader)
+{
+ /* There is always a size of data followed by the data itself. */
+ unsigned relocs_size = shader->binary.reloc_count *
+ sizeof(shader->binary.relocs[0]);
+ unsigned disasm_size = strlen(shader->binary.disasm_string) + 1;
+ unsigned size =
+ 4 + /* total size */
+ 4 + /* CRC32 of the data below */
+ align(sizeof(shader->config), 4) +
+ align(sizeof(shader->info), 4) +
+ 4 + align(shader->binary.code_size, 4) +
+ 4 + align(shader->binary.rodata_size, 4) +
+ 4 + align(relocs_size, 4) +
+ 4 + align(disasm_size, 4);
+ void *buffer = CALLOC(1, size);
+ uint32_t *ptr = (uint32_t*)buffer;
+
+ if (!buffer)
+ return NULL;
+
+ *ptr++ = size;
+ ptr++; /* CRC32 is calculated at the end. */
+
+ ptr = write_data(ptr, &shader->config, sizeof(shader->config));
+ ptr = write_data(ptr, &shader->info, sizeof(shader->info));
+ ptr = write_chunk(ptr, shader->binary.code, shader->binary.code_size);
+ ptr = write_chunk(ptr, shader->binary.rodata, shader->binary.rodata_size);
+ ptr = write_chunk(ptr, shader->binary.relocs, relocs_size);
+ ptr = write_chunk(ptr, shader->binary.disasm_string, disasm_size);
+ assert((char *)ptr - (char *)buffer == size);
+
+ /* Compute CRC32. */
+ ptr = (uint32_t*)buffer;
+ ptr++;
+ *ptr = util_hash_crc32(ptr + 1, size - 8);
+
+ return buffer;
+}
+
+static bool si_load_shader_binary(struct si_shader *shader, void *binary)
+{
+ uint32_t *ptr = (uint32_t*)binary;
+ uint32_t size = *ptr++;
+ uint32_t crc32 = *ptr++;
+ unsigned chunk_size;
+
+ if (util_hash_crc32(ptr, size - 8) != crc32) {
+ fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n");
+ return false;
+ }
+
+ ptr = read_data(ptr, &shader->config, sizeof(shader->config));
+ ptr = read_data(ptr, &shader->info, sizeof(shader->info));
+ ptr = read_chunk(ptr, (void**)&shader->binary.code,
+ &shader->binary.code_size);
+ ptr = read_chunk(ptr, (void**)&shader->binary.rodata,
+ &shader->binary.rodata_size);
+ ptr = read_chunk(ptr, (void**)&shader->binary.relocs, &chunk_size);
+ shader->binary.reloc_count = chunk_size / sizeof(shader->binary.relocs[0]);
+ ptr = read_chunk(ptr, (void**)&shader->binary.disasm_string, &chunk_size);
+
+ return true;
+}
+
+/**
+ * Insert a shader into the cache. It's assumed the shader is not in the cache.
+ * Use si_shader_cache_load_shader before calling this.
+ *
+ * Returns false on failure, in which case the tgsi_binary should be freed.
+ */
+static bool si_shader_cache_insert_shader(struct si_screen *sscreen,
+ void *tgsi_binary,
+ struct si_shader *shader)
+{
+ void *hw_binary = si_get_shader_binary(shader);
+
+ if (!hw_binary)
+ return false;
+
+ if (_mesa_hash_table_insert(sscreen->shader_cache, tgsi_binary,
+ hw_binary) == NULL) {
+ FREE(hw_binary);
+ return false;
+ }
+
+ return true;
+}
+
+static bool si_shader_cache_load_shader(struct si_screen *sscreen,
+ void *tgsi_binary,
+ struct si_shader *shader)
+{
+ struct hash_entry *entry =
+ _mesa_hash_table_search(sscreen->shader_cache, tgsi_binary);
+ if (!entry)
+ return false;
+
+ return si_load_shader_binary(shader, entry->data);
+}
+
+static uint32_t si_shader_cache_key_hash(const void *key)
+{
+ /* The first dword is the key size. */
+ return util_hash_crc32(key, *(uint32_t*)key);
+}
+
+static bool si_shader_cache_key_equals(const void *a, const void *b)
+{
+ uint32_t *keya = (uint32_t*)a;
+ uint32_t *keyb = (uint32_t*)b;
+
+ /* The first dword is the key size. */
+ if (*keya != *keyb)
+ return false;
+
+ return memcmp(keya, keyb, *keya) == 0;
+}
+
+static void si_destroy_shader_cache_entry(struct hash_entry *entry)
+{
+ FREE((void*)entry->key);
+ FREE(entry->data);
+}
+
+bool si_init_shader_cache(struct si_screen *sscreen)
+{
+ pipe_mutex_init(sscreen->shader_cache_mutex);
+ sscreen->shader_cache =
+ _mesa_hash_table_create(NULL,
+ si_shader_cache_key_hash,
+ si_shader_cache_key_equals);
+ return sscreen->shader_cache != NULL;
+}
+
+void si_destroy_shader_cache(struct si_screen *sscreen)
+{
+ if (sscreen->shader_cache)
+ _mesa_hash_table_destroy(sscreen->shader_cache,
+ si_destroy_shader_cache_entry);
+ pipe_mutex_destroy(sscreen->shader_cache_mutex);
+}
+
+/* SHADER STATES */
+
static void si_set_tesseval_regs(struct si_shader *shader,
struct si_pm4_state *pm4)
{
@@ -108,7 +319,7 @@ static void si_shader_ls(struct si_shader *shader)
/* We need at least 2 components for LS.
* VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */
- vgpr_comp_cnt = shader->uses_instanceid ? 3 : 1;
+ vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 1;
num_user_sgprs = SI_LS_NUM_USER_SGPR;
num_sgprs = shader->config.num_sgprs;
@@ -181,7 +392,7 @@ static void si_shader_es(struct si_shader *shader)
si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
if (shader->selector->type == PIPE_SHADER_VERTEX) {
- vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0;
+ vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 0;
num_user_sgprs = SI_ES_NUM_USER_SGPR;
} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
vgpr_comp_cnt = 3; /* all components are needed for TES */
@@ -347,7 +558,7 @@ static void si_shader_vs(struct si_shader *shader, struct si_shader *gs)
vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */
num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR;
} else if (shader->selector->type == PIPE_SHADER_VERTEX) {
- vgpr_comp_cnt = shader->uses_instanceid ? 3 : (enable_prim_id ? 2 : 0);
+ vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : (enable_prim_id ? 2 : 0);
num_user_sgprs = SI_VS_NUM_USER_SGPR;
} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
vgpr_comp_cnt = 3; /* all components are needed for TES */
@@ -363,19 +574,19 @@ static void si_shader_vs(struct si_shader *shader, struct si_shader *gs)
assert(num_sgprs <= 104);
/* VS is required to export at least one param. */
- nparams = MAX2(shader->nr_param_exports, 1);
+ nparams = MAX2(shader->info.nr_param_exports, 1);
si_pm4_set_reg(pm4, R_0286C4_SPI_VS_OUT_CONFIG,
S_0286C4_VS_EXPORT_COUNT(nparams - 1));
si_pm4_set_reg(pm4, R_02870C_SPI_SHADER_POS_FORMAT,
S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
- S_02870C_POS1_EXPORT_FORMAT(shader->nr_pos_exports > 1 ?
+ S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ?
V_02870C_SPI_SHADER_4COMP :
V_02870C_SPI_SHADER_NONE) |
- S_02870C_POS2_EXPORT_FORMAT(shader->nr_pos_exports > 2 ?
+ S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ?
V_02870C_SPI_SHADER_4COMP :
V_02870C_SPI_SHADER_NONE) |
- S_02870C_POS3_EXPORT_FORMAT(shader->nr_pos_exports > 3 ?
+ S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ?
V_02870C_SPI_SHADER_4COMP :
V_02870C_SPI_SHADER_NONE));
@@ -415,7 +626,7 @@ static unsigned si_get_ps_num_interp(struct si_shader *ps)
unsigned num_colors = !!(info->colors_read & 0x0f) +
!!(info->colors_read & 0xf0);
unsigned num_interp = ps->selector->info.num_inputs +
- (ps->key.ps.color_two_side ? num_colors : 0);
+ (ps->key.ps.prolog.color_two_side ? num_colors : 0);
assert(num_interp <= 32);
return MIN2(num_interp, 32);
@@ -423,7 +634,7 @@ static unsigned si_get_ps_num_interp(struct si_shader *ps)
static unsigned si_get_spi_shader_col_format(struct si_shader *shader)
{
- unsigned value = shader->key.ps.spi_shader_col_format;
+ unsigned value = shader->key.ps.epilog.spi_shader_col_format;
unsigned i, num_targets = (util_last_bit(value) + 3) / 4;
/* If the i-th target format is set, all previous target formats must
@@ -528,7 +739,7 @@ static void si_shader_ps(struct si_shader *shader)
if (!spi_shader_col_format &&
!info->writes_z && !info->writes_stencil && !info->writes_samplemask &&
(shader->selector->info.uses_kill ||
- shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS))
+ shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS))
spi_shader_col_format = V_028714_SPI_SHADER_32_R;
si_pm4_set_reg(pm4, R_0286CC_SPI_PS_INPUT_ENA, input_ena);
@@ -638,11 +849,13 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
switch (sel->type) {
case PIPE_SHADER_VERTEX:
- if (sctx->vertex_elements)
- for (i = 0; i < sctx->vertex_elements->count; ++i)
- key->vs.instance_divisors[i] =
+ if (sctx->vertex_elements) {
+ unsigned count = MIN2(sel->info.num_inputs,
+ sctx->vertex_elements->count);
+ for (i = 0; i < count; ++i)
+ key->vs.prolog.instance_divisors[i] =
sctx->vertex_elements->elements[i].instance_divisor;
-
+ }
if (sctx->tes_shader.cso)
key->vs.as_ls = 1;
else if (sctx->gs_shader.cso)
@@ -650,17 +863,17 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
if (!sctx->gs_shader.cso && sctx->ps_shader.cso &&
sctx->ps_shader.cso->info.uses_primid)
- key->vs.export_prim_id = 1;
+ key->vs.epilog.export_prim_id = 1;
break;
case PIPE_SHADER_TESS_CTRL:
- key->tcs.prim_mode =
+ key->tcs.epilog.prim_mode =
sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
break;
case PIPE_SHADER_TESS_EVAL:
if (sctx->gs_shader.cso)
key->tes.as_es = 1;
else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
- key->tes.export_prim_id = 1;
+ key->tes.epilog.export_prim_id = 1;
break;
case PIPE_SHADER_GEOMETRY:
break;
@@ -670,13 +883,13 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
sel->info.colors_written == 0x1)
- key->ps.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
+ key->ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
if (blend) {
/* Select the shader color format based on whether
* blending or alpha are needed.
*/
- key->ps.spi_shader_col_format =
+ key->ps.epilog.spi_shader_col_format =
(blend->blend_enable_4bit & blend->need_src_alpha_4bit &
sctx->framebuffer.spi_shader_col_format_blend_alpha) |
(blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
@@ -686,26 +899,26 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
(~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
sctx->framebuffer.spi_shader_col_format);
} else
- key->ps.spi_shader_col_format = sctx->framebuffer.spi_shader_col_format;
+ key->ps.epilog.spi_shader_col_format = sctx->framebuffer.spi_shader_col_format;
/* If alpha-to-coverage is enabled, we have to export alpha
* even if there is no color buffer.
*/
- if (!(key->ps.spi_shader_col_format & 0xf) &&
+ if (!(key->ps.epilog.spi_shader_col_format & 0xf) &&
blend && blend->alpha_to_coverage)
- key->ps.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR;
+ key->ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR;
/* On SI and CIK except Hawaii, the CB doesn't clamp outputs
* to the range supported by the type if a channel has less
* than 16 bits and the export format is 16_ABGR.
*/
if (sctx->b.chip_class <= CIK && sctx->b.family != CHIP_HAWAII)
- key->ps.color_is_int8 = sctx->framebuffer.color_is_int8;
+ key->ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8;
/* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */
- if (!key->ps.last_cbuf) {
- key->ps.spi_shader_col_format &= sel->colors_written_4bit;
- key->ps.color_is_int8 &= sel->info.colors_written;
+ if (!key->ps.epilog.last_cbuf) {
+ key->ps.epilog.spi_shader_col_format &= sel->colors_written_4bit;
+ key->ps.epilog.color_is_int8 &= sel->info.colors_written;
}
if (rs) {
@@ -714,31 +927,32 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES_ADJACENCY;
bool is_line = !is_poly && sctx->current_rast_prim != PIPE_PRIM_POINTS;
- key->ps.color_two_side = rs->two_side && sel->info.colors_read;
+ key->ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
if (sctx->queued.named.blend) {
- key->ps.alpha_to_one = sctx->queued.named.blend->alpha_to_one &&
- rs->multisample_enable &&
- !sctx->framebuffer.cb0_is_integer;
+ key->ps.epilog.alpha_to_one = sctx->queued.named.blend->alpha_to_one &&
+ rs->multisample_enable &&
+ !sctx->framebuffer.cb0_is_integer;
}
- key->ps.poly_stipple = rs->poly_stipple_enable && is_poly;
- key->ps.poly_line_smoothing = ((is_poly && rs->poly_smooth) ||
- (is_line && rs->line_smooth)) &&
- sctx->framebuffer.nr_samples <= 1;
- key->ps.clamp_color = rs->clamp_fragment_color;
-
- key->ps.force_persample_interp = rs->force_persample_interp &&
- rs->multisample_enable &&
- sctx->framebuffer.nr_samples > 1 &&
- sctx->ps_iter_samples > 1 &&
- (sel->info.uses_persp_center ||
- sel->info.uses_persp_centroid ||
- sel->info.uses_linear_center ||
- sel->info.uses_linear_centroid);
+ key->ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
+ key->ps.epilog.poly_line_smoothing = ((is_poly && rs->poly_smooth) ||
+ (is_line && rs->line_smooth)) &&
+ sctx->framebuffer.nr_samples <= 1;
+ key->ps.epilog.clamp_color = rs->clamp_fragment_color;
+
+ key->ps.prolog.force_persample_interp =
+ rs->force_persample_interp &&
+ rs->multisample_enable &&
+ sctx->framebuffer.nr_samples > 1 &&
+ sctx->ps_iter_samples > 1 &&
+ (sel->info.uses_persp_center ||
+ sel->info.uses_persp_centroid ||
+ sel->info.uses_linear_center ||
+ sel->info.uses_linear_centroid);
}
- key->ps.alpha_func = si_get_alpha_test_func(sctx);
+ key->ps.epilog.alpha_func = si_get_alpha_test_func(sctx);
break;
}
default:
@@ -821,6 +1035,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
const struct pipe_shader_state *state)
{
struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+ struct si_context *sctx = (struct si_context*)ctx;
struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector);
int i;
@@ -900,6 +1115,13 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
for (i = 0; i < 8; i++)
if (sel->info.colors_written & (1 << i))
sel->colors_written_4bit |= 0xf << (4 * i);
+
+ for (i = 0; i < sel->info.num_inputs; i++) {
+ if (sel->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR) {
+ int index = sel->info.input_semantic_index[i];
+ sel->color_attr_index[index] = i;
+ }
+ }
break;
}
@@ -921,6 +1143,44 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
break;
}
+ /* Compile the main shader part for use with a prolog and/or epilog. */
+ if (sel->type != PIPE_SHADER_GEOMETRY &&
+ !sscreen->use_monolithic_shaders) {
+ struct si_shader *shader = CALLOC_STRUCT(si_shader);
+ void *tgsi_binary;
+
+ if (!shader)
+ goto error;
+
+ shader->selector = sel;
+
+ tgsi_binary = si_get_tgsi_binary(sel);
+
+ /* Try to load the shader from the shader cache. */
+ pipe_mutex_lock(sscreen->shader_cache_mutex);
+
+ if (tgsi_binary &&
+ si_shader_cache_load_shader(sscreen, tgsi_binary, shader)) {
+ FREE(tgsi_binary);
+ } else {
+ /* Compile the shader if it hasn't been loaded from the cache. */
+ if (si_compile_tgsi_shader(sscreen, sctx->tm, shader, false,
+ &sctx->b.debug) != 0) {
+ FREE(shader);
+ FREE(tgsi_binary);
+ pipe_mutex_unlock(sscreen->shader_cache_mutex);
+ goto error;
+ }
+
+ if (tgsi_binary &&
+ !si_shader_cache_insert_shader(sscreen, tgsi_binary, shader))
+ FREE(tgsi_binary);
+ }
+ pipe_mutex_unlock(sscreen->shader_cache_mutex);
+
+ sel->main_shader_part = shader;
+ }
+
/* Pre-compilation. */
if (sel->type == PIPE_SHADER_GEOMETRY ||
sscreen->b.debug_flags & DBG_PRECOMPILE) {
@@ -934,27 +1194,29 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
*/
switch (sel->type) {
case PIPE_SHADER_TESS_CTRL:
- key.tcs.prim_mode = PIPE_PRIM_TRIANGLES;
+ key.tcs.epilog.prim_mode = PIPE_PRIM_TRIANGLES;
break;
case PIPE_SHADER_FRAGMENT:
- key.ps.alpha_func = PIPE_FUNC_ALWAYS;
+ key.ps.epilog.alpha_func = PIPE_FUNC_ALWAYS;
for (i = 0; i < 8; i++)
if (sel->info.colors_written & (1 << i))
- key.ps.spi_shader_col_format |=
+ key.ps.epilog.spi_shader_col_format |=
V_028710_SPI_SHADER_FP16_ABGR << (i * 4);
break;
}
- if (si_shader_select_with_key(ctx, &state, &key)) {
- fprintf(stderr, "radeonsi: can't create a shader\n");
- tgsi_free_tokens(sel->tokens);
- FREE(sel);
- return NULL;
- }
+ if (si_shader_select_with_key(ctx, &state, &key))
+ goto error;
}
pipe_mutex_init(sel->mutex);
return sel;
+
+error:
+ fprintf(stderr, "radeonsi: can't create a shader\n");
+ tgsi_free_tokens(sel->tokens);
+ FREE(sel);
+ return NULL;
}
/**
@@ -1119,6 +1381,9 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
p = c;
}
+ if (sel->main_shader_part)
+ si_delete_shader(sctx, sel->main_shader_part);
+
pipe_mutex_destroy(sel->mutex);
free(sel->tokens);
free(sel);
@@ -1144,14 +1409,14 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx,
for (j = 0; j < vsinfo->num_outputs; j++) {
if (name == vsinfo->output_semantic_name[j] &&
index == vsinfo->output_semantic_index[j]) {
- ps_input_cntl |= S_028644_OFFSET(vs->vs_output_param_offset[j]);
+ ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[j]);
break;
}
}
if (name == TGSI_SEMANTIC_PRIMID)
/* PrimID is written after the last output. */
- ps_input_cntl |= S_028644_OFFSET(vs->vs_output_param_offset[vsinfo->num_outputs]);
+ ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]);
else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
/* No corresponding output found, load defaults into input.
* Don't set any other bits.
@@ -1191,7 +1456,7 @@ static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
}
}
- if (ps->key.ps.color_two_side) {
+ if (ps->key.ps.prolog.color_two_side) {
unsigned bcol = TGSI_SEMANTIC_BCOLOR;
for (i = 0; i < 2; i++) {
@@ -1745,8 +2010,8 @@ bool si_update_shaders(struct si_context *sctx)
si_mark_atom_dirty(sctx, &sctx->db_render_state);
}
- if (sctx->smoothing_enabled != sctx->ps_shader.current->key.ps.poly_line_smoothing) {
- sctx->smoothing_enabled = sctx->ps_shader.current->key.ps.poly_line_smoothing;
+ if (sctx->smoothing_enabled != sctx->ps_shader.current->key.ps.epilog.poly_line_smoothing) {
+ sctx->smoothing_enabled = sctx->ps_shader.current->key.ps.epilog.poly_line_smoothing;
si_mark_atom_dirty(sctx, &sctx->msaa_config);
if (sctx->b.chip_class == SI)