diff options
Diffstat (limited to 'src/gallium/drivers/radeonsi')
-rw-r--r-- | src/gallium/drivers/radeonsi/si_compute.c | 31 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.c | 35 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.h | 26 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.c | 1784 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.h | 155 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state.c | 2 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state.h | 2 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_shaders.c | 383 |
8 files changed, 2152 insertions, 266 deletions
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 7370a113d3d..9f5f4c682bc 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -196,9 +196,7 @@ static unsigned compute_num_waves_for_scratch( } static void si_launch_grid( - struct pipe_context *ctx, - const uint *block_layout, const uint *grid_layout, - uint32_t pc, const void *input) + struct pipe_context *ctx, const struct pipe_grid_info *info) { struct si_context *sctx = (struct si_context*)ctx; struct radeon_winsys_cs *cs = sctx->b.gfx.cs; @@ -232,7 +230,7 @@ static void si_launch_grid( pm4->compute_pkt = true; /* Read the config information */ - si_shader_binary_read_config(&shader->binary, &shader->config, pc); + si_shader_binary_read_config(&shader->binary, &shader->config, info->pc); /* Upload the kernel arguments */ @@ -242,15 +240,16 @@ static void si_launch_grid( kernel_args = sctx->b.ws->buffer_map(input_buffer->buf, sctx->b.gfx.cs, PIPE_TRANSFER_WRITE); for (i = 0; i < 3; i++) { - kernel_args[i] = grid_layout[i]; - kernel_args[i + 3] = grid_layout[i] * block_layout[i]; - kernel_args[i + 6] = block_layout[i]; + kernel_args[i] = info->grid[i]; + kernel_args[i + 3] = info->grid[i] * info->block[i]; + kernel_args[i + 6] = info->block[i]; } num_waves_for_scratch = compute_num_waves_for_scratch( - &sctx->screen->b.info, block_layout, grid_layout); + &sctx->screen->b.info, info->block, info->grid); - memcpy(kernel_args + (num_work_size_bytes / 4), input, program->input_size); + memcpy(kernel_args + (num_work_size_bytes / 4), info->input, + program->input_size); if (shader->config.scratch_bytes_per_wave > 0) { @@ -291,11 +290,11 @@ static void si_launch_grid( si_pm4_set_reg(pm4, R_00B818_COMPUTE_START_Z, 0); si_pm4_set_reg(pm4, R_00B81C_COMPUTE_NUM_THREAD_X, - S_00B81C_NUM_THREAD_FULL(block_layout[0])); + S_00B81C_NUM_THREAD_FULL(info->block[0])); si_pm4_set_reg(pm4, R_00B820_COMPUTE_NUM_THREAD_Y, - S_00B820_NUM_THREAD_FULL(block_layout[1])); + S_00B820_NUM_THREAD_FULL(info->block[1])); si_pm4_set_reg(pm4, R_00B824_COMPUTE_NUM_THREAD_Z, - S_00B824_NUM_THREAD_FULL(block_layout[2])); + S_00B824_NUM_THREAD_FULL(info->block[2])); /* Global buffers */ for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) { @@ -323,7 +322,7 @@ static void si_launch_grid( } shader_va = shader->bo->gpu_address; - shader_va += pc; + shader_va += info->pc; radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER); @@ -375,9 +374,9 @@ static void si_launch_grid( ; si_pm4_cmd_begin(pm4, PKT3_DISPATCH_DIRECT); - si_pm4_cmd_add(pm4, grid_layout[0]); /* Thread groups DIM_X */ - si_pm4_cmd_add(pm4, grid_layout[1]); /* Thread groups DIM_Y */ - si_pm4_cmd_add(pm4, grid_layout[2]); /* Thread gropus DIM_Z */ + si_pm4_cmd_add(pm4, info->grid[0]); /* Thread groups DIM_X */ + si_pm4_cmd_add(pm4, info->grid[1]); /* Thread groups DIM_Y */ + si_pm4_cmd_add(pm4, info->grid[2]); /* Thread gropus DIM_Z */ si_pm4_cmd_add(pm4, 1); /* DISPATCH_INITIATOR */ si_pm4_cmd_end(pm4, false); diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index e9d69d2db38..37fd4a25d59 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -22,6 +22,7 @@ */ #include "si_pipe.h" +#include "si_shader.h" #include "si_public.h" #include "sid.h" @@ -448,6 +449,10 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu switch (param) { case PIPE_SHADER_CAP_PREFERRED_IR: return PIPE_SHADER_IR_NATIVE; + + case PIPE_SHADER_CAP_SUPPORTED_IRS: + return 0; + case PIPE_SHADER_CAP_DOUBLES: return HAVE_LLVM >= 0x0307; @@ -511,6 +516,8 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu return 16; case PIPE_SHADER_CAP_PREFERRED_IR: return PIPE_SHADER_IR_TGSI; + case PIPE_SHADER_CAP_SUPPORTED_IRS: + return 0; case PIPE_SHADER_CAP_DOUBLES: return HAVE_LLVM >= 0x0307; case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: @@ -522,6 +529,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: + case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: return 0; } return 0; @@ -530,6 +538,14 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu static void si_destroy_screen(struct pipe_screen* pscreen) { struct si_screen *sscreen = (struct si_screen *)pscreen; + struct si_shader_part *parts[] = { + sscreen->vs_prologs, + sscreen->vs_epilogs, + sscreen->tcs_epilogs, + sscreen->ps_prologs, + sscreen->ps_epilogs + }; + unsigned i; if (!sscreen) return; @@ -537,6 +553,18 @@ static void si_destroy_screen(struct pipe_screen* pscreen) if (!sscreen->b.ws->unref(sscreen->b.ws)) return; + /* Free shader parts. */ + for (i = 0; i < ARRAY_SIZE(parts); i++) { + while (parts[i]) { + struct si_shader_part *part = parts[i]; + + parts[i] = part->next; + radeon_shader_binary_clean(&part->binary); + FREE(part); + } + } + pipe_mutex_destroy(sscreen->shader_parts_mutex); + si_destroy_shader_cache(sscreen); r600_destroy_common_screen(&sscreen->b); } @@ -584,7 +612,8 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) sscreen->b.b.resource_create = r600_resource_create_common; if (!r600_common_screen_init(&sscreen->b, ws) || - !si_init_gs_info(sscreen)) { + !si_init_gs_info(sscreen) || + !si_init_shader_cache(sscreen)) { FREE(sscreen); return NULL; } @@ -594,6 +623,10 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) sscreen->b.has_cp_dma = true; sscreen->b.has_streamout = true; + pipe_mutex_init(sscreen->shader_parts_mutex); + sscreen->use_monolithic_shaders = + HAVE_LLVM < 0x0308 || + (sscreen->b.debug_flags & DBG_MONOLITHIC_SHADERS) != 0; if (debug_get_bool_option("RADEON_DUMP_SHADERS", FALSE)) sscreen->b.debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | DBG_CS; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index b5790d6b564..ef860a58b83 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -80,10 +80,36 @@ #define SI_MAX_BORDER_COLORS 4096 struct si_compute; +struct hash_table; struct si_screen { struct r600_common_screen b; unsigned gs_table_depth; + + /* Whether shaders are monolithic (1-part) or separate (3-part). */ + bool use_monolithic_shaders; + + pipe_mutex shader_parts_mutex; + struct si_shader_part *vs_prologs; + struct si_shader_part *vs_epilogs; + struct si_shader_part *tcs_epilogs; + struct si_shader_part *ps_prologs; + struct si_shader_part *ps_epilogs; + + /* Shader cache in memory. + * + * Design & limitations: + * - The shader cache is per screen (= per process), never saved to + * disk, and skips redundant shader compilations from TGSI to bytecode. + * - It can only be used with one-variant-per-shader support, in which + * case only the main (typically middle) part of shaders is cached. + * - Only VS, TCS, TES, PS are cached, out of which only the hw VS + * variants of VS and TES are cached, so LS and ES aren't. + * - GS and CS aren't cached, but it's certainly possible to cache + * those as well. + */ + pipe_mutex shader_cache_mutex; + struct hash_table *shader_cache; }; struct si_blend_color { diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index baa1090e2fb..57458ae1381 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -70,6 +70,12 @@ struct si_shader_context unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */ bool is_gs_copy_shader; + + /* Whether to generate the optimized shader variant compiled as a whole + * (without a prolog and epilog) + */ + bool is_monolithic; + int param_streamout_config; int param_streamout_write_index; int param_streamout_offset[4]; @@ -77,6 +83,7 @@ struct si_shader_context int param_rel_auto_id; int param_vs_prim_id; int param_instance_id; + int param_vertex_index0; int param_tes_u; int param_tes_v; int param_tes_rel_patch_id; @@ -96,14 +103,17 @@ struct si_shader_context LLVMValueRef esgs_ring; LLVMValueRef gsvs_ring[4]; LLVMValueRef gs_next_vertex[4]; + LLVMValueRef return_value; LLVMTypeRef voidt; LLVMTypeRef i1; LLVMTypeRef i8; LLVMTypeRef i32; + LLVMTypeRef i64; LLVMTypeRef i128; LLVMTypeRef f32; LLVMTypeRef v16i8; + LLVMTypeRef v2i32; LLVMTypeRef v4i32; LLVMTypeRef v4f32; LLVMTypeRef v8i32; @@ -118,9 +128,17 @@ static struct si_shader_context *si_shader_context( static void si_init_shader_ctx(struct si_shader_context *ctx, struct si_screen *sscreen, struct si_shader *shader, - LLVMTargetMachineRef tm, - struct tgsi_shader_info *info); + LLVMTargetMachineRef tm); +/* Ideally pass the sample mask input to the PS epilog as v13, which + * is its usual location, so that the shader doesn't have to add v_mov. + */ +#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13 + +/* The VS location of the PrimitiveID input is the same in the epilog, + * so that the main shader part doesn't have to move it. + */ +#define VS_EPILOG_PRIMID_LOC 2 #define PERSPECTIVE_BASE 0 #define LINEAR_BASE 9 @@ -196,6 +214,10 @@ static LLVMValueRef unpack_param(struct si_shader_context *ctx, LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn, param); + if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) + value = bitcast(&ctx->radeon_bld.soa.bld_base, + TGSI_TYPE_UNSIGNED, value); + if (rshift) value = LLVMBuildLShr(gallivm->builder, value, lp_build_const_int32(gallivm, rshift), ""); @@ -375,7 +397,7 @@ static LLVMValueRef build_indexed_load_const( static LLVMValueRef get_instance_index_for_fetch( struct radeon_llvm_context *radeon_bld, - unsigned divisor) + unsigned param_start_instance, unsigned divisor) { struct si_shader_context *ctx = si_shader_context(&radeon_bld->soa.bld_base); @@ -389,8 +411,8 @@ static LLVMValueRef get_instance_index_for_fetch( result = LLVMBuildUDiv(gallivm->builder, result, lp_build_const_int32(gallivm, divisor), ""); - return LLVMBuildAdd(gallivm->builder, result, LLVMGetParam( - radeon_bld->main_fn, SI_PARAM_START_INSTANCE), ""); + return LLVMBuildAdd(gallivm->builder, result, + LLVMGetParam(radeon_bld->main_fn, param_start_instance), ""); } static void declare_input_vs( @@ -402,7 +424,8 @@ static void declare_input_vs( struct gallivm_state *gallivm = base->gallivm; struct si_shader_context *ctx = si_shader_context(&radeon_bld->soa.bld_base); - unsigned divisor = ctx->shader->key.vs.instance_divisors[input_index]; + unsigned divisor = + ctx->shader->key.vs.prolog.instance_divisors[input_index]; unsigned chan; @@ -424,10 +447,16 @@ static void declare_input_vs( /* Build the attribute offset */ attribute_offset = lp_build_const_int32(gallivm, 0); - if (divisor) { + if (!ctx->is_monolithic) { + buffer_index = LLVMGetParam(radeon_bld->main_fn, + ctx->param_vertex_index0 + + input_index); + } else if (divisor) { /* Build index from instance ID, start instance and divisor */ - ctx->shader->uses_instanceid = true; - buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld, divisor); + ctx->shader->info.uses_instanceid = true; + buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld, + SI_PARAM_START_INSTANCE, + divisor); } else { /* Load the buffer index for vertices. */ LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn, @@ -853,7 +882,8 @@ static int lookup_interp_param_index(unsigned interpolate, unsigned location) static unsigned select_interp_param(struct si_shader_context *ctx, unsigned param) { - if (!ctx->shader->key.ps.force_persample_interp) + if (!ctx->shader->key.ps.prolog.force_persample_interp || + !ctx->is_monolithic) return param; /* If the shader doesn't use center/centroid, just return the parameter. @@ -923,7 +953,7 @@ static void interp_fs_input(struct si_shader_context *ctx, intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant"; if (semantic_name == TGSI_SEMANTIC_COLOR && - ctx->shader->key.ps.color_two_side) { + ctx->shader->key.ps.prolog.color_two_side) { LLVMValueRef args[4]; LLVMValueRef is_face_positive; LLVMValueRef back_attr_number; @@ -997,6 +1027,7 @@ static void declare_input_fs( unsigned input_index, const struct tgsi_full_declaration *decl) { + struct lp_build_context *base = &radeon_bld->soa.bld_base.base; struct si_shader_context *ctx = si_shader_context(&radeon_bld->soa.bld_base); struct si_shader *shader = ctx->shader; @@ -1004,6 +1035,26 @@ static void declare_input_fs( LLVMValueRef interp_param = NULL; int interp_param_idx; + /* Get colors from input VGPRs (set by the prolog). */ + if (!ctx->is_monolithic && + decl->Semantic.Name == TGSI_SEMANTIC_COLOR) { + unsigned i = decl->Semantic.Index; + unsigned colors_read = shader->selector->info.colors_read; + unsigned mask = colors_read >> (i * 4); + unsigned offset = SI_PARAM_POS_FIXED_PT + 1 + + (i ? util_bitcount(colors_read & 0xf) : 0); + + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] = + mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef; + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] = + mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef; + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] = + mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef; + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] = + mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef; + return; + } + interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate, decl->Interp.Location); if (interp_param_idx == -1) @@ -1330,12 +1381,12 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { const union si_shader_key *key = &ctx->shader->key; - unsigned col_formats = key->ps.spi_shader_col_format; + unsigned col_formats = key->ps.epilog.spi_shader_col_format; int cbuf = target - V_008DFC_SQ_EXP_MRT; assert(cbuf >= 0 && cbuf < 8); spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; - is_int8 = (key->ps.color_is_int8 >> cbuf) & 0x1; + is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1; } args[4] = uint->zero; /* COMPR flag */ @@ -1488,13 +1539,13 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base, struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; - if (ctx->shader->key.ps.alpha_func != PIPE_FUNC_NEVER) { + if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_ALPHA_REF); LLVMValueRef alpha_pass = lp_build_cmp(&bld_base->base, - ctx->shader->key.ps.alpha_func, + ctx->shader->key.ps.epilog.alpha_func, alpha, alpha_ref); LLVMValueRef arg = lp_build_select(&bld_base->base, @@ -1511,7 +1562,8 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base, } static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base, - LLVMValueRef alpha) + LLVMValueRef alpha, + unsigned samplemask_param) { struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; @@ -1519,7 +1571,7 @@ static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context * /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ coverage = LLVMGetParam(ctx->radeon_bld.main_fn, - SI_PARAM_SAMPLE_COVERAGE); + samplemask_param); coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage); coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32", @@ -1841,7 +1893,8 @@ handle_semantic: case TGSI_SEMANTIC_COLOR: case TGSI_SEMANTIC_BCOLOR: target = V_008DFC_SQ_EXP_PARAM + param_count; - shader->vs_output_param_offset[i] = param_count; + assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); + shader->info.vs_output_param_offset[i] = param_count; param_count++; break; case TGSI_SEMANTIC_CLIPDIST: @@ -1855,7 +1908,8 @@ handle_semantic: case TGSI_SEMANTIC_TEXCOORD: case TGSI_SEMANTIC_GENERIC: target = V_008DFC_SQ_EXP_PARAM + param_count; - shader->vs_output_param_offset[i] = param_count; + assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); + shader->info.vs_output_param_offset[i] = param_count; param_count++; break; default: @@ -1883,7 +1937,7 @@ handle_semantic: } } - shader->nr_param_exports = param_count; + shader->info.nr_param_exports = param_count; /* We need to add the position output manually if it's missing. */ if (!pos_args[0][0]) { @@ -1945,7 +1999,7 @@ handle_semantic: for (i = 0; i < 4; i++) if (pos_args[i][0]) - shader->nr_pos_exports++; + shader->info.nr_pos_exports++; pos_idx = 0; for (i = 0; i < 4; i++) { @@ -1955,7 +2009,7 @@ handle_semantic: /* Specify the target we are exporting */ pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++); - if (pos_idx == shader->nr_pos_exports) + if (pos_idx == shader->info.nr_pos_exports) /* Specify that this is the last export */ pos_args[i][2] = uint->one; @@ -1989,7 +2043,7 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, invocation_id, bld_base->uint_bld.zero, "")); /* Determine the layout of one tess factor element in the buffer. */ - switch (shader->key.tcs.prim_mode) { + switch (shader->key.tcs.epilog.prim_mode) { case PIPE_PRIM_LINES: stride = 2; /* 2 dwords, 1 vec2 store */ outer_comps = 2; @@ -2061,14 +2115,51 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef invocation_id; + LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; + rel_patch_id = get_rel_patch_id(ctx); invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5); + tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); - si_write_tess_factors(bld_base, - get_rel_patch_id(ctx), - invocation_id, - get_tcs_out_current_patch_data_offset(ctx)); + if (!ctx->is_monolithic) { + /* Return epilog parameters from this function. */ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef ret = ctx->return_value; + LLVMValueRef rw_buffers, rw0, rw1, tf_soffset; + unsigned vgpr; + + /* RW_BUFFERS pointer */ + rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_RW_BUFFERS); + rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, ""); + rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, ""); + rw0 = LLVMBuildExtractElement(builder, rw_buffers, + bld_base->uint_bld.zero, ""); + rw1 = LLVMBuildExtractElement(builder, rw_buffers, + bld_base->uint_bld.one, ""); + ret = LLVMBuildInsertValue(builder, ret, rw0, 0, ""); + ret = LLVMBuildInsertValue(builder, ret, rw1, 1, ""); + + /* Tess factor buffer soffset is after user SGPRs. */ + tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_TESS_FACTOR_OFFSET); + ret = LLVMBuildInsertValue(builder, ret, tf_soffset, + SI_TCS_NUM_USER_SGPR, ""); + + /* VGPRs */ + rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id); + invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id); + tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset); + + vgpr = SI_TCS_NUM_USER_SGPR + 1; + ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); + ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); + ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); + ctx->return_value = ret; + return; + } + + si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset); } static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base) @@ -2214,16 +2305,26 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base) ""); } - /* Export PrimitiveID when PS needs it. */ - if (si_vs_exports_prim_id(ctx->shader)) { - outputs[i].name = TGSI_SEMANTIC_PRIMID; - outputs[i].sid = 0; - outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, - get_primitive_id(bld_base, 0)); - outputs[i].values[1] = bld_base->base.undef; - outputs[i].values[2] = bld_base->base.undef; - outputs[i].values[3] = bld_base->base.undef; - i++; + if (ctx->is_monolithic) { + /* Export PrimitiveID when PS needs it. */ + if (si_vs_exports_prim_id(ctx->shader)) { + outputs[i].name = TGSI_SEMANTIC_PRIMID; + outputs[i].sid = 0; + outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, + get_primitive_id(bld_base, 0)); + outputs[i].values[1] = bld_base->base.undef; + outputs[i].values[2] = bld_base->base.undef; + outputs[i].values[3] = bld_base->base.undef; + i++; + } + } else { + /* Return the primitive ID from the LLVM function. */ + ctx->return_value = + LLVMBuildInsertValue(gallivm->builder, + ctx->return_value, + bitcast(bld_base, TGSI_TYPE_FLOAT, + get_primitive_id(bld_base, 0)), + VS_EPILOG_PRIMID_LOC, ""); } si_llvm_export_vs(bld_base, outputs, i); @@ -2284,6 +2385,7 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base, static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, LLVMValueRef *color, unsigned index, + unsigned samplemask_param, bool is_last) { struct si_shader_context *ctx = si_shader_context(bld_base); @@ -2291,30 +2393,31 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, int i; /* Clamp color */ - if (ctx->shader->key.ps.clamp_color) + if (ctx->shader->key.ps.epilog.clamp_color) for (i = 0; i < 4; i++) color[i] = radeon_llvm_saturate(bld_base, color[i]); /* Alpha to one */ - if (ctx->shader->key.ps.alpha_to_one) + if (ctx->shader->key.ps.epilog.alpha_to_one) color[3] = base->one; /* Alpha test */ if (index == 0 && - ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS) + ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) si_alpha_test(bld_base, color[3]); /* Line & polygon smoothing */ - if (ctx->shader->key.ps.poly_line_smoothing) - color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]); + if (ctx->shader->key.ps.epilog.poly_line_smoothing) + color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3], + samplemask_param); /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ - if (ctx->shader->key.ps.last_cbuf > 0) { + if (ctx->shader->key.ps.epilog.last_cbuf > 0) { LLVMValueRef args[8][9]; int c, last = -1; /* Get the export arguments, also find out what the last one is. */ - for (c = 0; c <= ctx->shader->key.ps.last_cbuf; c++) { + for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) { si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + c, args[c]); if (args[c][0] != bld_base->uint_bld.zero) @@ -2322,7 +2425,7 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, } /* Emit all exports. */ - for (c = 0; c <= ctx->shader->key.ps.last_cbuf; c++) { + for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) { if (is_last && last == c) { args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */ args[c][2] = bld_base->uint_bld.one; /* DONE bit */ @@ -2385,11 +2488,11 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base) * Otherwise, find the last color export. */ if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) { - unsigned spi_format = shader->key.ps.spi_shader_col_format; + unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format; /* Don't export NULL and return if alpha-test is enabled. */ - if (shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS && - shader->key.ps.alpha_func != PIPE_FUNC_NEVER && + if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS && + shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER && (spi_format & 0xf) == 0) spi_format |= V_028714_SPI_SHADER_32_AR; @@ -2400,10 +2503,10 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base) continue; /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ - if (shader->key.ps.last_cbuf > 0) { + if (shader->key.ps.epilog.last_cbuf > 0) { /* Just set this if any of the colorbuffers are enabled. */ if (spi_format & - ((1llu << (4 * (shader->key.ps.last_cbuf + 1))) - 1)) + ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1)) last_color_export = i; continue; } @@ -2445,6 +2548,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base) ctx->radeon_bld.soa.outputs[i][j], ""); si_export_mrt_color(bld_base, color, semantic_index, + SI_PARAM_SAMPLE_COVERAGE, last_color_export == i); break; default: @@ -2458,6 +2562,100 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base) si_export_mrt_z(bld_base, depth, stencil, samplemask); } +/** + * Return PS outputs in this order: + * + * v[0:3] = color0.xyzw + * v[4:7] = color1.xyzw + * ... + * vN+0 = Depth + * vN+1 = Stencil + * vN+2 = SampleMask + * vN+3 = SampleMaskIn (used for OpenGL smoothing) + * + * The alpha-ref SGPR is returned via its original location. + */ +static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + struct si_shader *shader = ctx->shader; + struct lp_build_context *base = &bld_base->base; + struct tgsi_shader_info *info = &shader->selector->info; + LLVMBuilderRef builder = base->gallivm->builder; + unsigned i, j, first_vgpr, vgpr; + + LLVMValueRef color[8][4] = {}; + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + LLVMValueRef ret; + + /* Read the output values. */ + for (i = 0; i < info->num_outputs; i++) { + unsigned semantic_name = info->output_semantic_name[i]; + unsigned semantic_index = info->output_semantic_index[i]; + + switch (semantic_name) { + case TGSI_SEMANTIC_COLOR: + assert(semantic_index < 8); + for (j = 0; j < 4; j++) { + LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j]; + LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); + color[semantic_index][j] = result; + } + break; + case TGSI_SEMANTIC_POSITION: + depth = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][2], ""); + break; + case TGSI_SEMANTIC_STENCIL: + stencil = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][1], ""); + break; + case TGSI_SEMANTIC_SAMPLEMASK: + samplemask = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][0], ""); + break; + default: + fprintf(stderr, "Warning: SI unhandled fs output type:%d\n", + semantic_name); + } + } + + /* Fill the return structure. */ + ret = ctx->return_value; + + /* Set SGPRs. */ + ret = LLVMBuildInsertValue(builder, ret, + bitcast(bld_base, TGSI_TYPE_SIGNED, + LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_ALPHA_REF)), + SI_SGPR_ALPHA_REF, ""); + + /* Set VGPRs */ + first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; + for (i = 0; i < ARRAY_SIZE(color); i++) { + if (!color[i][0]) + continue; + + for (j = 0; j < 4; j++) + ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); + } + if (depth) + ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); + if (stencil) + ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); + if (samplemask) + ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); + + /* Add the input sample mask for smoothing at the end. */ + if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) + vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; + ret = LLVMBuildInsertValue(builder, ret, + LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_SAMPLE_COVERAGE), vgpr++, ""); + + ctx->return_value = ret; +} + static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data); @@ -2536,13 +2734,12 @@ static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements) /** * Load an image view, fmask view. or sampler state descriptor. */ -static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx, - LLVMValueRef index, enum desc_type type) +static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx, + LLVMValueRef list, LLVMValueRef index, + enum desc_type type) { struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, - SI_PARAM_SAMPLERS); switch (type) { case DESC_IMAGE: @@ -2558,12 +2755,21 @@ static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx, /* The sampler state is at [12:15]. */ index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), ""); - ptr = LLVMBuildPointerCast(builder, ptr, - const_array(ctx->v4i32, 0), ""); + list = LLVMBuildPointerCast(builder, list, + const_array(ctx->v4i32, 0), ""); break; } - return build_indexed_load_const(ctx, ptr, index); + return build_indexed_load_const(ctx, list, index); +} + +static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx, + LLVMValueRef index, enum desc_type type) +{ + LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_SAMPLERS); + + return get_sampler_desc_custom(ctx, list, index, type); } static void tex_fetch_ptrs( @@ -3546,6 +3752,30 @@ static const struct lp_build_tgsi_action interp_action = { .emit = build_interp_intrinsic, }; +static void si_create_function(struct si_shader_context *ctx, + LLVMTypeRef *returns, unsigned num_returns, + LLVMTypeRef *params, unsigned num_params, + int last_array_pointer, int last_sgpr) +{ + int i; + + radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns, + params, num_params); + radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type); + ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type); + + for (i = 0; i <= last_sgpr; ++i) { + LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i); + + /* We tell llvm that array inputs are passed by value to allow Sinking pass + * to move load. Inputs are constant so this is fine. */ + if (i <= last_array_pointer) + LLVMAddAttribute(P, LLVMByValAttribute); + else + LLVMAddAttribute(P, LLVMInRegAttribute); + } +} + static void create_meta_data(struct si_shader_context *ctx) { struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm; @@ -3579,15 +3809,57 @@ static void declare_streamout_params(struct si_shader_context *ctx, } } +static unsigned llvm_get_type_size(LLVMTypeRef type) +{ + LLVMTypeKind kind = LLVMGetTypeKind(type); + + switch (kind) { + case LLVMIntegerTypeKind: + return LLVMGetIntTypeWidth(type) / 8; + case LLVMFloatTypeKind: + return 4; + case LLVMPointerTypeKind: + return 8; + case LLVMVectorTypeKind: + return LLVMGetVectorSize(type) * + llvm_get_type_size(LLVMGetElementType(type)); + default: + assert(0); + return 0; + } +} + +static void declare_tess_lds(struct si_shader_context *ctx) +{ + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; + LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type; + + /* This is the upper bound, maximum is 32 inputs times 32 vertices */ + unsigned vertex_data_dw_size = 32*32*4; + unsigned patch_data_dw_size = 32*4; + /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */ + unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size; + unsigned lds_dwords = patch_dw_size; + + /* The actual size is computed outside of the shader to reduce + * the number of shader variants. */ + ctx->lds = + LLVMAddGlobalInAddressSpace(gallivm->module, + LLVMArrayType(i32, lds_dwords), + "tess_lds", + LOCAL_ADDR_SPACE); +} + static void create_function(struct si_shader_context *ctx) { struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; struct gallivm_state *gallivm = bld_base->base.gallivm; struct si_shader *shader = ctx->shader; - LLVMTypeRef params[SI_NUM_PARAMS], v2i32, v3i32; - unsigned i, last_array_pointer, last_sgpr, num_params; + LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32; + LLVMTypeRef returns[16+32*4]; + unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs; + unsigned num_returns = 0; - v2i32 = LLVMVectorType(ctx->i32, 2); v3i32 = LLVMVectorType(ctx->i32, 3); params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS); @@ -3630,6 +3902,20 @@ static void create_function(struct si_shader_context *ctx) params[ctx->param_rel_auto_id = num_params++] = ctx->i32; params[ctx->param_vs_prim_id = num_params++] = ctx->i32; params[ctx->param_instance_id = num_params++] = ctx->i32; + + if (!ctx->is_monolithic && + !ctx->is_gs_copy_shader) { + /* Vertex load indices. */ + ctx->param_vertex_index0 = num_params; + + for (i = 0; i < shader->selector->info.num_inputs; i++) + params[num_params++] = ctx->i32; + + /* PrimitiveID output. */ + if (!shader->key.vs.as_es && !shader->key.vs.as_ls) + for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++) + returns[num_returns++] = ctx->f32; + } break; case TGSI_PROCESSOR_TESS_CTRL: @@ -3643,6 +3929,15 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_PATCH_ID] = ctx->i32; params[SI_PARAM_REL_IDS] = ctx->i32; num_params = SI_PARAM_REL_IDS+1; + + if (!ctx->is_monolithic) { + /* PARAM_TESS_FACTOR_OFFSET is after user SGPRs. */ + for (i = 0; i <= SI_TCS_NUM_USER_SGPR; i++) + returns[num_returns++] = ctx->i32; /* SGPRs */ + + for (i = 0; i < 3; i++) + returns[num_returns++] = ctx->f32; /* VGPRs */ + } break; case TGSI_PROCESSOR_TESS_EVAL: @@ -3663,6 +3958,11 @@ static void create_function(struct si_shader_context *ctx) params[ctx->param_tes_v = num_params++] = ctx->f32; params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32; params[ctx->param_tes_patch_id = num_params++] = ctx->i32; + + /* PrimitiveID output. */ + if (!ctx->is_monolithic && !shader->key.tes.as_es) + for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++) + returns[num_returns++] = ctx->f32; break; case TGSI_PROCESSOR_GEOMETRY: @@ -3686,13 +3986,13 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_ALPHA_REF] = ctx->f32; params[SI_PARAM_PRIM_MASK] = ctx->i32; last_sgpr = SI_PARAM_PRIM_MASK; - params[SI_PARAM_PERSP_SAMPLE] = v2i32; - params[SI_PARAM_PERSP_CENTER] = v2i32; - params[SI_PARAM_PERSP_CENTROID] = v2i32; + params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32; + params[SI_PARAM_PERSP_CENTER] = ctx->v2i32; + params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32; params[SI_PARAM_PERSP_PULL_MODEL] = v3i32; - params[SI_PARAM_LINEAR_SAMPLE] = v2i32; - params[SI_PARAM_LINEAR_CENTER] = v2i32; - params[SI_PARAM_LINEAR_CENTROID] = v2i32; + params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32; + params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32; + params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32; params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32; params[SI_PARAM_POS_X_FLOAT] = ctx->f32; params[SI_PARAM_POS_Y_FLOAT] = ctx->f32; @@ -3701,8 +4001,39 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_FRONT_FACE] = ctx->i32; params[SI_PARAM_ANCILLARY] = ctx->i32; params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32; - params[SI_PARAM_POS_FIXED_PT] = ctx->f32; + params[SI_PARAM_POS_FIXED_PT] = ctx->i32; num_params = SI_PARAM_POS_FIXED_PT+1; + + if (!ctx->is_monolithic) { + /* Color inputs from the prolog. */ + if (shader->selector->info.colors_read) { + unsigned num_color_elements = + util_bitcount(shader->selector->info.colors_read); + + assert(num_params + num_color_elements <= ARRAY_SIZE(params)); + for (i = 0; i < num_color_elements; i++) + params[num_params++] = ctx->f32; + } + + /* Outputs for the epilog. */ + num_return_sgprs = SI_SGPR_ALPHA_REF + 1; + num_returns = + num_return_sgprs + + util_bitcount(shader->selector->info.colors_written) * 4 + + shader->selector->info.writes_z + + shader->selector->info.writes_stencil + + shader->selector->info.writes_samplemask + + 1 /* SampleMaskIn */; + + num_returns = MAX2(num_returns, + num_return_sgprs + + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + for (i = 0; i < num_return_sgprs; i++) + returns[i] = ctx->i32; + for (; i < num_returns; i++) + returns[i] = ctx->f32; + } break; default: @@ -3711,20 +4042,38 @@ static void create_function(struct si_shader_context *ctx) } assert(num_params <= Elements(params)); - radeon_llvm_create_func(&ctx->radeon_bld, params, num_params); - radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type); - - for (i = 0; i <= last_sgpr; ++i) { - LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i); - /* We tell llvm that array inputs are passed by value to allow Sinking pass - * to move load. Inputs are constant so this is fine. */ - if (i <= last_array_pointer) - LLVMAddAttribute(P, LLVMByValAttribute); - else - LLVMAddAttribute(P, LLVMInRegAttribute); + si_create_function(ctx, returns, num_returns, params, + num_params, last_array_pointer, last_sgpr); + + /* Reserve register locations for VGPR inputs the PS prolog may need. */ + if (ctx->type == TGSI_PROCESSOR_FRAGMENT && + !ctx->is_monolithic) { + radeon_llvm_add_attribute(ctx->radeon_bld.main_fn, + "InitialPSInputAddr", + S_0286D0_PERSP_SAMPLE_ENA(1) | + S_0286D0_PERSP_CENTER_ENA(1) | + S_0286D0_PERSP_CENTROID_ENA(1) | + S_0286D0_LINEAR_SAMPLE_ENA(1) | + S_0286D0_LINEAR_CENTER_ENA(1) | + S_0286D0_LINEAR_CENTROID_ENA(1) | + S_0286D0_FRONT_FACE_ENA(1) | + S_0286D0_POS_FIXED_PT_ENA(1)); } + shader->info.num_input_sgprs = 0; + shader->info.num_input_vgprs = 0; + + for (i = 0; i <= last_sgpr; ++i) + shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4; + + /* Unused fragment shader inputs are eliminated by the compiler, + * so we don't know yet how many there will be. + */ + if (ctx->type != TGSI_PROCESSOR_FRAGMENT) + for (; i < num_params; ++i) + shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4; + if (bld_base->info && (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 || bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 || @@ -3740,22 +4089,8 @@ static void create_function(struct si_shader_context *ctx) if ((ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) || ctx->type == TGSI_PROCESSOR_TESS_CTRL || - ctx->type == TGSI_PROCESSOR_TESS_EVAL) { - /* This is the upper bound, maximum is 32 inputs times 32 vertices */ - unsigned vertex_data_dw_size = 32*32*4; - unsigned patch_data_dw_size = 32*4; - /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */ - unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size; - unsigned lds_dwords = patch_dw_size; - - /* The actual size is computed outside of the shader to reduce - * the number of shader variants. */ - ctx->lds = - LLVMAddGlobalInAddressSpace(gallivm->module, - LLVMArrayType(ctx->i32, lds_dwords), - "tess_lds", - LOCAL_ADDR_SPACE); - } + ctx->type == TGSI_PROCESSOR_TESS_EVAL) + declare_tess_lds(ctx); } static void preload_constants(struct si_shader_context *ctx) @@ -3887,6 +4222,49 @@ static void preload_ring_buffers(struct si_shader_context *ctx) } } +static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, + LLVMValueRef param_sampler_views, + unsigned param_pos_fixed_pt) +{ + struct lp_build_tgsi_context *bld_base = + &ctx->radeon_bld.soa.bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; + struct lp_build_emit_data result = {}; + struct tgsi_full_instruction inst = {}; + LLVMValueRef desc, sampler_index, address[2], pix; + + /* Use the fixed-point gl_FragCoord input. + * Since the stipple pattern is 32x32 and it repeats, just get 5 bits + * per coordinate to get the repeating effect. + */ + address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5); + address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5); + + /* Load the sampler view descriptor. */ + sampler_index = lp_build_const_int32(gallivm, SI_POLY_STIPPLE_SAMPLER); + desc = get_sampler_desc_custom(ctx, param_sampler_views, + sampler_index, DESC_IMAGE); + + /* Load the texel. */ + inst.Instruction.Opcode = TGSI_OPCODE_TXF; + inst.Texture.Texture = TGSI_TEXTURE_2D_MSAA; /* = use load, not load_mip */ + result.inst = &inst; + set_tex_fetch_args(ctx, &result, TGSI_OPCODE_TXF, + inst.Texture.Texture, + desc, NULL, address, ARRAY_SIZE(address), 0xf); + build_tex_intrinsic(&tex_action, bld_base, &result); + + /* Kill the thread accordingly. */ + pix = LLVMBuildExtractElement(gallivm->builder, result.output[0], + lp_build_const_int32(gallivm, 3), ""); + pix = bitcast(bld_base, TGSI_TYPE_FLOAT, pix); + pix = LLVMBuildFNeg(gallivm->builder, pix, ""); + + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", + LLVMVoidTypeInContext(gallivm->context), + &pix, 1, 0); +} + void si_shader_binary_read_config(struct radeon_shader_binary *binary, struct si_shader_config *conf, unsigned symbol_offset) @@ -3972,41 +4350,70 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx, } } +static unsigned si_get_shader_binary_size(struct si_shader *shader) +{ + unsigned size = shader->binary.code_size; + + if (shader->prolog) + size += shader->prolog->binary.code_size; + if (shader->epilog) + size += shader->epilog->binary.code_size; + return size; +} + int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) { - const struct radeon_shader_binary *binary = &shader->binary; - unsigned code_size = binary->code_size + binary->rodata_size; + const struct radeon_shader_binary *prolog = + shader->prolog ? &shader->prolog->binary : NULL; + const struct radeon_shader_binary *epilog = + shader->epilog ? &shader->epilog->binary : NULL; + const struct radeon_shader_binary *mainb = &shader->binary; + unsigned bo_size = si_get_shader_binary_size(shader) + + (!epilog ? mainb->rodata_size : 0); unsigned char *ptr; + assert(!prolog || !prolog->rodata_size); + assert((!prolog && !epilog) || !mainb->rodata_size); + assert(!epilog || !epilog->rodata_size); + r600_resource_reference(&shader->bo, NULL); shader->bo = si_resource_create_custom(&sscreen->b.b, PIPE_USAGE_IMMUTABLE, - code_size); + bo_size); if (!shader->bo) return -ENOMEM; + /* Upload. */ ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL, PIPE_TRANSFER_READ_WRITE); - util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size); - if (binary->rodata_size > 0) { - ptr += binary->code_size; - util_memcpy_cpu_to_le32(ptr, binary->rodata, - binary->rodata_size); + + if (prolog) { + util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size); + ptr += prolog->code_size; } + util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size); + ptr += mainb->code_size; + + if (epilog) + util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size); + else if (mainb->rodata_size > 0) + util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size); + sscreen->b.ws->buffer_unmap(shader->bo->buf); return 0; } static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary, - struct pipe_debug_callback *debug) + struct pipe_debug_callback *debug, + const char *name) { char *line, *p; unsigned i, count; if (binary->disasm_string) { - fprintf(stderr, "\nShader Disassembly:\n\n"); - fprintf(stderr, "%s\n", binary->disasm_string); + fprintf(stderr, "Shader %s disassembly:\n", name); + fprintf(stderr, "%s", binary->disasm_string); if (debug && debug->debug_message) { /* Very long debug messages are cut off, so send the @@ -4036,7 +4443,7 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary "Shader Disassembly End"); } } else { - fprintf(stderr, "SI CODE:\n"); + fprintf(stderr, "Shader %s binary:\n", name); for (i = 0; i < binary->code_size; i += 4) { fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3], binary->code[i + 2], @@ -4115,16 +4522,60 @@ static void si_shader_dump_stats(struct si_screen *sscreen, max_simd_waves); } +static const char *si_get_shader_name(struct si_shader *shader, + unsigned processor) +{ + switch (processor) { + case TGSI_PROCESSOR_VERTEX: + if (shader->key.vs.as_es) + return "Vertex Shader as ES"; + else if (shader->key.vs.as_ls) + return "Vertex Shader as LS"; + else + return "Vertex Shader as VS"; + case TGSI_PROCESSOR_TESS_CTRL: + return "Tessellation Control Shader"; + case TGSI_PROCESSOR_TESS_EVAL: + if (shader->key.tes.as_es) + return "Tessellation Evaluation Shader as ES"; + else + return "Tessellation Evaluation Shader as VS"; + case TGSI_PROCESSOR_GEOMETRY: + if (shader->gs_copy_shader == NULL) + return "GS Copy Shader as VS"; + else + return "Geometry Shader"; + case TGSI_PROCESSOR_FRAGMENT: + return "Pixel Shader"; + case TGSI_PROCESSOR_COMPUTE: + return "Compute Shader"; + default: + return "Unknown Shader"; + } +} + void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, struct pipe_debug_callback *debug, unsigned processor) { - if (r600_can_dump_shader(&sscreen->b, processor)) - if (!(sscreen->b.debug_flags & DBG_NO_ASM)) - si_shader_dump_disassembly(&shader->binary, debug); + if (r600_can_dump_shader(&sscreen->b, processor) && + !(sscreen->b.debug_flags & DBG_NO_ASM)) { + fprintf(stderr, "\n%s:\n", si_get_shader_name(shader, processor)); + + if (shader->prolog) + si_shader_dump_disassembly(&shader->prolog->binary, + debug, "prolog"); + + si_shader_dump_disassembly(&shader->binary, debug, "main"); + + if (shader->epilog) + si_shader_dump_disassembly(&shader->epilog->binary, + debug, "epilog"); + fprintf(stderr, "\n"); + } si_shader_dump_stats(sscreen, &shader->config, shader->selector ? shader->selector->info.num_inputs : 0, - shader->binary.code_size, debug, processor); + si_get_shader_binary_size(shader), debug, processor); } int si_compile_llvm(struct si_screen *sscreen, @@ -4177,6 +4628,19 @@ int si_compile_llvm(struct si_screen *sscreen, FREE(binary->global_symbol_offsets); binary->config = NULL; binary->global_symbol_offsets = NULL; + + /* Some shaders can't have rodata because their binaries can be + * concatenated. + */ + if (binary->rodata_size && + (processor == TGSI_PROCESSOR_VERTEX || + processor == TGSI_PROCESSOR_TESS_CTRL || + processor == TGSI_PROCESSOR_TESS_EVAL || + processor == TGSI_PROCESSOR_FRAGMENT)) { + fprintf(stderr, "radeonsi: The shader can't have rodata."); + return -EINVAL; + } + return r; } @@ -4196,7 +4660,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0])); - si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm, gsinfo); + si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm); ctx->type = TGSI_PROCESSOR_VERTEX; ctx->is_gs_copy_shader = true; @@ -4241,7 +4705,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs); - LLVMBuildRetVoid(bld_base->base.gallivm->builder); + LLVMBuildRet(gallivm->builder, ctx->return_value); /* Dump LLVM IR before any optimization passes */ if (sscreen->b.debug_flags & DBG_PREOPT_IR && @@ -4278,35 +4742,38 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) switch (shader) { case PIPE_SHADER_VERTEX: fprintf(f, " instance_divisors = {"); - for (i = 0; i < Elements(key->vs.instance_divisors); i++) + for (i = 0; i < Elements(key->vs.prolog.instance_divisors); i++) fprintf(f, !i ? "%u" : ", %u", - key->vs.instance_divisors[i]); + key->vs.prolog.instance_divisors[i]); fprintf(f, "}\n"); fprintf(f, " as_es = %u\n", key->vs.as_es); fprintf(f, " as_ls = %u\n", key->vs.as_ls); - fprintf(f, " export_prim_id = %u\n", key->vs.export_prim_id); + fprintf(f, " export_prim_id = %u\n", key->vs.epilog.export_prim_id); break; case PIPE_SHADER_TESS_CTRL: - fprintf(f, " prim_mode = %u\n", key->tcs.prim_mode); + fprintf(f, " prim_mode = %u\n", key->tcs.epilog.prim_mode); break; case PIPE_SHADER_TESS_EVAL: fprintf(f, " as_es = %u\n", key->tes.as_es); - fprintf(f, " export_prim_id = %u\n", key->tes.export_prim_id); + fprintf(f, " export_prim_id = %u\n", key->tes.epilog.export_prim_id); break; case PIPE_SHADER_GEOMETRY: break; case PIPE_SHADER_FRAGMENT: - fprintf(f, " spi_shader_col_format = 0x%x\n", key->ps.spi_shader_col_format); - fprintf(f, " last_cbuf = %u\n", key->ps.last_cbuf); - fprintf(f, " color_two_side = %u\n", key->ps.color_two_side); - fprintf(f, " alpha_func = %u\n", key->ps.alpha_func); - fprintf(f, " alpha_to_one = %u\n", key->ps.alpha_to_one); - fprintf(f, " poly_stipple = %u\n", key->ps.poly_stipple); - fprintf(f, " clamp_color = %u\n", key->ps.clamp_color); + fprintf(f, " prolog.color_two_side = %u\n", key->ps.prolog.color_two_side); + fprintf(f, " prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple); + fprintf(f, " prolog.force_persample_interp = %u\n", key->ps.prolog.force_persample_interp); + fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format); + fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8); + fprintf(f, " epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf); + fprintf(f, " epilog.alpha_func = %u\n", key->ps.epilog.alpha_func); + fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one); + fprintf(f, " epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing); + fprintf(f, " epilog.clamp_color = %u\n", key->ps.epilog.clamp_color); break; default: @@ -4317,13 +4784,12 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) static void si_init_shader_ctx(struct si_shader_context *ctx, struct si_screen *sscreen, struct si_shader *shader, - LLVMTargetMachineRef tm, - struct tgsi_shader_info *info) + LLVMTargetMachineRef tm) { struct lp_build_tgsi_context *bld_base; memset(ctx, 0, sizeof(*ctx)); - radeon_llvm_context_init(&ctx->radeon_bld); + radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--"); ctx->tm = tm; ctx->screen = sscreen; if (shader && shader->selector) @@ -4336,15 +4802,18 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context); ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context); ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context); - ctx->i128 = LLVMInt128TypeInContext(ctx->radeon_bld.gallivm.context); + ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context); + ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128); ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context); ctx->v16i8 = LLVMVectorType(ctx->i8, 16); + ctx->v2i32 = LLVMVectorType(ctx->i32, 2); ctx->v4i32 = LLVMVectorType(ctx->i32, 4); ctx->v4f32 = LLVMVectorType(ctx->f32, 4); ctx->v8i32 = LLVMVectorType(ctx->i32, 8); bld_base = &ctx->radeon_bld.soa.bld_base; - bld_base->info = info; + if (shader && shader->selector) + bld_base->info = &shader->selector->info; bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action; @@ -4380,40 +4849,31 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32"; } -int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, - struct si_shader *shader, - struct pipe_debug_callback *debug) +int si_compile_tgsi_shader(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + bool is_monolithic, + struct pipe_debug_callback *debug) { struct si_shader_selector *sel = shader->selector; - struct tgsi_token *tokens = sel->tokens; struct si_shader_context ctx; struct lp_build_tgsi_context *bld_base; - struct tgsi_shader_info stipple_shader_info; LLVMModuleRef mod; int r = 0; - bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT && - shader->key.ps.poly_stipple; - - if (poly_stipple) { - tokens = util_pstipple_create_fragment_shader(tokens, NULL, - SI_POLY_STIPPLE_SAMPLER, - TGSI_FILE_SYSTEM_VALUE); - tgsi_scan_shader(tokens, &stipple_shader_info); - } /* Dump TGSI code before doing TGSI->LLVM conversion in case the * conversion fails. */ if (r600_can_dump_shader(&sscreen->b, sel->info.processor) && !(sscreen->b.debug_flags & DBG_NO_TGSI)) { si_dump_shader_key(sel->type, &shader->key, stderr); - tgsi_dump(tokens, 0); + tgsi_dump(sel->tokens, 0); si_dump_streamout(&sel->so); } - si_init_shader_ctx(&ctx, sscreen, shader, tm, - poly_stipple ? &stipple_shader_info : &sel->info); + si_init_shader_ctx(&ctx, sscreen, shader, tm); + ctx.is_monolithic = is_monolithic; - shader->uses_instanceid = sel->info.uses_instanceid; + shader->info.uses_instanceid = sel->info.uses_instanceid; bld_base = &ctx.radeon_bld.soa.bld_base; ctx.radeon_bld.load_system_value = declare_system_value; @@ -4447,7 +4907,10 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, break; case TGSI_PROCESSOR_FRAGMENT: ctx.radeon_bld.load_input = declare_input_fs; - bld_base->emit_epilogue = si_llvm_emit_fs_epilogue; + if (is_monolithic) + bld_base->emit_epilogue = si_llvm_emit_fs_epilogue; + else + bld_base->emit_epilogue = si_llvm_return_fs_outputs; break; default: assert(!"Unsupported shader type"); @@ -4461,6 +4924,14 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, preload_streamout_buffers(&ctx); preload_ring_buffers(&ctx); + if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT && + shader->key.ps.prolog.poly_stipple) { + LLVMValueRef views = LLVMGetParam(ctx.radeon_bld.main_fn, + SI_PARAM_SAMPLERS); + si_llvm_emit_polygon_stipple(&ctx, views, + SI_PARAM_POS_FIXED_PT); + } + if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { int i; for (i = 0; i < 4; i++) { @@ -4470,12 +4941,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, } } - if (!lp_build_tgsi_llvm(bld_base, tokens)) { + if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) { fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n"); goto out; } - LLVMBuildRetVoid(bld_base->base.gallivm->builder); + LLVMBuildRet(bld_base->base.gallivm->builder, ctx.return_value); mod = bld_base->base.gallivm->module; /* Dump LLVM IR before any optimization passes */ @@ -4492,16 +4963,49 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, goto out; } - si_shader_dump(sscreen, shader, debug, ctx.type); + radeon_llvm_dispose(&ctx.radeon_bld); - r = si_shader_binary_upload(sscreen, shader); - if (r) { - fprintf(stderr, "LLVM failed to upload shader\n"); - goto out; + /* Calculate the number of fragment input VGPRs. */ + if (ctx.type == TGSI_PROCESSOR_FRAGMENT) { + shader->info.num_input_vgprs = 0; + shader->info.face_vgpr_index = -1; + + if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 2; + if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 2; + if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 2; + if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 3; + if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 2; + if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 2; + if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 2; + if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; + if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; + if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; + if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; + if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; + if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) { + shader->info.face_vgpr_index = shader->info.num_input_vgprs; + shader->info.num_input_vgprs += 1; + } + if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; + if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; + if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; } - radeon_llvm_dispose(&ctx.radeon_bld); - if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { shader->gs_copy_shader = CALLOC_STRUCT(si_shader); shader->gs_copy_shader->selector = shader->selector; @@ -4517,11 +5021,968 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, out: for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++) FREE(ctx.constants[i]); - if (poly_stipple) - tgsi_free_tokens(tokens); return r; } +/** + * Create, compile and return a shader part (prolog or epilog). + * + * \param sscreen screen + * \param list list of shader parts of the same category + * \param key shader part key + * \param tm LLVM target machine + * \param debug debug callback + * \param compile the callback responsible for compilation + * \return non-NULL on success + */ +static struct si_shader_part * +si_get_shader_part(struct si_screen *sscreen, + struct si_shader_part **list, + union si_shader_part_key *key, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + bool (*compile)(struct si_screen *, + LLVMTargetMachineRef, + struct pipe_debug_callback *, + struct si_shader_part *)) +{ + struct si_shader_part *result; + + pipe_mutex_lock(sscreen->shader_parts_mutex); + + /* Find existing. */ + for (result = *list; result; result = result->next) { + if (memcmp(&result->key, key, sizeof(*key)) == 0) { + pipe_mutex_unlock(sscreen->shader_parts_mutex); + return result; + } + } + + /* Compile a new one. */ + result = CALLOC_STRUCT(si_shader_part); + result->key = *key; + if (!compile(sscreen, tm, debug, result)) { + FREE(result); + pipe_mutex_unlock(sscreen->shader_parts_mutex); + return NULL; + } + + result->next = *list; + *list = result; + pipe_mutex_unlock(sscreen->shader_parts_mutex); + return result; +} + +/** + * Create a vertex shader prolog. + * + * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values). + * All inputs are returned unmodified. The vertex load indices are + * stored after them, which will used by the API VS for fetching inputs. + * + * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are: + * input_v0, + * input_v1, + * input_v2, + * input_v3, + * (VertexID + BaseVertex), + * (InstanceID + StartInstance), + * (InstanceID / 2 + StartInstance) + */ +static bool si_compile_vs_prolog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + LLVMTypeRef *params, *returns; + LLVMValueRef ret, func; + int last_sgpr, num_params, num_returns, i; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_VERTEX; + ctx.param_vertex_id = key->vs_prolog.num_input_sgprs; + ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3; + + /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ + params = alloca((key->vs_prolog.num_input_sgprs + 4) * + sizeof(LLVMTypeRef)); + returns = alloca((key->vs_prolog.num_input_sgprs + 4 + + key->vs_prolog.last_input + 1) * + sizeof(LLVMTypeRef)); + num_params = 0; + num_returns = 0; + + /* Declare input and output SGPRs. */ + num_params = 0; + for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { + params[num_params++] = ctx.i32; + returns[num_returns++] = ctx.i32; + } + last_sgpr = num_params - 1; + + /* 4 preloaded VGPRs (outputs must be floats) */ + for (i = 0; i < 4; i++) { + params[num_params++] = ctx.i32; + returns[num_returns++] = ctx.f32; + } + + /* Vertex load indices. */ + for (i = 0; i <= key->vs_prolog.last_input; i++) + returns[num_returns++] = ctx.f32; + + /* Create the function. */ + si_create_function(&ctx, returns, num_returns, params, + num_params, -1, last_sgpr); + func = ctx.radeon_bld.main_fn; + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx.return_value; + for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); + } + for (i = num_params - 4; i < num_params; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, ""); + ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); + } + + /* Compute vertex load indices from instance divisors. */ + for (i = 0; i <= key->vs_prolog.last_input; i++) { + unsigned divisor = key->vs_prolog.states.instance_divisors[i]; + LLVMValueRef index; + + if (divisor) { + /* InstanceID / Divisor + StartInstance */ + index = get_instance_index_for_fetch(&ctx.radeon_bld, + SI_SGPR_START_INSTANCE, + divisor); + } else { + /* VertexID + BaseVertex */ + index = LLVMBuildAdd(gallivm->builder, + LLVMGetParam(func, ctx.param_vertex_id), + LLVMGetParam(func, SI_SGPR_BASE_VERTEX), ""); + } + + index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, ""); + ret = LLVMBuildInsertValue(gallivm->builder, ret, index, + num_params++, ""); + } + + /* Compile. */ + LLVMBuildRet(gallivm->builder, ret); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Vertex Shader Prolog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Compile the vertex shader epilog. This is also used by the tessellation + * evaluation shader compiled as VS. + * + * The input is PrimitiveID. + * + * If PrimitiveID is required by the pixel shader, export it. + * Otherwise, do nothing. + */ +static bool si_compile_vs_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base; + LLVMTypeRef params[5]; + int num_params, i; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, NULL, tm); + ctx.type = TGSI_PROCESSOR_VERTEX; + + /* Declare input VGPRs. */ + num_params = key->vs_epilog.states.export_prim_id ? + (VS_EPILOG_PRIMID_LOC + 1) : 0; + assert(num_params <= ARRAY_SIZE(params)); + + for (i = 0; i < num_params; i++) + params[i] = ctx.f32; + + /* Create the function. */ + si_create_function(&ctx, NULL, 0, params, num_params, + -1, -1); + + /* Emit exports. */ + if (key->vs_epilog.states.export_prim_id) { + struct lp_build_context *base = &bld_base->base; + struct lp_build_context *uint = &bld_base->uint_bld; + LLVMValueRef args[9]; + + args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */ + args[1] = uint->zero; /* whether the EXEC mask is valid */ + args[2] = uint->zero; /* DONE bit */ + args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM + + key->vs_epilog.prim_id_param_offset); + args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */ + args[5] = LLVMGetParam(ctx.radeon_bld.main_fn, + VS_EPILOG_PRIMID_LOC); /* X */ + args[6] = uint->undef; /* Y */ + args[7] = uint->undef; /* Z */ + args[8] = uint->undef; /* W */ + + lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", + LLVMVoidTypeInContext(base->gallivm->context), + args, 9, 0); + } + + /* Compile. */ + LLVMBuildRet(gallivm->builder, ctx.return_value); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Vertex Shader Epilog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Create & compile a vertex shader epilog. This a helper used by VS and TES. + */ +static bool si_get_vs_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug, + struct si_vs_epilog_bits *states) +{ + union si_shader_part_key epilog_key; + + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.vs_epilog.states = *states; + + /* Set up the PrimitiveID output. */ + if (shader->key.vs.epilog.export_prim_id) { + unsigned index = shader->selector->info.num_outputs; + unsigned offset = shader->info.nr_param_exports++; + + epilog_key.vs_epilog.prim_id_param_offset = offset; + assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset)); + shader->info.vs_output_param_offset[index] = offset; + } + + shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs, + &epilog_key, tm, debug, + si_compile_vs_epilog); + return shader->epilog != NULL; +} + +/** + * Select and compile (or reuse) vertex shader parts (prolog & epilog). + */ +static bool si_shader_select_vs_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + struct tgsi_shader_info *info = &shader->selector->info; + union si_shader_part_key prolog_key; + unsigned i; + + /* Get the prolog. */ + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.vs_prolog.states = shader->key.vs.prolog; + prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs; + prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1; + + /* The prolog is a no-op if there are no inputs. */ + if (info->num_inputs) { + shader->prolog = + si_get_shader_part(sscreen, &sscreen->vs_prologs, + &prolog_key, tm, debug, + si_compile_vs_prolog); + if (!shader->prolog) + return false; + } + + /* Get the epilog. */ + if (!shader->key.vs.as_es && !shader->key.vs.as_ls && + !si_get_vs_epilog(sscreen, tm, shader, debug, + &shader->key.vs.epilog)) + return false; + + /* Set the instanceID flag. */ + for (i = 0; i < info->num_inputs; i++) + if (prolog_key.vs_prolog.states.instance_divisors[i]) + shader->info.uses_instanceid = true; + + return true; +} + +/** + * Select and compile (or reuse) TES parts (epilog). + */ +static bool si_shader_select_tes_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + if (shader->key.tes.as_es) + return true; + + /* TES compiled as VS. */ + return si_get_vs_epilog(sscreen, tm, shader, debug, + &shader->key.tes.epilog); +} + +/** + * Compile the TCS epilog. This writes tesselation factors to memory based on + * the output primitive type of the tesselator (determined by TES). + */ +static bool si_compile_tcs_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base; + LLVMTypeRef params[16]; + LLVMValueRef func; + int last_array_pointer, last_sgpr, num_params; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_TESS_CTRL; + shader.key.tcs.epilog = key->tcs_epilog.states; + + /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */ + params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS); + last_array_pointer = SI_PARAM_RW_BUFFERS; + params[SI_PARAM_CONST_BUFFERS] = ctx.i64; + params[SI_PARAM_SAMPLERS] = ctx.i64; + params[SI_PARAM_UNUSED] = ctx.i64; + params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32; + params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32; + params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32; + params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32; + last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET; + num_params = last_sgpr + 1; + + params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */ + params[num_params++] = ctx.i32; /* invocation ID within the patch */ + params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */ + + /* Create the function. */ + si_create_function(&ctx, NULL, 0, params, num_params, + last_array_pointer, last_sgpr); + declare_tess_lds(&ctx); + func = ctx.radeon_bld.main_fn; + + si_write_tess_factors(bld_base, + LLVMGetParam(func, last_sgpr + 1), + LLVMGetParam(func, last_sgpr + 2), + LLVMGetParam(func, last_sgpr + 3)); + + /* Compile. */ + LLVMBuildRet(gallivm->builder, ctx.return_value); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Tessellation Control Shader Epilog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Select and compile (or reuse) TCS parts (epilog). + */ +static bool si_shader_select_tcs_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + union si_shader_part_key epilog_key; + + /* Get the epilog. */ + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.tcs_epilog.states = shader->key.tcs.epilog; + + shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, + &epilog_key, tm, debug, + si_compile_tcs_epilog); + return shader->epilog != NULL; +} + +/** + * Compile the pixel shader prolog. This handles: + * - two-side color selection and interpolation + * - overriding interpolation parameters for the API PS + * - polygon stippling + * + * All preloaded SGPRs and VGPRs are passed through unmodified unless they are + * overriden by other states. (e.g. per-sample interpolation) + * Interpolated colors are stored after the preloaded VGPRs. + */ +static bool si_compile_ps_prolog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + LLVMTypeRef *params; + LLVMValueRef ret, func; + int last_sgpr, num_params, num_returns, i, num_color_channels; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_FRAGMENT; + shader.key.ps.prolog = key->ps_prolog.states; + + /* Number of inputs + 8 color elements. */ + params = alloca((key->ps_prolog.num_input_sgprs + + key->ps_prolog.num_input_vgprs + 8) * + sizeof(LLVMTypeRef)); + + /* Declare inputs. */ + num_params = 0; + for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) + params[num_params++] = ctx.i32; + last_sgpr = num_params - 1; + + for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) + params[num_params++] = ctx.f32; + + /* Declare outputs (same as inputs + add colors if needed) */ + num_returns = num_params; + num_color_channels = util_bitcount(key->ps_prolog.colors_read); + for (i = 0; i < num_color_channels; i++) + params[num_returns++] = ctx.f32; + + /* Create the function. */ + si_create_function(&ctx, params, num_returns, params, + num_params, -1, last_sgpr); + func = ctx.radeon_bld.main_fn; + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx.return_value; + for (i = 0; i < num_params; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); + } + + /* Polygon stippling. */ + if (key->ps_prolog.states.poly_stipple) { + /* POS_FIXED_PT is always last. */ + unsigned pos = key->ps_prolog.num_input_sgprs + + key->ps_prolog.num_input_vgprs - 1; + LLVMValueRef ptr[2], views; + + /* Get the pointer to sampler views. */ + ptr[0] = LLVMGetParam(func, SI_SGPR_SAMPLERS); + ptr[1] = LLVMGetParam(func, SI_SGPR_SAMPLERS+1); + views = lp_build_gather_values(gallivm, ptr, 2); + views = LLVMBuildBitCast(gallivm->builder, views, ctx.i64, ""); + views = LLVMBuildIntToPtr(gallivm->builder, views, + const_array(ctx.v8i32, SI_NUM_SAMPLERS), ""); + + si_llvm_emit_polygon_stipple(&ctx, views, pos); + } + + /* Interpolate colors. */ + for (i = 0; i < 2; i++) { + unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; + unsigned face_vgpr = key->ps_prolog.num_input_sgprs + + key->ps_prolog.face_vgpr_index; + LLVMValueRef interp[2], color[4]; + LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL; + + if (!writemask) + continue; + + /* If the interpolation qualifier is not CONSTANT (-1). */ + if (key->ps_prolog.color_interp_vgpr_index[i] != -1) { + unsigned interp_vgpr = key->ps_prolog.num_input_sgprs + + key->ps_prolog.color_interp_vgpr_index[i]; + + interp[0] = LLVMGetParam(func, interp_vgpr); + interp[1] = LLVMGetParam(func, interp_vgpr + 1); + interp_ij = lp_build_gather_values(gallivm, interp, 2); + interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij, + ctx.v2i32, ""); + } + + /* Use the absolute location of the input. */ + prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); + + if (key->ps_prolog.states.color_two_side) { + face = LLVMGetParam(func, face_vgpr); + face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, ""); + } + + interp_fs_input(&ctx, + key->ps_prolog.color_attr_index[i], + TGSI_SEMANTIC_COLOR, i, + key->ps_prolog.num_interp_inputs, + key->ps_prolog.colors_read, interp_ij, + prim_mask, face, color); + + while (writemask) { + unsigned chan = u_bit_scan(&writemask); + ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan], + num_params++, ""); + } + } + + /* Force per-sample interpolation. */ + if (key->ps_prolog.states.force_persample_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef persp_sample[2], linear_sample[2]; + + /* Read PERSP_SAMPLE. */ + for (i = 0; i < 2; i++) + persp_sample[i] = LLVMGetParam(func, base + i); + /* Overwrite PERSP_CENTER. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + persp_sample[i], base + 2 + i, ""); + /* Overwrite PERSP_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + persp_sample[i], base + 4 + i, ""); + /* Read LINEAR_SAMPLE. */ + for (i = 0; i < 2; i++) + linear_sample[i] = LLVMGetParam(func, base + 6 + i); + /* Overwrite LINEAR_CENTER. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + linear_sample[i], base + 8 + i, ""); + /* Overwrite LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + linear_sample[i], base + 10 + i, ""); + } + + /* Compile. */ + LLVMBuildRet(gallivm->builder, ret); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Fragment Shader Prolog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Compile the pixel shader epilog. This handles everything that must be + * emulated for pixel shader exports. (alpha-test, format conversions, etc) + */ +static bool si_compile_ps_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base; + LLVMTypeRef params[16+8*4+3]; + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + int last_array_pointer, last_sgpr, num_params, i; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_FRAGMENT; + shader.key.ps.epilog = key->ps_epilog.states; + + /* Declare input SGPRs. */ + params[SI_PARAM_RW_BUFFERS] = ctx.i64; + params[SI_PARAM_CONST_BUFFERS] = ctx.i64; + params[SI_PARAM_SAMPLERS] = ctx.i64; + params[SI_PARAM_UNUSED] = ctx.i64; + params[SI_PARAM_ALPHA_REF] = ctx.f32; + last_array_pointer = -1; + last_sgpr = SI_PARAM_ALPHA_REF; + + /* Declare input VGPRs. */ + num_params = (last_sgpr + 1) + + util_bitcount(key->ps_epilog.colors_written) * 4 + + key->ps_epilog.writes_z + + key->ps_epilog.writes_stencil + + key->ps_epilog.writes_samplemask; + + num_params = MAX2(num_params, + last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + assert(num_params <= ARRAY_SIZE(params)); + + for (i = last_sgpr + 1; i < num_params; i++) + params[i] = ctx.f32; + + /* Create the function. */ + si_create_function(&ctx, NULL, 0, params, num_params, + last_array_pointer, last_sgpr); + /* Disable elimination of unused inputs. */ + radeon_llvm_add_attribute(ctx.radeon_bld.main_fn, + "InitialPSInputAddr", 0xffffff); + + /* Process colors. */ + unsigned vgpr = last_sgpr + 1; + unsigned colors_written = key->ps_epilog.colors_written; + int last_color_export = -1; + + /* Find the last color export. */ + if (!key->ps_epilog.writes_z && + !key->ps_epilog.writes_stencil && + !key->ps_epilog.writes_samplemask) { + unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; + + /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ + if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { + /* Just set this if any of the colorbuffers are enabled. */ + if (spi_format & + ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) + last_color_export = 0; + } else { + for (i = 0; i < 8; i++) + if (colors_written & (1 << i) && + (spi_format >> (i * 4)) & 0xf) + last_color_export = i; + } + } + + while (colors_written) { + LLVMValueRef color[4]; + int mrt = u_bit_scan(&colors_written); + + for (i = 0; i < 4; i++) + color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + + si_export_mrt_color(bld_base, color, mrt, + num_params - 1, + mrt == last_color_export); + } + + /* Process depth, stencil, samplemask. */ + if (key->ps_epilog.writes_z) + depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + if (key->ps_epilog.writes_stencil) + stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + if (key->ps_epilog.writes_samplemask) + samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + + if (depth || stencil || samplemask) + si_export_mrt_z(bld_base, depth, stencil, samplemask); + else if (last_color_export == -1) + si_export_null(bld_base); + + /* Compile. */ + LLVMBuildRetVoid(gallivm->builder); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Fragment Shader Epilog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Select and compile (or reuse) pixel shader parts (prolog & epilog). + */ +static bool si_shader_select_ps_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + struct tgsi_shader_info *info = &shader->selector->info; + union si_shader_part_key prolog_key; + union si_shader_part_key epilog_key; + unsigned i; + + /* Get the prolog. */ + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.ps_prolog.states = shader->key.ps.prolog; + prolog_key.ps_prolog.colors_read = info->colors_read; + prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; + prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; + + if (info->colors_read) { + unsigned *color = shader->selector->color_attr_index; + + if (shader->key.ps.prolog.color_two_side) { + /* BCOLORs are stored after the last input. */ + prolog_key.ps_prolog.num_interp_inputs = info->num_inputs; + prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; + shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); + } + + for (i = 0; i < 2; i++) { + unsigned location = info->input_interpolate_loc[color[i]]; + + if (!(info->colors_read & (0xf << i*4))) + continue; + + prolog_key.ps_prolog.color_attr_index[i] = color[i]; + + /* Force per-sample interpolation for the colors here. */ + if (shader->key.ps.prolog.force_persample_interp) + location = TGSI_INTERPOLATE_LOC_SAMPLE; + + switch (info->input_interpolate[color[i]]) { + case TGSI_INTERPOLATE_CONSTANT: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1; + break; + case TGSI_INTERPOLATE_PERSPECTIVE: + case TGSI_INTERPOLATE_COLOR: + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_SAMPLE_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTER: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_CENTER_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_CENTROID_ENA(1); + break; + default: + assert(0); + } + break; + case TGSI_INTERPOLATE_LINEAR: + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_SAMPLE_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTER: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_CENTER_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_CENTROID_ENA(1); + break; + default: + assert(0); + } + break; + default: + assert(0); + } + } + } + + /* The prolog is a no-op if these aren't set. */ + if (prolog_key.ps_prolog.colors_read || + prolog_key.ps_prolog.states.force_persample_interp || + prolog_key.ps_prolog.states.poly_stipple) { + shader->prolog = + si_get_shader_part(sscreen, &sscreen->ps_prologs, + &prolog_key, tm, debug, + si_compile_ps_prolog); + if (!shader->prolog) + return false; + } + + /* Get the epilog. */ + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.ps_epilog.colors_written = info->colors_written; + epilog_key.ps_epilog.writes_z = info->writes_z; + epilog_key.ps_epilog.writes_stencil = info->writes_stencil; + epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask; + epilog_key.ps_epilog.states = shader->key.ps.epilog; + + shader->epilog = + si_get_shader_part(sscreen, &sscreen->ps_epilogs, + &epilog_key, tm, debug, + si_compile_ps_epilog); + if (!shader->epilog) + return false; + + /* Enable POS_FIXED_PT if polygon stippling is enabled. */ + if (shader->key.ps.prolog.poly_stipple) { + shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); + assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); + } + + /* Set up the enable bits for per-sample shading if needed. */ + if (shader->key.ps.prolog.force_persample_interp) { + if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena)) { + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); + } + if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena)) { + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); + } + } + + /* POW_W_FLOAT requires that one of the perspective weights is enabled. */ + if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && + !(shader->config.spi_ps_input_ena & 0xf)) { + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); + assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); + } + + /* At least one pair of interpolation weights must be enabled. */ + if (!(shader->config.spi_ps_input_ena & 0x7f)) { + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); + assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); + } + + /* The sample mask input is always enabled, because the API shader always + * passes it through to the epilog. Disable it here if it's unused. + */ + if (!shader->key.ps.epilog.poly_line_smoothing && + !shader->selector->info.reads_samplemask) + shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; + + return true; +} + +int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + struct si_shader *mainp = shader->selector->main_shader_part; + int r; + + /* LS and ES are always compiled on demand. */ + if (!mainp || + (shader->selector->type == PIPE_SHADER_VERTEX && + (shader->key.vs.as_es || shader->key.vs.as_ls)) || + (shader->selector->type == PIPE_SHADER_TESS_EVAL && + shader->key.tes.as_es)) { + /* Monolithic shader (compiled as a whole, has many variants, + * may take a long time to compile). + */ + r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug); + if (r) + return r; + } else { + /* The shader consists of 2-3 parts: + * + * - the middle part is the user shader, it has 1 variant only + * and it was compiled during the creation of the shader + * selector + * - the prolog part is inserted at the beginning + * - the epilog part is inserted at the end + * + * The prolog and epilog have many (but simple) variants. + */ + + /* Copy the compiled TGSI shader data over. */ + shader->is_binary_shared = true; + shader->binary = mainp->binary; + shader->config = mainp->config; + shader->info.num_input_sgprs = mainp->info.num_input_sgprs; + shader->info.num_input_vgprs = mainp->info.num_input_vgprs; + shader->info.face_vgpr_index = mainp->info.face_vgpr_index; + memcpy(shader->info.vs_output_param_offset, + mainp->info.vs_output_param_offset, + sizeof(mainp->info.vs_output_param_offset)); + shader->info.uses_instanceid = mainp->info.uses_instanceid; + shader->info.nr_pos_exports = mainp->info.nr_pos_exports; + shader->info.nr_param_exports = mainp->info.nr_param_exports; + + /* Select prologs and/or epilogs. */ + switch (shader->selector->type) { + case PIPE_SHADER_VERTEX: + if (!si_shader_select_vs_parts(sscreen, tm, shader, debug)) + return -1; + break; + case PIPE_SHADER_TESS_CTRL: + if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug)) + return -1; + break; + case PIPE_SHADER_TESS_EVAL: + if (!si_shader_select_tes_parts(sscreen, tm, shader, debug)) + return -1; + break; + case PIPE_SHADER_FRAGMENT: + if (!si_shader_select_ps_parts(sscreen, tm, shader, debug)) + return -1; + + /* Make sure we have at least as many VGPRs as there + * are allocated inputs. + */ + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, + shader->info.num_input_vgprs); + break; + } + + /* Update SGPR and VGPR counts. */ + if (shader->prolog) { + shader->config.num_sgprs = MAX2(shader->config.num_sgprs, + shader->prolog->config.num_sgprs); + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, + shader->prolog->config.num_vgprs); + } + if (shader->epilog) { + shader->config.num_sgprs = MAX2(shader->config.num_sgprs, + shader->epilog->config.num_sgprs); + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, + shader->epilog->config.num_vgprs); + } + } + + si_shader_dump(sscreen, shader, debug, shader->selector->info.processor); + + /* Upload. */ + r = si_shader_binary_upload(sscreen, shader); + if (r) { + fprintf(stderr, "LLVM failed to upload shader\n"); + return r; + } + + return 0; +} + void si_shader_destroy(struct si_shader *shader) { if (shader->gs_copy_shader) { @@ -4534,5 +5995,6 @@ void si_shader_destroy(struct si_shader *shader) r600_resource_reference(&shader->bo, NULL); - radeon_shader_binary_clean(&shader->binary); + if (!shader->is_binary_shared) + radeon_shader_binary_clean(&shader->binary); } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index dc75e0330e4..ff5c24d8918 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -75,6 +75,8 @@ struct radeon_shader_binary; struct radeon_shader_reloc; +#define SI_MAX_VS_OUTPUTS 40 + #define SI_SGPR_RW_BUFFERS 0 /* rings (& stream-out, VS only) */ #define SI_SGPR_CONST_BUFFERS 2 #define SI_SGPR_SAMPLERS 4 /* images & sampler states interleaved */ @@ -169,7 +171,7 @@ struct radeon_shader_reloc; #define SI_PARAM_SAMPLE_COVERAGE 20 #define SI_PARAM_POS_FIXED_PT 21 -#define SI_NUM_PARAMS (SI_PARAM_POS_FIXED_PT + 1) +#define SI_NUM_PARAMS (SI_PARAM_POS_FIXED_PT + 9) /* +8 for COLOR[0..1] */ struct si_shader; @@ -181,6 +183,11 @@ struct si_shader_selector { struct si_shader *first_variant; /* immutable after the first variant */ struct si_shader *last_variant; /* mutable */ + /* The compiled TGSI shader expecting a prolog and/or epilog (not + * uploaded to a buffer). + */ + struct si_shader *main_shader_part; + struct tgsi_token *tokens; struct pipe_stream_output_info so; struct tgsi_shader_info info; @@ -199,6 +206,7 @@ struct si_shader_selector { unsigned max_gsvs_emit_size; /* PS parameters. */ + unsigned color_attr_index[2]; unsigned db_shader_control; /* Set 0xf or 0x0 (4 bits) per each written output. * ANDed with spi_shader_col_format. @@ -221,37 +229,103 @@ struct si_shader_selector { * With both: LS | HS | ES | GS | VS | PS */ +/* Common VS bits between the shader key and the prolog key. */ +struct si_vs_prolog_bits { + unsigned instance_divisors[SI_NUM_VERTEX_BUFFERS]; +}; + +/* Common VS bits between the shader key and the epilog key. */ +struct si_vs_epilog_bits { + unsigned export_prim_id:1; /* when PS needs it and GS is disabled */ + /* TODO: + * - skip clipdist, culldist (including clipvertex code) exports based + * on which clip_plane_enable bits are set + * - skip layer, viewport, clipdist, and culldist parameter exports + * if PS doesn't read them + */ +}; + +/* Common TCS bits between the shader key and the epilog key. */ +struct si_tcs_epilog_bits { + unsigned prim_mode:3; +}; + +/* Common PS bits between the shader key and the prolog key. */ +struct si_ps_prolog_bits { + unsigned color_two_side:1; + /* TODO: add a flatshade bit that skips interpolation for colors */ + unsigned poly_stipple:1; + unsigned force_persample_interp:1; + /* TODO: + * - add force_center_interp if MSAA is disabled and centroid or + * sample are present + * - add force_center_interp_bc_optimize to force center interpolation + * based on the bc_optimize SGPR bit if MSAA is enabled, centroid is + * present and sample isn't present. + */ +}; + +/* Common PS bits between the shader key and the epilog key. */ +struct si_ps_epilog_bits { + unsigned spi_shader_col_format; + unsigned color_is_int8:8; + unsigned last_cbuf:3; + unsigned alpha_func:3; + unsigned alpha_to_one:1; + unsigned poly_line_smoothing:1; + unsigned clamp_color:1; +}; + +union si_shader_part_key { + struct { + struct si_vs_prolog_bits states; + unsigned num_input_sgprs:5; + unsigned last_input:4; + } vs_prolog; + struct { + struct si_vs_epilog_bits states; + unsigned prim_id_param_offset:5; + } vs_epilog; + struct { + struct si_tcs_epilog_bits states; + } tcs_epilog; + struct { + struct si_ps_prolog_bits states; + unsigned num_input_sgprs:5; + unsigned num_input_vgprs:5; + /* Color interpolation and two-side color selection. */ + unsigned colors_read:8; /* color input components read */ + unsigned num_interp_inputs:5; /* BCOLOR is at this location */ + unsigned face_vgpr_index:5; + char color_attr_index[2]; + char color_interp_vgpr_index[2]; /* -1 == constant */ + } ps_prolog; + struct { + struct si_ps_epilog_bits states; + unsigned colors_written:8; + unsigned writes_z:1; + unsigned writes_stencil:1; + unsigned writes_samplemask:1; + } ps_epilog; +}; + union si_shader_key { struct { - unsigned spi_shader_col_format; - unsigned color_is_int8:8; - unsigned last_cbuf:3; - unsigned color_two_side:1; - unsigned alpha_func:3; - unsigned alpha_to_one:1; - unsigned poly_stipple:1; - unsigned poly_line_smoothing:1; - unsigned clamp_color:1; - unsigned force_persample_interp:1; + struct si_ps_prolog_bits prolog; + struct si_ps_epilog_bits epilog; } ps; struct { - unsigned instance_divisors[SI_NUM_VERTEX_BUFFERS]; - /* Mask of "get_unique_index" bits - which outputs are read - * by the next stage (needed by ES). - * This describes how outputs are laid out in memory. */ + struct si_vs_prolog_bits prolog; + struct si_vs_epilog_bits epilog; unsigned as_es:1; /* export shader */ unsigned as_ls:1; /* local shader */ - unsigned export_prim_id:1; /* when PS needs it and GS is disabled */ } vs; struct { - unsigned prim_mode:3; + struct si_tcs_epilog_bits epilog; } tcs; /* tessellation control shader */ struct { - /* Mask of "get_unique_index" bits - which outputs are read - * by the next stage (needed by ES). - * This describes how outputs are laid out in memory. */ + struct si_vs_epilog_bits epilog; /* same as VS */ unsigned as_es:1; /* export shader */ - unsigned export_prim_id:1; /* when PS needs it and GS is disabled */ } tes; /* tessellation evaluation shader */ }; @@ -267,22 +341,42 @@ struct si_shader_config { unsigned rsrc2; }; +/* GCN-specific shader info. */ +struct si_shader_info { + ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; + ubyte num_input_sgprs; + ubyte num_input_vgprs; + char face_vgpr_index; + bool uses_instanceid; + ubyte nr_pos_exports; + ubyte nr_param_exports; +}; + struct si_shader { struct si_shader_selector *selector; struct si_shader *next_variant; + struct si_shader_part *prolog; + struct si_shader_part *epilog; + struct si_shader *gs_copy_shader; struct si_pm4_state *pm4; struct r600_resource *bo; struct r600_resource *scratch_bo; union si_shader_key key; + bool is_binary_shared; + + /* The following data is all that's needed for binary shaders. */ struct radeon_shader_binary binary; struct si_shader_config config; + struct si_shader_info info; +}; - unsigned vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS]; - bool uses_instanceid; - unsigned nr_pos_exports; - unsigned nr_param_exports; +struct si_shader_part { + struct si_shader_part *next; + union si_shader_part_key key; + struct radeon_shader_binary binary; + struct si_shader_config config; }; static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx) @@ -310,14 +404,19 @@ static inline struct si_shader* si_get_vs_state(struct si_context *sctx) static inline bool si_vs_exports_prim_id(struct si_shader *shader) { if (shader->selector->type == PIPE_SHADER_VERTEX) - return shader->key.vs.export_prim_id; + return shader->key.vs.epilog.export_prim_id; else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) - return shader->key.tes.export_prim_id; + return shader->key.tes.epilog.export_prim_id; else return false; } -/* radeonsi_shader.c */ +/* si_shader.c */ +int si_compile_tgsi_shader(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + bool is_monolithic, + struct pipe_debug_callback *debug); int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, struct si_shader *shader, struct pipe_debug_callback *debug); diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index bf780777b50..2dfdbeb8d8f 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -277,7 +277,7 @@ static void si_emit_cb_render_state(struct si_context *sctx, struct r600_atom *a if (sctx->b.family == CHIP_STONEY) { unsigned spi_shader_col_format = sctx->ps_shader.cso ? - sctx->ps_shader.current->key.ps.spi_shader_col_format : 0; + sctx->ps_shader.current->key.ps.epilog.spi_shader_col_format : 0; unsigned sx_ps_downconvert = 0; unsigned sx_blend_opt_epsilon = 0; unsigned sx_blend_opt_control = 0; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index f64c4d45f1b..40792cbc1d5 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -280,6 +280,8 @@ si_create_sampler_view_custom(struct pipe_context *ctx, /* si_state_shader.c */ bool si_update_shaders(struct si_context *sctx); void si_init_shader_functions(struct si_context *sctx); +bool si_init_shader_cache(struct si_screen *sscreen); +void si_destroy_shader_cache(struct si_screen *sscreen); /* si_state_draw.c */ void si_emit_cache_flush(struct si_context *sctx, struct r600_atom *atom); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 77a4e47c809..a6753a7a528 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -32,10 +32,221 @@ #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_ureg.h" +#include "util/hash_table.h" +#include "util/u_hash.h" #include "util/u_memory.h" #include "util/u_prim.h" #include "util/u_simple_shaders.h" +/* SHADER_CACHE */ + +/** + * Return the TGSI binary in a buffer. The first 4 bytes contain its size as + * integer. + */ +static void *si_get_tgsi_binary(struct si_shader_selector *sel) +{ + unsigned tgsi_size = tgsi_num_tokens(sel->tokens) * + sizeof(struct tgsi_token); + unsigned size = 4 + tgsi_size + sizeof(sel->so); + char *result = (char*)MALLOC(size); + + if (!result) + return NULL; + + *((uint32_t*)result) = size; + memcpy(result + 4, sel->tokens, tgsi_size); + memcpy(result + 4 + tgsi_size, &sel->so, sizeof(sel->so)); + return result; +} + +/** Copy "data" to "ptr" and return the next dword following copied data. */ +static uint32_t *write_data(uint32_t *ptr, const void *data, unsigned size) +{ + memcpy(ptr, data, size); + ptr += DIV_ROUND_UP(size, 4); + return ptr; +} + +/** Read data from "ptr". Return the next dword following the data. */ +static uint32_t *read_data(uint32_t *ptr, void *data, unsigned size) +{ + memcpy(data, ptr, size); + ptr += DIV_ROUND_UP(size, 4); + return ptr; +} + +/** + * Write the size as uint followed by the data. Return the next dword + * following the copied data. + */ +static uint32_t *write_chunk(uint32_t *ptr, const void *data, unsigned size) +{ + *ptr++ = size; + return write_data(ptr, data, size); +} + +/** + * Read the size as uint followed by the data. Return both via parameters. + * Return the next dword following the data. + */ +static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size) +{ + *size = *ptr++; + assert(*data == NULL); + *data = malloc(*size); + return read_data(ptr, *data, *size); +} + +/** + * Return the shader binary in a buffer. The first 4 bytes contain its size + * as integer. + */ +static void *si_get_shader_binary(struct si_shader *shader) +{ + /* There is always a size of data followed by the data itself. */ + unsigned relocs_size = shader->binary.reloc_count * + sizeof(shader->binary.relocs[0]); + unsigned disasm_size = strlen(shader->binary.disasm_string) + 1; + unsigned size = + 4 + /* total size */ + 4 + /* CRC32 of the data below */ + align(sizeof(shader->config), 4) + + align(sizeof(shader->info), 4) + + 4 + align(shader->binary.code_size, 4) + + 4 + align(shader->binary.rodata_size, 4) + + 4 + align(relocs_size, 4) + + 4 + align(disasm_size, 4); + void *buffer = CALLOC(1, size); + uint32_t *ptr = (uint32_t*)buffer; + + if (!buffer) + return NULL; + + *ptr++ = size; + ptr++; /* CRC32 is calculated at the end. */ + + ptr = write_data(ptr, &shader->config, sizeof(shader->config)); + ptr = write_data(ptr, &shader->info, sizeof(shader->info)); + ptr = write_chunk(ptr, shader->binary.code, shader->binary.code_size); + ptr = write_chunk(ptr, shader->binary.rodata, shader->binary.rodata_size); + ptr = write_chunk(ptr, shader->binary.relocs, relocs_size); + ptr = write_chunk(ptr, shader->binary.disasm_string, disasm_size); + assert((char *)ptr - (char *)buffer == size); + + /* Compute CRC32. */ + ptr = (uint32_t*)buffer; + ptr++; + *ptr = util_hash_crc32(ptr + 1, size - 8); + + return buffer; +} + +static bool si_load_shader_binary(struct si_shader *shader, void *binary) +{ + uint32_t *ptr = (uint32_t*)binary; + uint32_t size = *ptr++; + uint32_t crc32 = *ptr++; + unsigned chunk_size; + + if (util_hash_crc32(ptr, size - 8) != crc32) { + fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n"); + return false; + } + + ptr = read_data(ptr, &shader->config, sizeof(shader->config)); + ptr = read_data(ptr, &shader->info, sizeof(shader->info)); + ptr = read_chunk(ptr, (void**)&shader->binary.code, + &shader->binary.code_size); + ptr = read_chunk(ptr, (void**)&shader->binary.rodata, + &shader->binary.rodata_size); + ptr = read_chunk(ptr, (void**)&shader->binary.relocs, &chunk_size); + shader->binary.reloc_count = chunk_size / sizeof(shader->binary.relocs[0]); + ptr = read_chunk(ptr, (void**)&shader->binary.disasm_string, &chunk_size); + + return true; +} + +/** + * Insert a shader into the cache. It's assumed the shader is not in the cache. + * Use si_shader_cache_load_shader before calling this. + * + * Returns false on failure, in which case the tgsi_binary should be freed. + */ +static bool si_shader_cache_insert_shader(struct si_screen *sscreen, + void *tgsi_binary, + struct si_shader *shader) +{ + void *hw_binary = si_get_shader_binary(shader); + + if (!hw_binary) + return false; + + if (_mesa_hash_table_insert(sscreen->shader_cache, tgsi_binary, + hw_binary) == NULL) { + FREE(hw_binary); + return false; + } + + return true; +} + +static bool si_shader_cache_load_shader(struct si_screen *sscreen, + void *tgsi_binary, + struct si_shader *shader) +{ + struct hash_entry *entry = + _mesa_hash_table_search(sscreen->shader_cache, tgsi_binary); + if (!entry) + return false; + + return si_load_shader_binary(shader, entry->data); +} + +static uint32_t si_shader_cache_key_hash(const void *key) +{ + /* The first dword is the key size. */ + return util_hash_crc32(key, *(uint32_t*)key); +} + +static bool si_shader_cache_key_equals(const void *a, const void *b) +{ + uint32_t *keya = (uint32_t*)a; + uint32_t *keyb = (uint32_t*)b; + + /* The first dword is the key size. */ + if (*keya != *keyb) + return false; + + return memcmp(keya, keyb, *keya) == 0; +} + +static void si_destroy_shader_cache_entry(struct hash_entry *entry) +{ + FREE((void*)entry->key); + FREE(entry->data); +} + +bool si_init_shader_cache(struct si_screen *sscreen) +{ + pipe_mutex_init(sscreen->shader_cache_mutex); + sscreen->shader_cache = + _mesa_hash_table_create(NULL, + si_shader_cache_key_hash, + si_shader_cache_key_equals); + return sscreen->shader_cache != NULL; +} + +void si_destroy_shader_cache(struct si_screen *sscreen) +{ + if (sscreen->shader_cache) + _mesa_hash_table_destroy(sscreen->shader_cache, + si_destroy_shader_cache_entry); + pipe_mutex_destroy(sscreen->shader_cache_mutex); +} + +/* SHADER STATES */ + static void si_set_tesseval_regs(struct si_shader *shader, struct si_pm4_state *pm4) { @@ -108,7 +319,7 @@ static void si_shader_ls(struct si_shader *shader) /* We need at least 2 components for LS. * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */ - vgpr_comp_cnt = shader->uses_instanceid ? 3 : 1; + vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 1; num_user_sgprs = SI_LS_NUM_USER_SGPR; num_sgprs = shader->config.num_sgprs; @@ -181,7 +392,7 @@ static void si_shader_es(struct si_shader *shader) si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER); if (shader->selector->type == PIPE_SHADER_VERTEX) { - vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0; + vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 0; num_user_sgprs = SI_ES_NUM_USER_SGPR; } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { vgpr_comp_cnt = 3; /* all components are needed for TES */ @@ -347,7 +558,7 @@ static void si_shader_vs(struct si_shader *shader, struct si_shader *gs) vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */ num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR; } else if (shader->selector->type == PIPE_SHADER_VERTEX) { - vgpr_comp_cnt = shader->uses_instanceid ? 3 : (enable_prim_id ? 2 : 0); + vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : (enable_prim_id ? 2 : 0); num_user_sgprs = SI_VS_NUM_USER_SGPR; } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { vgpr_comp_cnt = 3; /* all components are needed for TES */ @@ -363,19 +574,19 @@ static void si_shader_vs(struct si_shader *shader, struct si_shader *gs) assert(num_sgprs <= 104); /* VS is required to export at least one param. */ - nparams = MAX2(shader->nr_param_exports, 1); + nparams = MAX2(shader->info.nr_param_exports, 1); si_pm4_set_reg(pm4, R_0286C4_SPI_VS_OUT_CONFIG, S_0286C4_VS_EXPORT_COUNT(nparams - 1)); si_pm4_set_reg(pm4, R_02870C_SPI_SHADER_POS_FORMAT, S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | - S_02870C_POS1_EXPORT_FORMAT(shader->nr_pos_exports > 1 ? + S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP : V_02870C_SPI_SHADER_NONE) | - S_02870C_POS2_EXPORT_FORMAT(shader->nr_pos_exports > 2 ? + S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP : V_02870C_SPI_SHADER_NONE) | - S_02870C_POS3_EXPORT_FORMAT(shader->nr_pos_exports > 3 ? + S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP : V_02870C_SPI_SHADER_NONE)); @@ -415,7 +626,7 @@ static unsigned si_get_ps_num_interp(struct si_shader *ps) unsigned num_colors = !!(info->colors_read & 0x0f) + !!(info->colors_read & 0xf0); unsigned num_interp = ps->selector->info.num_inputs + - (ps->key.ps.color_two_side ? num_colors : 0); + (ps->key.ps.prolog.color_two_side ? num_colors : 0); assert(num_interp <= 32); return MIN2(num_interp, 32); @@ -423,7 +634,7 @@ static unsigned si_get_ps_num_interp(struct si_shader *ps) static unsigned si_get_spi_shader_col_format(struct si_shader *shader) { - unsigned value = shader->key.ps.spi_shader_col_format; + unsigned value = shader->key.ps.epilog.spi_shader_col_format; unsigned i, num_targets = (util_last_bit(value) + 3) / 4; /* If the i-th target format is set, all previous target formats must @@ -528,7 +739,7 @@ static void si_shader_ps(struct si_shader *shader) if (!spi_shader_col_format && !info->writes_z && !info->writes_stencil && !info->writes_samplemask && (shader->selector->info.uses_kill || - shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)) + shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)) spi_shader_col_format = V_028714_SPI_SHADER_32_R; si_pm4_set_reg(pm4, R_0286CC_SPI_PS_INPUT_ENA, input_ena); @@ -638,11 +849,13 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, switch (sel->type) { case PIPE_SHADER_VERTEX: - if (sctx->vertex_elements) - for (i = 0; i < sctx->vertex_elements->count; ++i) - key->vs.instance_divisors[i] = + if (sctx->vertex_elements) { + unsigned count = MIN2(sel->info.num_inputs, + sctx->vertex_elements->count); + for (i = 0; i < count; ++i) + key->vs.prolog.instance_divisors[i] = sctx->vertex_elements->elements[i].instance_divisor; - + } if (sctx->tes_shader.cso) key->vs.as_ls = 1; else if (sctx->gs_shader.cso) @@ -650,17 +863,17 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, if (!sctx->gs_shader.cso && sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) - key->vs.export_prim_id = 1; + key->vs.epilog.export_prim_id = 1; break; case PIPE_SHADER_TESS_CTRL: - key->tcs.prim_mode = + key->tcs.epilog.prim_mode = sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; break; case PIPE_SHADER_TESS_EVAL: if (sctx->gs_shader.cso) key->tes.as_es = 1; else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) - key->tes.export_prim_id = 1; + key->tes.epilog.export_prim_id = 1; break; case PIPE_SHADER_GEOMETRY: break; @@ -670,13 +883,13 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && sel->info.colors_written == 0x1) - key->ps.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; + key->ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; if (blend) { /* Select the shader color format based on whether * blending or alpha are needed. */ - key->ps.spi_shader_col_format = + key->ps.epilog.spi_shader_col_format = (blend->blend_enable_4bit & blend->need_src_alpha_4bit & sctx->framebuffer.spi_shader_col_format_blend_alpha) | (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & @@ -686,26 +899,26 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & sctx->framebuffer.spi_shader_col_format); } else - key->ps.spi_shader_col_format = sctx->framebuffer.spi_shader_col_format; + key->ps.epilog.spi_shader_col_format = sctx->framebuffer.spi_shader_col_format; /* If alpha-to-coverage is enabled, we have to export alpha * even if there is no color buffer. */ - if (!(key->ps.spi_shader_col_format & 0xf) && + if (!(key->ps.epilog.spi_shader_col_format & 0xf) && blend && blend->alpha_to_coverage) - key->ps.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; + key->ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; /* On SI and CIK except Hawaii, the CB doesn't clamp outputs * to the range supported by the type if a channel has less * than 16 bits and the export format is 16_ABGR. */ if (sctx->b.chip_class <= CIK && sctx->b.family != CHIP_HAWAII) - key->ps.color_is_int8 = sctx->framebuffer.color_is_int8; + key->ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8; /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ - if (!key->ps.last_cbuf) { - key->ps.spi_shader_col_format &= sel->colors_written_4bit; - key->ps.color_is_int8 &= sel->info.colors_written; + if (!key->ps.epilog.last_cbuf) { + key->ps.epilog.spi_shader_col_format &= sel->colors_written_4bit; + key->ps.epilog.color_is_int8 &= sel->info.colors_written; } if (rs) { @@ -714,31 +927,32 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES_ADJACENCY; bool is_line = !is_poly && sctx->current_rast_prim != PIPE_PRIM_POINTS; - key->ps.color_two_side = rs->two_side && sel->info.colors_read; + key->ps.prolog.color_two_side = rs->two_side && sel->info.colors_read; if (sctx->queued.named.blend) { - key->ps.alpha_to_one = sctx->queued.named.blend->alpha_to_one && - rs->multisample_enable && - !sctx->framebuffer.cb0_is_integer; + key->ps.epilog.alpha_to_one = sctx->queued.named.blend->alpha_to_one && + rs->multisample_enable && + !sctx->framebuffer.cb0_is_integer; } - key->ps.poly_stipple = rs->poly_stipple_enable && is_poly; - key->ps.poly_line_smoothing = ((is_poly && rs->poly_smooth) || - (is_line && rs->line_smooth)) && - sctx->framebuffer.nr_samples <= 1; - key->ps.clamp_color = rs->clamp_fragment_color; - - key->ps.force_persample_interp = rs->force_persample_interp && - rs->multisample_enable && - sctx->framebuffer.nr_samples > 1 && - sctx->ps_iter_samples > 1 && - (sel->info.uses_persp_center || - sel->info.uses_persp_centroid || - sel->info.uses_linear_center || - sel->info.uses_linear_centroid); + key->ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; + key->ps.epilog.poly_line_smoothing = ((is_poly && rs->poly_smooth) || + (is_line && rs->line_smooth)) && + sctx->framebuffer.nr_samples <= 1; + key->ps.epilog.clamp_color = rs->clamp_fragment_color; + + key->ps.prolog.force_persample_interp = + rs->force_persample_interp && + rs->multisample_enable && + sctx->framebuffer.nr_samples > 1 && + sctx->ps_iter_samples > 1 && + (sel->info.uses_persp_center || + sel->info.uses_persp_centroid || + sel->info.uses_linear_center || + sel->info.uses_linear_centroid); } - key->ps.alpha_func = si_get_alpha_test_func(sctx); + key->ps.epilog.alpha_func = si_get_alpha_test_func(sctx); break; } default: @@ -821,6 +1035,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx, const struct pipe_shader_state *state) { struct si_screen *sscreen = (struct si_screen *)ctx->screen; + struct si_context *sctx = (struct si_context*)ctx; struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector); int i; @@ -900,6 +1115,13 @@ static void *si_create_shader_selector(struct pipe_context *ctx, for (i = 0; i < 8; i++) if (sel->info.colors_written & (1 << i)) sel->colors_written_4bit |= 0xf << (4 * i); + + for (i = 0; i < sel->info.num_inputs; i++) { + if (sel->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR) { + int index = sel->info.input_semantic_index[i]; + sel->color_attr_index[index] = i; + } + } break; } @@ -921,6 +1143,44 @@ static void *si_create_shader_selector(struct pipe_context *ctx, break; } + /* Compile the main shader part for use with a prolog and/or epilog. */ + if (sel->type != PIPE_SHADER_GEOMETRY && + !sscreen->use_monolithic_shaders) { + struct si_shader *shader = CALLOC_STRUCT(si_shader); + void *tgsi_binary; + + if (!shader) + goto error; + + shader->selector = sel; + + tgsi_binary = si_get_tgsi_binary(sel); + + /* Try to load the shader from the shader cache. */ + pipe_mutex_lock(sscreen->shader_cache_mutex); + + if (tgsi_binary && + si_shader_cache_load_shader(sscreen, tgsi_binary, shader)) { + FREE(tgsi_binary); + } else { + /* Compile the shader if it hasn't been loaded from the cache. */ + if (si_compile_tgsi_shader(sscreen, sctx->tm, shader, false, + &sctx->b.debug) != 0) { + FREE(shader); + FREE(tgsi_binary); + pipe_mutex_unlock(sscreen->shader_cache_mutex); + goto error; + } + + if (tgsi_binary && + !si_shader_cache_insert_shader(sscreen, tgsi_binary, shader)) + FREE(tgsi_binary); + } + pipe_mutex_unlock(sscreen->shader_cache_mutex); + + sel->main_shader_part = shader; + } + /* Pre-compilation. */ if (sel->type == PIPE_SHADER_GEOMETRY || sscreen->b.debug_flags & DBG_PRECOMPILE) { @@ -934,27 +1194,29 @@ static void *si_create_shader_selector(struct pipe_context *ctx, */ switch (sel->type) { case PIPE_SHADER_TESS_CTRL: - key.tcs.prim_mode = PIPE_PRIM_TRIANGLES; + key.tcs.epilog.prim_mode = PIPE_PRIM_TRIANGLES; break; case PIPE_SHADER_FRAGMENT: - key.ps.alpha_func = PIPE_FUNC_ALWAYS; + key.ps.epilog.alpha_func = PIPE_FUNC_ALWAYS; for (i = 0; i < 8; i++) if (sel->info.colors_written & (1 << i)) - key.ps.spi_shader_col_format |= + key.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_FP16_ABGR << (i * 4); break; } - if (si_shader_select_with_key(ctx, &state, &key)) { - fprintf(stderr, "radeonsi: can't create a shader\n"); - tgsi_free_tokens(sel->tokens); - FREE(sel); - return NULL; - } + if (si_shader_select_with_key(ctx, &state, &key)) + goto error; } pipe_mutex_init(sel->mutex); return sel; + +error: + fprintf(stderr, "radeonsi: can't create a shader\n"); + tgsi_free_tokens(sel->tokens); + FREE(sel); + return NULL; } /** @@ -1119,6 +1381,9 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state) p = c; } + if (sel->main_shader_part) + si_delete_shader(sctx, sel->main_shader_part); + pipe_mutex_destroy(sel->mutex); free(sel->tokens); free(sel); @@ -1144,14 +1409,14 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx, for (j = 0; j < vsinfo->num_outputs; j++) { if (name == vsinfo->output_semantic_name[j] && index == vsinfo->output_semantic_index[j]) { - ps_input_cntl |= S_028644_OFFSET(vs->vs_output_param_offset[j]); + ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[j]); break; } } if (name == TGSI_SEMANTIC_PRIMID) /* PrimID is written after the last output. */ - ps_input_cntl |= S_028644_OFFSET(vs->vs_output_param_offset[vsinfo->num_outputs]); + ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]); else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) { /* No corresponding output found, load defaults into input. * Don't set any other bits. @@ -1191,7 +1456,7 @@ static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom) } } - if (ps->key.ps.color_two_side) { + if (ps->key.ps.prolog.color_two_side) { unsigned bcol = TGSI_SEMANTIC_BCOLOR; for (i = 0; i < 2; i++) { @@ -1745,8 +2010,8 @@ bool si_update_shaders(struct si_context *sctx) si_mark_atom_dirty(sctx, &sctx->db_render_state); } - if (sctx->smoothing_enabled != sctx->ps_shader.current->key.ps.poly_line_smoothing) { - sctx->smoothing_enabled = sctx->ps_shader.current->key.ps.poly_line_smoothing; + if (sctx->smoothing_enabled != sctx->ps_shader.current->key.ps.epilog.poly_line_smoothing) { + sctx->smoothing_enabled = sctx->ps_shader.current->key.ps.epilog.poly_line_smoothing; si_mark_atom_dirty(sctx, &sctx->msaa_config); if (sctx->b.chip_class == SI) |