diff options
Diffstat (limited to 'src/gallium/drivers/radeonsi/si_shader.c')
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.c | 1784 |
1 files changed, 1623 insertions, 161 deletions
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index baa1090e2fb..57458ae1381 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -70,6 +70,12 @@ struct si_shader_context unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */ bool is_gs_copy_shader; + + /* Whether to generate the optimized shader variant compiled as a whole + * (without a prolog and epilog) + */ + bool is_monolithic; + int param_streamout_config; int param_streamout_write_index; int param_streamout_offset[4]; @@ -77,6 +83,7 @@ struct si_shader_context int param_rel_auto_id; int param_vs_prim_id; int param_instance_id; + int param_vertex_index0; int param_tes_u; int param_tes_v; int param_tes_rel_patch_id; @@ -96,14 +103,17 @@ struct si_shader_context LLVMValueRef esgs_ring; LLVMValueRef gsvs_ring[4]; LLVMValueRef gs_next_vertex[4]; + LLVMValueRef return_value; LLVMTypeRef voidt; LLVMTypeRef i1; LLVMTypeRef i8; LLVMTypeRef i32; + LLVMTypeRef i64; LLVMTypeRef i128; LLVMTypeRef f32; LLVMTypeRef v16i8; + LLVMTypeRef v2i32; LLVMTypeRef v4i32; LLVMTypeRef v4f32; LLVMTypeRef v8i32; @@ -118,9 +128,17 @@ static struct si_shader_context *si_shader_context( static void si_init_shader_ctx(struct si_shader_context *ctx, struct si_screen *sscreen, struct si_shader *shader, - LLVMTargetMachineRef tm, - struct tgsi_shader_info *info); + LLVMTargetMachineRef tm); +/* Ideally pass the sample mask input to the PS epilog as v13, which + * is its usual location, so that the shader doesn't have to add v_mov. + */ +#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13 + +/* The VS location of the PrimitiveID input is the same in the epilog, + * so that the main shader part doesn't have to move it. + */ +#define VS_EPILOG_PRIMID_LOC 2 #define PERSPECTIVE_BASE 0 #define LINEAR_BASE 9 @@ -196,6 +214,10 @@ static LLVMValueRef unpack_param(struct si_shader_context *ctx, LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn, param); + if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) + value = bitcast(&ctx->radeon_bld.soa.bld_base, + TGSI_TYPE_UNSIGNED, value); + if (rshift) value = LLVMBuildLShr(gallivm->builder, value, lp_build_const_int32(gallivm, rshift), ""); @@ -375,7 +397,7 @@ static LLVMValueRef build_indexed_load_const( static LLVMValueRef get_instance_index_for_fetch( struct radeon_llvm_context *radeon_bld, - unsigned divisor) + unsigned param_start_instance, unsigned divisor) { struct si_shader_context *ctx = si_shader_context(&radeon_bld->soa.bld_base); @@ -389,8 +411,8 @@ static LLVMValueRef get_instance_index_for_fetch( result = LLVMBuildUDiv(gallivm->builder, result, lp_build_const_int32(gallivm, divisor), ""); - return LLVMBuildAdd(gallivm->builder, result, LLVMGetParam( - radeon_bld->main_fn, SI_PARAM_START_INSTANCE), ""); + return LLVMBuildAdd(gallivm->builder, result, + LLVMGetParam(radeon_bld->main_fn, param_start_instance), ""); } static void declare_input_vs( @@ -402,7 +424,8 @@ static void declare_input_vs( struct gallivm_state *gallivm = base->gallivm; struct si_shader_context *ctx = si_shader_context(&radeon_bld->soa.bld_base); - unsigned divisor = ctx->shader->key.vs.instance_divisors[input_index]; + unsigned divisor = + ctx->shader->key.vs.prolog.instance_divisors[input_index]; unsigned chan; @@ -424,10 +447,16 @@ static void declare_input_vs( /* Build the attribute offset */ attribute_offset = lp_build_const_int32(gallivm, 0); - if (divisor) { + if (!ctx->is_monolithic) { + buffer_index = LLVMGetParam(radeon_bld->main_fn, + ctx->param_vertex_index0 + + input_index); + } else if (divisor) { /* Build index from instance ID, start instance and divisor */ - ctx->shader->uses_instanceid = true; - buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld, divisor); + ctx->shader->info.uses_instanceid = true; + buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld, + SI_PARAM_START_INSTANCE, + divisor); } else { /* Load the buffer index for vertices. */ LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn, @@ -853,7 +882,8 @@ static int lookup_interp_param_index(unsigned interpolate, unsigned location) static unsigned select_interp_param(struct si_shader_context *ctx, unsigned param) { - if (!ctx->shader->key.ps.force_persample_interp) + if (!ctx->shader->key.ps.prolog.force_persample_interp || + !ctx->is_monolithic) return param; /* If the shader doesn't use center/centroid, just return the parameter. @@ -923,7 +953,7 @@ static void interp_fs_input(struct si_shader_context *ctx, intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant"; if (semantic_name == TGSI_SEMANTIC_COLOR && - ctx->shader->key.ps.color_two_side) { + ctx->shader->key.ps.prolog.color_two_side) { LLVMValueRef args[4]; LLVMValueRef is_face_positive; LLVMValueRef back_attr_number; @@ -997,6 +1027,7 @@ static void declare_input_fs( unsigned input_index, const struct tgsi_full_declaration *decl) { + struct lp_build_context *base = &radeon_bld->soa.bld_base.base; struct si_shader_context *ctx = si_shader_context(&radeon_bld->soa.bld_base); struct si_shader *shader = ctx->shader; @@ -1004,6 +1035,26 @@ static void declare_input_fs( LLVMValueRef interp_param = NULL; int interp_param_idx; + /* Get colors from input VGPRs (set by the prolog). */ + if (!ctx->is_monolithic && + decl->Semantic.Name == TGSI_SEMANTIC_COLOR) { + unsigned i = decl->Semantic.Index; + unsigned colors_read = shader->selector->info.colors_read; + unsigned mask = colors_read >> (i * 4); + unsigned offset = SI_PARAM_POS_FIXED_PT + 1 + + (i ? util_bitcount(colors_read & 0xf) : 0); + + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] = + mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef; + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] = + mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef; + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] = + mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef; + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] = + mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef; + return; + } + interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate, decl->Interp.Location); if (interp_param_idx == -1) @@ -1330,12 +1381,12 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { const union si_shader_key *key = &ctx->shader->key; - unsigned col_formats = key->ps.spi_shader_col_format; + unsigned col_formats = key->ps.epilog.spi_shader_col_format; int cbuf = target - V_008DFC_SQ_EXP_MRT; assert(cbuf >= 0 && cbuf < 8); spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; - is_int8 = (key->ps.color_is_int8 >> cbuf) & 0x1; + is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1; } args[4] = uint->zero; /* COMPR flag */ @@ -1488,13 +1539,13 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base, struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; - if (ctx->shader->key.ps.alpha_func != PIPE_FUNC_NEVER) { + if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_ALPHA_REF); LLVMValueRef alpha_pass = lp_build_cmp(&bld_base->base, - ctx->shader->key.ps.alpha_func, + ctx->shader->key.ps.epilog.alpha_func, alpha, alpha_ref); LLVMValueRef arg = lp_build_select(&bld_base->base, @@ -1511,7 +1562,8 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base, } static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base, - LLVMValueRef alpha) + LLVMValueRef alpha, + unsigned samplemask_param) { struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; @@ -1519,7 +1571,7 @@ static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context * /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ coverage = LLVMGetParam(ctx->radeon_bld.main_fn, - SI_PARAM_SAMPLE_COVERAGE); + samplemask_param); coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage); coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32", @@ -1841,7 +1893,8 @@ handle_semantic: case TGSI_SEMANTIC_COLOR: case TGSI_SEMANTIC_BCOLOR: target = V_008DFC_SQ_EXP_PARAM + param_count; - shader->vs_output_param_offset[i] = param_count; + assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); + shader->info.vs_output_param_offset[i] = param_count; param_count++; break; case TGSI_SEMANTIC_CLIPDIST: @@ -1855,7 +1908,8 @@ handle_semantic: case TGSI_SEMANTIC_TEXCOORD: case TGSI_SEMANTIC_GENERIC: target = V_008DFC_SQ_EXP_PARAM + param_count; - shader->vs_output_param_offset[i] = param_count; + assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); + shader->info.vs_output_param_offset[i] = param_count; param_count++; break; default: @@ -1883,7 +1937,7 @@ handle_semantic: } } - shader->nr_param_exports = param_count; + shader->info.nr_param_exports = param_count; /* We need to add the position output manually if it's missing. */ if (!pos_args[0][0]) { @@ -1945,7 +1999,7 @@ handle_semantic: for (i = 0; i < 4; i++) if (pos_args[i][0]) - shader->nr_pos_exports++; + shader->info.nr_pos_exports++; pos_idx = 0; for (i = 0; i < 4; i++) { @@ -1955,7 +2009,7 @@ handle_semantic: /* Specify the target we are exporting */ pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++); - if (pos_idx == shader->nr_pos_exports) + if (pos_idx == shader->info.nr_pos_exports) /* Specify that this is the last export */ pos_args[i][2] = uint->one; @@ -1989,7 +2043,7 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, invocation_id, bld_base->uint_bld.zero, "")); /* Determine the layout of one tess factor element in the buffer. */ - switch (shader->key.tcs.prim_mode) { + switch (shader->key.tcs.epilog.prim_mode) { case PIPE_PRIM_LINES: stride = 2; /* 2 dwords, 1 vec2 store */ outer_comps = 2; @@ -2061,14 +2115,51 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef invocation_id; + LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; + rel_patch_id = get_rel_patch_id(ctx); invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5); + tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); - si_write_tess_factors(bld_base, - get_rel_patch_id(ctx), - invocation_id, - get_tcs_out_current_patch_data_offset(ctx)); + if (!ctx->is_monolithic) { + /* Return epilog parameters from this function. */ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef ret = ctx->return_value; + LLVMValueRef rw_buffers, rw0, rw1, tf_soffset; + unsigned vgpr; + + /* RW_BUFFERS pointer */ + rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_RW_BUFFERS); + rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, ""); + rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, ""); + rw0 = LLVMBuildExtractElement(builder, rw_buffers, + bld_base->uint_bld.zero, ""); + rw1 = LLVMBuildExtractElement(builder, rw_buffers, + bld_base->uint_bld.one, ""); + ret = LLVMBuildInsertValue(builder, ret, rw0, 0, ""); + ret = LLVMBuildInsertValue(builder, ret, rw1, 1, ""); + + /* Tess factor buffer soffset is after user SGPRs. */ + tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_TESS_FACTOR_OFFSET); + ret = LLVMBuildInsertValue(builder, ret, tf_soffset, + SI_TCS_NUM_USER_SGPR, ""); + + /* VGPRs */ + rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id); + invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id); + tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset); + + vgpr = SI_TCS_NUM_USER_SGPR + 1; + ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); + ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); + ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); + ctx->return_value = ret; + return; + } + + si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset); } static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base) @@ -2214,16 +2305,26 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base) ""); } - /* Export PrimitiveID when PS needs it. */ - if (si_vs_exports_prim_id(ctx->shader)) { - outputs[i].name = TGSI_SEMANTIC_PRIMID; - outputs[i].sid = 0; - outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, - get_primitive_id(bld_base, 0)); - outputs[i].values[1] = bld_base->base.undef; - outputs[i].values[2] = bld_base->base.undef; - outputs[i].values[3] = bld_base->base.undef; - i++; + if (ctx->is_monolithic) { + /* Export PrimitiveID when PS needs it. */ + if (si_vs_exports_prim_id(ctx->shader)) { + outputs[i].name = TGSI_SEMANTIC_PRIMID; + outputs[i].sid = 0; + outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, + get_primitive_id(bld_base, 0)); + outputs[i].values[1] = bld_base->base.undef; + outputs[i].values[2] = bld_base->base.undef; + outputs[i].values[3] = bld_base->base.undef; + i++; + } + } else { + /* Return the primitive ID from the LLVM function. */ + ctx->return_value = + LLVMBuildInsertValue(gallivm->builder, + ctx->return_value, + bitcast(bld_base, TGSI_TYPE_FLOAT, + get_primitive_id(bld_base, 0)), + VS_EPILOG_PRIMID_LOC, ""); } si_llvm_export_vs(bld_base, outputs, i); @@ -2284,6 +2385,7 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base, static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, LLVMValueRef *color, unsigned index, + unsigned samplemask_param, bool is_last) { struct si_shader_context *ctx = si_shader_context(bld_base); @@ -2291,30 +2393,31 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, int i; /* Clamp color */ - if (ctx->shader->key.ps.clamp_color) + if (ctx->shader->key.ps.epilog.clamp_color) for (i = 0; i < 4; i++) color[i] = radeon_llvm_saturate(bld_base, color[i]); /* Alpha to one */ - if (ctx->shader->key.ps.alpha_to_one) + if (ctx->shader->key.ps.epilog.alpha_to_one) color[3] = base->one; /* Alpha test */ if (index == 0 && - ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS) + ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) si_alpha_test(bld_base, color[3]); /* Line & polygon smoothing */ - if (ctx->shader->key.ps.poly_line_smoothing) - color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]); + if (ctx->shader->key.ps.epilog.poly_line_smoothing) + color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3], + samplemask_param); /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ - if (ctx->shader->key.ps.last_cbuf > 0) { + if (ctx->shader->key.ps.epilog.last_cbuf > 0) { LLVMValueRef args[8][9]; int c, last = -1; /* Get the export arguments, also find out what the last one is. */ - for (c = 0; c <= ctx->shader->key.ps.last_cbuf; c++) { + for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) { si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + c, args[c]); if (args[c][0] != bld_base->uint_bld.zero) @@ -2322,7 +2425,7 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, } /* Emit all exports. */ - for (c = 0; c <= ctx->shader->key.ps.last_cbuf; c++) { + for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) { if (is_last && last == c) { args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */ args[c][2] = bld_base->uint_bld.one; /* DONE bit */ @@ -2385,11 +2488,11 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base) * Otherwise, find the last color export. */ if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) { - unsigned spi_format = shader->key.ps.spi_shader_col_format; + unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format; /* Don't export NULL and return if alpha-test is enabled. */ - if (shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS && - shader->key.ps.alpha_func != PIPE_FUNC_NEVER && + if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS && + shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER && (spi_format & 0xf) == 0) spi_format |= V_028714_SPI_SHADER_32_AR; @@ -2400,10 +2503,10 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base) continue; /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ - if (shader->key.ps.last_cbuf > 0) { + if (shader->key.ps.epilog.last_cbuf > 0) { /* Just set this if any of the colorbuffers are enabled. */ if (spi_format & - ((1llu << (4 * (shader->key.ps.last_cbuf + 1))) - 1)) + ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1)) last_color_export = i; continue; } @@ -2445,6 +2548,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base) ctx->radeon_bld.soa.outputs[i][j], ""); si_export_mrt_color(bld_base, color, semantic_index, + SI_PARAM_SAMPLE_COVERAGE, last_color_export == i); break; default: @@ -2458,6 +2562,100 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base) si_export_mrt_z(bld_base, depth, stencil, samplemask); } +/** + * Return PS outputs in this order: + * + * v[0:3] = color0.xyzw + * v[4:7] = color1.xyzw + * ... + * vN+0 = Depth + * vN+1 = Stencil + * vN+2 = SampleMask + * vN+3 = SampleMaskIn (used for OpenGL smoothing) + * + * The alpha-ref SGPR is returned via its original location. + */ +static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + struct si_shader *shader = ctx->shader; + struct lp_build_context *base = &bld_base->base; + struct tgsi_shader_info *info = &shader->selector->info; + LLVMBuilderRef builder = base->gallivm->builder; + unsigned i, j, first_vgpr, vgpr; + + LLVMValueRef color[8][4] = {}; + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + LLVMValueRef ret; + + /* Read the output values. */ + for (i = 0; i < info->num_outputs; i++) { + unsigned semantic_name = info->output_semantic_name[i]; + unsigned semantic_index = info->output_semantic_index[i]; + + switch (semantic_name) { + case TGSI_SEMANTIC_COLOR: + assert(semantic_index < 8); + for (j = 0; j < 4; j++) { + LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j]; + LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); + color[semantic_index][j] = result; + } + break; + case TGSI_SEMANTIC_POSITION: + depth = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][2], ""); + break; + case TGSI_SEMANTIC_STENCIL: + stencil = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][1], ""); + break; + case TGSI_SEMANTIC_SAMPLEMASK: + samplemask = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][0], ""); + break; + default: + fprintf(stderr, "Warning: SI unhandled fs output type:%d\n", + semantic_name); + } + } + + /* Fill the return structure. */ + ret = ctx->return_value; + + /* Set SGPRs. */ + ret = LLVMBuildInsertValue(builder, ret, + bitcast(bld_base, TGSI_TYPE_SIGNED, + LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_ALPHA_REF)), + SI_SGPR_ALPHA_REF, ""); + + /* Set VGPRs */ + first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; + for (i = 0; i < ARRAY_SIZE(color); i++) { + if (!color[i][0]) + continue; + + for (j = 0; j < 4; j++) + ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); + } + if (depth) + ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); + if (stencil) + ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); + if (samplemask) + ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); + + /* Add the input sample mask for smoothing at the end. */ + if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) + vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; + ret = LLVMBuildInsertValue(builder, ret, + LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_SAMPLE_COVERAGE), vgpr++, ""); + + ctx->return_value = ret; +} + static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data); @@ -2536,13 +2734,12 @@ static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements) /** * Load an image view, fmask view. or sampler state descriptor. */ -static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx, - LLVMValueRef index, enum desc_type type) +static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx, + LLVMValueRef list, LLVMValueRef index, + enum desc_type type) { struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, - SI_PARAM_SAMPLERS); switch (type) { case DESC_IMAGE: @@ -2558,12 +2755,21 @@ static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx, /* The sampler state is at [12:15]. */ index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), ""); - ptr = LLVMBuildPointerCast(builder, ptr, - const_array(ctx->v4i32, 0), ""); + list = LLVMBuildPointerCast(builder, list, + const_array(ctx->v4i32, 0), ""); break; } - return build_indexed_load_const(ctx, ptr, index); + return build_indexed_load_const(ctx, list, index); +} + +static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx, + LLVMValueRef index, enum desc_type type) +{ + LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_SAMPLERS); + + return get_sampler_desc_custom(ctx, list, index, type); } static void tex_fetch_ptrs( @@ -3546,6 +3752,30 @@ static const struct lp_build_tgsi_action interp_action = { .emit = build_interp_intrinsic, }; +static void si_create_function(struct si_shader_context *ctx, + LLVMTypeRef *returns, unsigned num_returns, + LLVMTypeRef *params, unsigned num_params, + int last_array_pointer, int last_sgpr) +{ + int i; + + radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns, + params, num_params); + radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type); + ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type); + + for (i = 0; i <= last_sgpr; ++i) { + LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i); + + /* We tell llvm that array inputs are passed by value to allow Sinking pass + * to move load. Inputs are constant so this is fine. */ + if (i <= last_array_pointer) + LLVMAddAttribute(P, LLVMByValAttribute); + else + LLVMAddAttribute(P, LLVMInRegAttribute); + } +} + static void create_meta_data(struct si_shader_context *ctx) { struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm; @@ -3579,15 +3809,57 @@ static void declare_streamout_params(struct si_shader_context *ctx, } } +static unsigned llvm_get_type_size(LLVMTypeRef type) +{ + LLVMTypeKind kind = LLVMGetTypeKind(type); + + switch (kind) { + case LLVMIntegerTypeKind: + return LLVMGetIntTypeWidth(type) / 8; + case LLVMFloatTypeKind: + return 4; + case LLVMPointerTypeKind: + return 8; + case LLVMVectorTypeKind: + return LLVMGetVectorSize(type) * + llvm_get_type_size(LLVMGetElementType(type)); + default: + assert(0); + return 0; + } +} + +static void declare_tess_lds(struct si_shader_context *ctx) +{ + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; + LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type; + + /* This is the upper bound, maximum is 32 inputs times 32 vertices */ + unsigned vertex_data_dw_size = 32*32*4; + unsigned patch_data_dw_size = 32*4; + /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */ + unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size; + unsigned lds_dwords = patch_dw_size; + + /* The actual size is computed outside of the shader to reduce + * the number of shader variants. */ + ctx->lds = + LLVMAddGlobalInAddressSpace(gallivm->module, + LLVMArrayType(i32, lds_dwords), + "tess_lds", + LOCAL_ADDR_SPACE); +} + static void create_function(struct si_shader_context *ctx) { struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; struct gallivm_state *gallivm = bld_base->base.gallivm; struct si_shader *shader = ctx->shader; - LLVMTypeRef params[SI_NUM_PARAMS], v2i32, v3i32; - unsigned i, last_array_pointer, last_sgpr, num_params; + LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32; + LLVMTypeRef returns[16+32*4]; + unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs; + unsigned num_returns = 0; - v2i32 = LLVMVectorType(ctx->i32, 2); v3i32 = LLVMVectorType(ctx->i32, 3); params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS); @@ -3630,6 +3902,20 @@ static void create_function(struct si_shader_context *ctx) params[ctx->param_rel_auto_id = num_params++] = ctx->i32; params[ctx->param_vs_prim_id = num_params++] = ctx->i32; params[ctx->param_instance_id = num_params++] = ctx->i32; + + if (!ctx->is_monolithic && + !ctx->is_gs_copy_shader) { + /* Vertex load indices. */ + ctx->param_vertex_index0 = num_params; + + for (i = 0; i < shader->selector->info.num_inputs; i++) + params[num_params++] = ctx->i32; + + /* PrimitiveID output. */ + if (!shader->key.vs.as_es && !shader->key.vs.as_ls) + for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++) + returns[num_returns++] = ctx->f32; + } break; case TGSI_PROCESSOR_TESS_CTRL: @@ -3643,6 +3929,15 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_PATCH_ID] = ctx->i32; params[SI_PARAM_REL_IDS] = ctx->i32; num_params = SI_PARAM_REL_IDS+1; + + if (!ctx->is_monolithic) { + /* PARAM_TESS_FACTOR_OFFSET is after user SGPRs. */ + for (i = 0; i <= SI_TCS_NUM_USER_SGPR; i++) + returns[num_returns++] = ctx->i32; /* SGPRs */ + + for (i = 0; i < 3; i++) + returns[num_returns++] = ctx->f32; /* VGPRs */ + } break; case TGSI_PROCESSOR_TESS_EVAL: @@ -3663,6 +3958,11 @@ static void create_function(struct si_shader_context *ctx) params[ctx->param_tes_v = num_params++] = ctx->f32; params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32; params[ctx->param_tes_patch_id = num_params++] = ctx->i32; + + /* PrimitiveID output. */ + if (!ctx->is_monolithic && !shader->key.tes.as_es) + for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++) + returns[num_returns++] = ctx->f32; break; case TGSI_PROCESSOR_GEOMETRY: @@ -3686,13 +3986,13 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_ALPHA_REF] = ctx->f32; params[SI_PARAM_PRIM_MASK] = ctx->i32; last_sgpr = SI_PARAM_PRIM_MASK; - params[SI_PARAM_PERSP_SAMPLE] = v2i32; - params[SI_PARAM_PERSP_CENTER] = v2i32; - params[SI_PARAM_PERSP_CENTROID] = v2i32; + params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32; + params[SI_PARAM_PERSP_CENTER] = ctx->v2i32; + params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32; params[SI_PARAM_PERSP_PULL_MODEL] = v3i32; - params[SI_PARAM_LINEAR_SAMPLE] = v2i32; - params[SI_PARAM_LINEAR_CENTER] = v2i32; - params[SI_PARAM_LINEAR_CENTROID] = v2i32; + params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32; + params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32; + params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32; params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32; params[SI_PARAM_POS_X_FLOAT] = ctx->f32; params[SI_PARAM_POS_Y_FLOAT] = ctx->f32; @@ -3701,8 +4001,39 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_FRONT_FACE] = ctx->i32; params[SI_PARAM_ANCILLARY] = ctx->i32; params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32; - params[SI_PARAM_POS_FIXED_PT] = ctx->f32; + params[SI_PARAM_POS_FIXED_PT] = ctx->i32; num_params = SI_PARAM_POS_FIXED_PT+1; + + if (!ctx->is_monolithic) { + /* Color inputs from the prolog. */ + if (shader->selector->info.colors_read) { + unsigned num_color_elements = + util_bitcount(shader->selector->info.colors_read); + + assert(num_params + num_color_elements <= ARRAY_SIZE(params)); + for (i = 0; i < num_color_elements; i++) + params[num_params++] = ctx->f32; + } + + /* Outputs for the epilog. */ + num_return_sgprs = SI_SGPR_ALPHA_REF + 1; + num_returns = + num_return_sgprs + + util_bitcount(shader->selector->info.colors_written) * 4 + + shader->selector->info.writes_z + + shader->selector->info.writes_stencil + + shader->selector->info.writes_samplemask + + 1 /* SampleMaskIn */; + + num_returns = MAX2(num_returns, + num_return_sgprs + + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + for (i = 0; i < num_return_sgprs; i++) + returns[i] = ctx->i32; + for (; i < num_returns; i++) + returns[i] = ctx->f32; + } break; default: @@ -3711,20 +4042,38 @@ static void create_function(struct si_shader_context *ctx) } assert(num_params <= Elements(params)); - radeon_llvm_create_func(&ctx->radeon_bld, params, num_params); - radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type); - - for (i = 0; i <= last_sgpr; ++i) { - LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i); - /* We tell llvm that array inputs are passed by value to allow Sinking pass - * to move load. Inputs are constant so this is fine. */ - if (i <= last_array_pointer) - LLVMAddAttribute(P, LLVMByValAttribute); - else - LLVMAddAttribute(P, LLVMInRegAttribute); + si_create_function(ctx, returns, num_returns, params, + num_params, last_array_pointer, last_sgpr); + + /* Reserve register locations for VGPR inputs the PS prolog may need. */ + if (ctx->type == TGSI_PROCESSOR_FRAGMENT && + !ctx->is_monolithic) { + radeon_llvm_add_attribute(ctx->radeon_bld.main_fn, + "InitialPSInputAddr", + S_0286D0_PERSP_SAMPLE_ENA(1) | + S_0286D0_PERSP_CENTER_ENA(1) | + S_0286D0_PERSP_CENTROID_ENA(1) | + S_0286D0_LINEAR_SAMPLE_ENA(1) | + S_0286D0_LINEAR_CENTER_ENA(1) | + S_0286D0_LINEAR_CENTROID_ENA(1) | + S_0286D0_FRONT_FACE_ENA(1) | + S_0286D0_POS_FIXED_PT_ENA(1)); } + shader->info.num_input_sgprs = 0; + shader->info.num_input_vgprs = 0; + + for (i = 0; i <= last_sgpr; ++i) + shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4; + + /* Unused fragment shader inputs are eliminated by the compiler, + * so we don't know yet how many there will be. + */ + if (ctx->type != TGSI_PROCESSOR_FRAGMENT) + for (; i < num_params; ++i) + shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4; + if (bld_base->info && (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 || bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 || @@ -3740,22 +4089,8 @@ static void create_function(struct si_shader_context *ctx) if ((ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) || ctx->type == TGSI_PROCESSOR_TESS_CTRL || - ctx->type == TGSI_PROCESSOR_TESS_EVAL) { - /* This is the upper bound, maximum is 32 inputs times 32 vertices */ - unsigned vertex_data_dw_size = 32*32*4; - unsigned patch_data_dw_size = 32*4; - /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */ - unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size; - unsigned lds_dwords = patch_dw_size; - - /* The actual size is computed outside of the shader to reduce - * the number of shader variants. */ - ctx->lds = - LLVMAddGlobalInAddressSpace(gallivm->module, - LLVMArrayType(ctx->i32, lds_dwords), - "tess_lds", - LOCAL_ADDR_SPACE); - } + ctx->type == TGSI_PROCESSOR_TESS_EVAL) + declare_tess_lds(ctx); } static void preload_constants(struct si_shader_context *ctx) @@ -3887,6 +4222,49 @@ static void preload_ring_buffers(struct si_shader_context *ctx) } } +static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, + LLVMValueRef param_sampler_views, + unsigned param_pos_fixed_pt) +{ + struct lp_build_tgsi_context *bld_base = + &ctx->radeon_bld.soa.bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; + struct lp_build_emit_data result = {}; + struct tgsi_full_instruction inst = {}; + LLVMValueRef desc, sampler_index, address[2], pix; + + /* Use the fixed-point gl_FragCoord input. + * Since the stipple pattern is 32x32 and it repeats, just get 5 bits + * per coordinate to get the repeating effect. + */ + address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5); + address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5); + + /* Load the sampler view descriptor. */ + sampler_index = lp_build_const_int32(gallivm, SI_POLY_STIPPLE_SAMPLER); + desc = get_sampler_desc_custom(ctx, param_sampler_views, + sampler_index, DESC_IMAGE); + + /* Load the texel. */ + inst.Instruction.Opcode = TGSI_OPCODE_TXF; + inst.Texture.Texture = TGSI_TEXTURE_2D_MSAA; /* = use load, not load_mip */ + result.inst = &inst; + set_tex_fetch_args(ctx, &result, TGSI_OPCODE_TXF, + inst.Texture.Texture, + desc, NULL, address, ARRAY_SIZE(address), 0xf); + build_tex_intrinsic(&tex_action, bld_base, &result); + + /* Kill the thread accordingly. */ + pix = LLVMBuildExtractElement(gallivm->builder, result.output[0], + lp_build_const_int32(gallivm, 3), ""); + pix = bitcast(bld_base, TGSI_TYPE_FLOAT, pix); + pix = LLVMBuildFNeg(gallivm->builder, pix, ""); + + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", + LLVMVoidTypeInContext(gallivm->context), + &pix, 1, 0); +} + void si_shader_binary_read_config(struct radeon_shader_binary *binary, struct si_shader_config *conf, unsigned symbol_offset) @@ -3972,41 +4350,70 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx, } } +static unsigned si_get_shader_binary_size(struct si_shader *shader) +{ + unsigned size = shader->binary.code_size; + + if (shader->prolog) + size += shader->prolog->binary.code_size; + if (shader->epilog) + size += shader->epilog->binary.code_size; + return size; +} + int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) { - const struct radeon_shader_binary *binary = &shader->binary; - unsigned code_size = binary->code_size + binary->rodata_size; + const struct radeon_shader_binary *prolog = + shader->prolog ? &shader->prolog->binary : NULL; + const struct radeon_shader_binary *epilog = + shader->epilog ? &shader->epilog->binary : NULL; + const struct radeon_shader_binary *mainb = &shader->binary; + unsigned bo_size = si_get_shader_binary_size(shader) + + (!epilog ? mainb->rodata_size : 0); unsigned char *ptr; + assert(!prolog || !prolog->rodata_size); + assert((!prolog && !epilog) || !mainb->rodata_size); + assert(!epilog || !epilog->rodata_size); + r600_resource_reference(&shader->bo, NULL); shader->bo = si_resource_create_custom(&sscreen->b.b, PIPE_USAGE_IMMUTABLE, - code_size); + bo_size); if (!shader->bo) return -ENOMEM; + /* Upload. */ ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL, PIPE_TRANSFER_READ_WRITE); - util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size); - if (binary->rodata_size > 0) { - ptr += binary->code_size; - util_memcpy_cpu_to_le32(ptr, binary->rodata, - binary->rodata_size); + + if (prolog) { + util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size); + ptr += prolog->code_size; } + util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size); + ptr += mainb->code_size; + + if (epilog) + util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size); + else if (mainb->rodata_size > 0) + util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size); + sscreen->b.ws->buffer_unmap(shader->bo->buf); return 0; } static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary, - struct pipe_debug_callback *debug) + struct pipe_debug_callback *debug, + const char *name) { char *line, *p; unsigned i, count; if (binary->disasm_string) { - fprintf(stderr, "\nShader Disassembly:\n\n"); - fprintf(stderr, "%s\n", binary->disasm_string); + fprintf(stderr, "Shader %s disassembly:\n", name); + fprintf(stderr, "%s", binary->disasm_string); if (debug && debug->debug_message) { /* Very long debug messages are cut off, so send the @@ -4036,7 +4443,7 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary "Shader Disassembly End"); } } else { - fprintf(stderr, "SI CODE:\n"); + fprintf(stderr, "Shader %s binary:\n", name); for (i = 0; i < binary->code_size; i += 4) { fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3], binary->code[i + 2], @@ -4115,16 +4522,60 @@ static void si_shader_dump_stats(struct si_screen *sscreen, max_simd_waves); } +static const char *si_get_shader_name(struct si_shader *shader, + unsigned processor) +{ + switch (processor) { + case TGSI_PROCESSOR_VERTEX: + if (shader->key.vs.as_es) + return "Vertex Shader as ES"; + else if (shader->key.vs.as_ls) + return "Vertex Shader as LS"; + else + return "Vertex Shader as VS"; + case TGSI_PROCESSOR_TESS_CTRL: + return "Tessellation Control Shader"; + case TGSI_PROCESSOR_TESS_EVAL: + if (shader->key.tes.as_es) + return "Tessellation Evaluation Shader as ES"; + else + return "Tessellation Evaluation Shader as VS"; + case TGSI_PROCESSOR_GEOMETRY: + if (shader->gs_copy_shader == NULL) + return "GS Copy Shader as VS"; + else + return "Geometry Shader"; + case TGSI_PROCESSOR_FRAGMENT: + return "Pixel Shader"; + case TGSI_PROCESSOR_COMPUTE: + return "Compute Shader"; + default: + return "Unknown Shader"; + } +} + void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, struct pipe_debug_callback *debug, unsigned processor) { - if (r600_can_dump_shader(&sscreen->b, processor)) - if (!(sscreen->b.debug_flags & DBG_NO_ASM)) - si_shader_dump_disassembly(&shader->binary, debug); + if (r600_can_dump_shader(&sscreen->b, processor) && + !(sscreen->b.debug_flags & DBG_NO_ASM)) { + fprintf(stderr, "\n%s:\n", si_get_shader_name(shader, processor)); + + if (shader->prolog) + si_shader_dump_disassembly(&shader->prolog->binary, + debug, "prolog"); + + si_shader_dump_disassembly(&shader->binary, debug, "main"); + + if (shader->epilog) + si_shader_dump_disassembly(&shader->epilog->binary, + debug, "epilog"); + fprintf(stderr, "\n"); + } si_shader_dump_stats(sscreen, &shader->config, shader->selector ? shader->selector->info.num_inputs : 0, - shader->binary.code_size, debug, processor); + si_get_shader_binary_size(shader), debug, processor); } int si_compile_llvm(struct si_screen *sscreen, @@ -4177,6 +4628,19 @@ int si_compile_llvm(struct si_screen *sscreen, FREE(binary->global_symbol_offsets); binary->config = NULL; binary->global_symbol_offsets = NULL; + + /* Some shaders can't have rodata because their binaries can be + * concatenated. + */ + if (binary->rodata_size && + (processor == TGSI_PROCESSOR_VERTEX || + processor == TGSI_PROCESSOR_TESS_CTRL || + processor == TGSI_PROCESSOR_TESS_EVAL || + processor == TGSI_PROCESSOR_FRAGMENT)) { + fprintf(stderr, "radeonsi: The shader can't have rodata."); + return -EINVAL; + } + return r; } @@ -4196,7 +4660,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0])); - si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm, gsinfo); + si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm); ctx->type = TGSI_PROCESSOR_VERTEX; ctx->is_gs_copy_shader = true; @@ -4241,7 +4705,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs); - LLVMBuildRetVoid(bld_base->base.gallivm->builder); + LLVMBuildRet(gallivm->builder, ctx->return_value); /* Dump LLVM IR before any optimization passes */ if (sscreen->b.debug_flags & DBG_PREOPT_IR && @@ -4278,35 +4742,38 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) switch (shader) { case PIPE_SHADER_VERTEX: fprintf(f, " instance_divisors = {"); - for (i = 0; i < Elements(key->vs.instance_divisors); i++) + for (i = 0; i < Elements(key->vs.prolog.instance_divisors); i++) fprintf(f, !i ? "%u" : ", %u", - key->vs.instance_divisors[i]); + key->vs.prolog.instance_divisors[i]); fprintf(f, "}\n"); fprintf(f, " as_es = %u\n", key->vs.as_es); fprintf(f, " as_ls = %u\n", key->vs.as_ls); - fprintf(f, " export_prim_id = %u\n", key->vs.export_prim_id); + fprintf(f, " export_prim_id = %u\n", key->vs.epilog.export_prim_id); break; case PIPE_SHADER_TESS_CTRL: - fprintf(f, " prim_mode = %u\n", key->tcs.prim_mode); + fprintf(f, " prim_mode = %u\n", key->tcs.epilog.prim_mode); break; case PIPE_SHADER_TESS_EVAL: fprintf(f, " as_es = %u\n", key->tes.as_es); - fprintf(f, " export_prim_id = %u\n", key->tes.export_prim_id); + fprintf(f, " export_prim_id = %u\n", key->tes.epilog.export_prim_id); break; case PIPE_SHADER_GEOMETRY: break; case PIPE_SHADER_FRAGMENT: - fprintf(f, " spi_shader_col_format = 0x%x\n", key->ps.spi_shader_col_format); - fprintf(f, " last_cbuf = %u\n", key->ps.last_cbuf); - fprintf(f, " color_two_side = %u\n", key->ps.color_two_side); - fprintf(f, " alpha_func = %u\n", key->ps.alpha_func); - fprintf(f, " alpha_to_one = %u\n", key->ps.alpha_to_one); - fprintf(f, " poly_stipple = %u\n", key->ps.poly_stipple); - fprintf(f, " clamp_color = %u\n", key->ps.clamp_color); + fprintf(f, " prolog.color_two_side = %u\n", key->ps.prolog.color_two_side); + fprintf(f, " prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple); + fprintf(f, " prolog.force_persample_interp = %u\n", key->ps.prolog.force_persample_interp); + fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format); + fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8); + fprintf(f, " epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf); + fprintf(f, " epilog.alpha_func = %u\n", key->ps.epilog.alpha_func); + fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one); + fprintf(f, " epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing); + fprintf(f, " epilog.clamp_color = %u\n", key->ps.epilog.clamp_color); break; default: @@ -4317,13 +4784,12 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) static void si_init_shader_ctx(struct si_shader_context *ctx, struct si_screen *sscreen, struct si_shader *shader, - LLVMTargetMachineRef tm, - struct tgsi_shader_info *info) + LLVMTargetMachineRef tm) { struct lp_build_tgsi_context *bld_base; memset(ctx, 0, sizeof(*ctx)); - radeon_llvm_context_init(&ctx->radeon_bld); + radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--"); ctx->tm = tm; ctx->screen = sscreen; if (shader && shader->selector) @@ -4336,15 +4802,18 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context); ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context); ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context); - ctx->i128 = LLVMInt128TypeInContext(ctx->radeon_bld.gallivm.context); + ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context); + ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128); ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context); ctx->v16i8 = LLVMVectorType(ctx->i8, 16); + ctx->v2i32 = LLVMVectorType(ctx->i32, 2); ctx->v4i32 = LLVMVectorType(ctx->i32, 4); ctx->v4f32 = LLVMVectorType(ctx->f32, 4); ctx->v8i32 = LLVMVectorType(ctx->i32, 8); bld_base = &ctx->radeon_bld.soa.bld_base; - bld_base->info = info; + if (shader && shader->selector) + bld_base->info = &shader->selector->info; bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action; @@ -4380,40 +4849,31 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32"; } -int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, - struct si_shader *shader, - struct pipe_debug_callback *debug) +int si_compile_tgsi_shader(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + bool is_monolithic, + struct pipe_debug_callback *debug) { struct si_shader_selector *sel = shader->selector; - struct tgsi_token *tokens = sel->tokens; struct si_shader_context ctx; struct lp_build_tgsi_context *bld_base; - struct tgsi_shader_info stipple_shader_info; LLVMModuleRef mod; int r = 0; - bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT && - shader->key.ps.poly_stipple; - - if (poly_stipple) { - tokens = util_pstipple_create_fragment_shader(tokens, NULL, - SI_POLY_STIPPLE_SAMPLER, - TGSI_FILE_SYSTEM_VALUE); - tgsi_scan_shader(tokens, &stipple_shader_info); - } /* Dump TGSI code before doing TGSI->LLVM conversion in case the * conversion fails. */ if (r600_can_dump_shader(&sscreen->b, sel->info.processor) && !(sscreen->b.debug_flags & DBG_NO_TGSI)) { si_dump_shader_key(sel->type, &shader->key, stderr); - tgsi_dump(tokens, 0); + tgsi_dump(sel->tokens, 0); si_dump_streamout(&sel->so); } - si_init_shader_ctx(&ctx, sscreen, shader, tm, - poly_stipple ? &stipple_shader_info : &sel->info); + si_init_shader_ctx(&ctx, sscreen, shader, tm); + ctx.is_monolithic = is_monolithic; - shader->uses_instanceid = sel->info.uses_instanceid; + shader->info.uses_instanceid = sel->info.uses_instanceid; bld_base = &ctx.radeon_bld.soa.bld_base; ctx.radeon_bld.load_system_value = declare_system_value; @@ -4447,7 +4907,10 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, break; case TGSI_PROCESSOR_FRAGMENT: ctx.radeon_bld.load_input = declare_input_fs; - bld_base->emit_epilogue = si_llvm_emit_fs_epilogue; + if (is_monolithic) + bld_base->emit_epilogue = si_llvm_emit_fs_epilogue; + else + bld_base->emit_epilogue = si_llvm_return_fs_outputs; break; default: assert(!"Unsupported shader type"); @@ -4461,6 +4924,14 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, preload_streamout_buffers(&ctx); preload_ring_buffers(&ctx); + if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT && + shader->key.ps.prolog.poly_stipple) { + LLVMValueRef views = LLVMGetParam(ctx.radeon_bld.main_fn, + SI_PARAM_SAMPLERS); + si_llvm_emit_polygon_stipple(&ctx, views, + SI_PARAM_POS_FIXED_PT); + } + if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { int i; for (i = 0; i < 4; i++) { @@ -4470,12 +4941,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, } } - if (!lp_build_tgsi_llvm(bld_base, tokens)) { + if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) { fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n"); goto out; } - LLVMBuildRetVoid(bld_base->base.gallivm->builder); + LLVMBuildRet(bld_base->base.gallivm->builder, ctx.return_value); mod = bld_base->base.gallivm->module; /* Dump LLVM IR before any optimization passes */ @@ -4492,16 +4963,49 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, goto out; } - si_shader_dump(sscreen, shader, debug, ctx.type); + radeon_llvm_dispose(&ctx.radeon_bld); - r = si_shader_binary_upload(sscreen, shader); - if (r) { - fprintf(stderr, "LLVM failed to upload shader\n"); - goto out; + /* Calculate the number of fragment input VGPRs. */ + if (ctx.type == TGSI_PROCESSOR_FRAGMENT) { + shader->info.num_input_vgprs = 0; + shader->info.face_vgpr_index = -1; + + if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 2; + if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 2; + if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 2; + if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 3; + if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 2; + if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 2; + if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 2; + if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; + if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; + if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; + if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; + if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; + if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) { + shader->info.face_vgpr_index = shader->info.num_input_vgprs; + shader->info.num_input_vgprs += 1; + } + if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; + if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; + if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)) + shader->info.num_input_vgprs += 1; } - radeon_llvm_dispose(&ctx.radeon_bld); - if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { shader->gs_copy_shader = CALLOC_STRUCT(si_shader); shader->gs_copy_shader->selector = shader->selector; @@ -4517,11 +5021,968 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, out: for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++) FREE(ctx.constants[i]); - if (poly_stipple) - tgsi_free_tokens(tokens); return r; } +/** + * Create, compile and return a shader part (prolog or epilog). + * + * \param sscreen screen + * \param list list of shader parts of the same category + * \param key shader part key + * \param tm LLVM target machine + * \param debug debug callback + * \param compile the callback responsible for compilation + * \return non-NULL on success + */ +static struct si_shader_part * +si_get_shader_part(struct si_screen *sscreen, + struct si_shader_part **list, + union si_shader_part_key *key, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + bool (*compile)(struct si_screen *, + LLVMTargetMachineRef, + struct pipe_debug_callback *, + struct si_shader_part *)) +{ + struct si_shader_part *result; + + pipe_mutex_lock(sscreen->shader_parts_mutex); + + /* Find existing. */ + for (result = *list; result; result = result->next) { + if (memcmp(&result->key, key, sizeof(*key)) == 0) { + pipe_mutex_unlock(sscreen->shader_parts_mutex); + return result; + } + } + + /* Compile a new one. */ + result = CALLOC_STRUCT(si_shader_part); + result->key = *key; + if (!compile(sscreen, tm, debug, result)) { + FREE(result); + pipe_mutex_unlock(sscreen->shader_parts_mutex); + return NULL; + } + + result->next = *list; + *list = result; + pipe_mutex_unlock(sscreen->shader_parts_mutex); + return result; +} + +/** + * Create a vertex shader prolog. + * + * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values). + * All inputs are returned unmodified. The vertex load indices are + * stored after them, which will used by the API VS for fetching inputs. + * + * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are: + * input_v0, + * input_v1, + * input_v2, + * input_v3, + * (VertexID + BaseVertex), + * (InstanceID + StartInstance), + * (InstanceID / 2 + StartInstance) + */ +static bool si_compile_vs_prolog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + LLVMTypeRef *params, *returns; + LLVMValueRef ret, func; + int last_sgpr, num_params, num_returns, i; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_VERTEX; + ctx.param_vertex_id = key->vs_prolog.num_input_sgprs; + ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3; + + /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ + params = alloca((key->vs_prolog.num_input_sgprs + 4) * + sizeof(LLVMTypeRef)); + returns = alloca((key->vs_prolog.num_input_sgprs + 4 + + key->vs_prolog.last_input + 1) * + sizeof(LLVMTypeRef)); + num_params = 0; + num_returns = 0; + + /* Declare input and output SGPRs. */ + num_params = 0; + for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { + params[num_params++] = ctx.i32; + returns[num_returns++] = ctx.i32; + } + last_sgpr = num_params - 1; + + /* 4 preloaded VGPRs (outputs must be floats) */ + for (i = 0; i < 4; i++) { + params[num_params++] = ctx.i32; + returns[num_returns++] = ctx.f32; + } + + /* Vertex load indices. */ + for (i = 0; i <= key->vs_prolog.last_input; i++) + returns[num_returns++] = ctx.f32; + + /* Create the function. */ + si_create_function(&ctx, returns, num_returns, params, + num_params, -1, last_sgpr); + func = ctx.radeon_bld.main_fn; + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx.return_value; + for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); + } + for (i = num_params - 4; i < num_params; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, ""); + ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); + } + + /* Compute vertex load indices from instance divisors. */ + for (i = 0; i <= key->vs_prolog.last_input; i++) { + unsigned divisor = key->vs_prolog.states.instance_divisors[i]; + LLVMValueRef index; + + if (divisor) { + /* InstanceID / Divisor + StartInstance */ + index = get_instance_index_for_fetch(&ctx.radeon_bld, + SI_SGPR_START_INSTANCE, + divisor); + } else { + /* VertexID + BaseVertex */ + index = LLVMBuildAdd(gallivm->builder, + LLVMGetParam(func, ctx.param_vertex_id), + LLVMGetParam(func, SI_SGPR_BASE_VERTEX), ""); + } + + index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, ""); + ret = LLVMBuildInsertValue(gallivm->builder, ret, index, + num_params++, ""); + } + + /* Compile. */ + LLVMBuildRet(gallivm->builder, ret); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Vertex Shader Prolog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Compile the vertex shader epilog. This is also used by the tessellation + * evaluation shader compiled as VS. + * + * The input is PrimitiveID. + * + * If PrimitiveID is required by the pixel shader, export it. + * Otherwise, do nothing. + */ +static bool si_compile_vs_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base; + LLVMTypeRef params[5]; + int num_params, i; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, NULL, tm); + ctx.type = TGSI_PROCESSOR_VERTEX; + + /* Declare input VGPRs. */ + num_params = key->vs_epilog.states.export_prim_id ? + (VS_EPILOG_PRIMID_LOC + 1) : 0; + assert(num_params <= ARRAY_SIZE(params)); + + for (i = 0; i < num_params; i++) + params[i] = ctx.f32; + + /* Create the function. */ + si_create_function(&ctx, NULL, 0, params, num_params, + -1, -1); + + /* Emit exports. */ + if (key->vs_epilog.states.export_prim_id) { + struct lp_build_context *base = &bld_base->base; + struct lp_build_context *uint = &bld_base->uint_bld; + LLVMValueRef args[9]; + + args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */ + args[1] = uint->zero; /* whether the EXEC mask is valid */ + args[2] = uint->zero; /* DONE bit */ + args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM + + key->vs_epilog.prim_id_param_offset); + args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */ + args[5] = LLVMGetParam(ctx.radeon_bld.main_fn, + VS_EPILOG_PRIMID_LOC); /* X */ + args[6] = uint->undef; /* Y */ + args[7] = uint->undef; /* Z */ + args[8] = uint->undef; /* W */ + + lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", + LLVMVoidTypeInContext(base->gallivm->context), + args, 9, 0); + } + + /* Compile. */ + LLVMBuildRet(gallivm->builder, ctx.return_value); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Vertex Shader Epilog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Create & compile a vertex shader epilog. This a helper used by VS and TES. + */ +static bool si_get_vs_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug, + struct si_vs_epilog_bits *states) +{ + union si_shader_part_key epilog_key; + + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.vs_epilog.states = *states; + + /* Set up the PrimitiveID output. */ + if (shader->key.vs.epilog.export_prim_id) { + unsigned index = shader->selector->info.num_outputs; + unsigned offset = shader->info.nr_param_exports++; + + epilog_key.vs_epilog.prim_id_param_offset = offset; + assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset)); + shader->info.vs_output_param_offset[index] = offset; + } + + shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs, + &epilog_key, tm, debug, + si_compile_vs_epilog); + return shader->epilog != NULL; +} + +/** + * Select and compile (or reuse) vertex shader parts (prolog & epilog). + */ +static bool si_shader_select_vs_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + struct tgsi_shader_info *info = &shader->selector->info; + union si_shader_part_key prolog_key; + unsigned i; + + /* Get the prolog. */ + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.vs_prolog.states = shader->key.vs.prolog; + prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs; + prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1; + + /* The prolog is a no-op if there are no inputs. */ + if (info->num_inputs) { + shader->prolog = + si_get_shader_part(sscreen, &sscreen->vs_prologs, + &prolog_key, tm, debug, + si_compile_vs_prolog); + if (!shader->prolog) + return false; + } + + /* Get the epilog. */ + if (!shader->key.vs.as_es && !shader->key.vs.as_ls && + !si_get_vs_epilog(sscreen, tm, shader, debug, + &shader->key.vs.epilog)) + return false; + + /* Set the instanceID flag. */ + for (i = 0; i < info->num_inputs; i++) + if (prolog_key.vs_prolog.states.instance_divisors[i]) + shader->info.uses_instanceid = true; + + return true; +} + +/** + * Select and compile (or reuse) TES parts (epilog). + */ +static bool si_shader_select_tes_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + if (shader->key.tes.as_es) + return true; + + /* TES compiled as VS. */ + return si_get_vs_epilog(sscreen, tm, shader, debug, + &shader->key.tes.epilog); +} + +/** + * Compile the TCS epilog. This writes tesselation factors to memory based on + * the output primitive type of the tesselator (determined by TES). + */ +static bool si_compile_tcs_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base; + LLVMTypeRef params[16]; + LLVMValueRef func; + int last_array_pointer, last_sgpr, num_params; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_TESS_CTRL; + shader.key.tcs.epilog = key->tcs_epilog.states; + + /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */ + params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS); + last_array_pointer = SI_PARAM_RW_BUFFERS; + params[SI_PARAM_CONST_BUFFERS] = ctx.i64; + params[SI_PARAM_SAMPLERS] = ctx.i64; + params[SI_PARAM_UNUSED] = ctx.i64; + params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32; + params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32; + params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32; + params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32; + last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET; + num_params = last_sgpr + 1; + + params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */ + params[num_params++] = ctx.i32; /* invocation ID within the patch */ + params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */ + + /* Create the function. */ + si_create_function(&ctx, NULL, 0, params, num_params, + last_array_pointer, last_sgpr); + declare_tess_lds(&ctx); + func = ctx.radeon_bld.main_fn; + + si_write_tess_factors(bld_base, + LLVMGetParam(func, last_sgpr + 1), + LLVMGetParam(func, last_sgpr + 2), + LLVMGetParam(func, last_sgpr + 3)); + + /* Compile. */ + LLVMBuildRet(gallivm->builder, ctx.return_value); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Tessellation Control Shader Epilog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Select and compile (or reuse) TCS parts (epilog). + */ +static bool si_shader_select_tcs_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + union si_shader_part_key epilog_key; + + /* Get the epilog. */ + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.tcs_epilog.states = shader->key.tcs.epilog; + + shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, + &epilog_key, tm, debug, + si_compile_tcs_epilog); + return shader->epilog != NULL; +} + +/** + * Compile the pixel shader prolog. This handles: + * - two-side color selection and interpolation + * - overriding interpolation parameters for the API PS + * - polygon stippling + * + * All preloaded SGPRs and VGPRs are passed through unmodified unless they are + * overriden by other states. (e.g. per-sample interpolation) + * Interpolated colors are stored after the preloaded VGPRs. + */ +static bool si_compile_ps_prolog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + LLVMTypeRef *params; + LLVMValueRef ret, func; + int last_sgpr, num_params, num_returns, i, num_color_channels; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_FRAGMENT; + shader.key.ps.prolog = key->ps_prolog.states; + + /* Number of inputs + 8 color elements. */ + params = alloca((key->ps_prolog.num_input_sgprs + + key->ps_prolog.num_input_vgprs + 8) * + sizeof(LLVMTypeRef)); + + /* Declare inputs. */ + num_params = 0; + for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) + params[num_params++] = ctx.i32; + last_sgpr = num_params - 1; + + for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) + params[num_params++] = ctx.f32; + + /* Declare outputs (same as inputs + add colors if needed) */ + num_returns = num_params; + num_color_channels = util_bitcount(key->ps_prolog.colors_read); + for (i = 0; i < num_color_channels; i++) + params[num_returns++] = ctx.f32; + + /* Create the function. */ + si_create_function(&ctx, params, num_returns, params, + num_params, -1, last_sgpr); + func = ctx.radeon_bld.main_fn; + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx.return_value; + for (i = 0; i < num_params; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); + } + + /* Polygon stippling. */ + if (key->ps_prolog.states.poly_stipple) { + /* POS_FIXED_PT is always last. */ + unsigned pos = key->ps_prolog.num_input_sgprs + + key->ps_prolog.num_input_vgprs - 1; + LLVMValueRef ptr[2], views; + + /* Get the pointer to sampler views. */ + ptr[0] = LLVMGetParam(func, SI_SGPR_SAMPLERS); + ptr[1] = LLVMGetParam(func, SI_SGPR_SAMPLERS+1); + views = lp_build_gather_values(gallivm, ptr, 2); + views = LLVMBuildBitCast(gallivm->builder, views, ctx.i64, ""); + views = LLVMBuildIntToPtr(gallivm->builder, views, + const_array(ctx.v8i32, SI_NUM_SAMPLERS), ""); + + si_llvm_emit_polygon_stipple(&ctx, views, pos); + } + + /* Interpolate colors. */ + for (i = 0; i < 2; i++) { + unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; + unsigned face_vgpr = key->ps_prolog.num_input_sgprs + + key->ps_prolog.face_vgpr_index; + LLVMValueRef interp[2], color[4]; + LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL; + + if (!writemask) + continue; + + /* If the interpolation qualifier is not CONSTANT (-1). */ + if (key->ps_prolog.color_interp_vgpr_index[i] != -1) { + unsigned interp_vgpr = key->ps_prolog.num_input_sgprs + + key->ps_prolog.color_interp_vgpr_index[i]; + + interp[0] = LLVMGetParam(func, interp_vgpr); + interp[1] = LLVMGetParam(func, interp_vgpr + 1); + interp_ij = lp_build_gather_values(gallivm, interp, 2); + interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij, + ctx.v2i32, ""); + } + + /* Use the absolute location of the input. */ + prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); + + if (key->ps_prolog.states.color_two_side) { + face = LLVMGetParam(func, face_vgpr); + face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, ""); + } + + interp_fs_input(&ctx, + key->ps_prolog.color_attr_index[i], + TGSI_SEMANTIC_COLOR, i, + key->ps_prolog.num_interp_inputs, + key->ps_prolog.colors_read, interp_ij, + prim_mask, face, color); + + while (writemask) { + unsigned chan = u_bit_scan(&writemask); + ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan], + num_params++, ""); + } + } + + /* Force per-sample interpolation. */ + if (key->ps_prolog.states.force_persample_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef persp_sample[2], linear_sample[2]; + + /* Read PERSP_SAMPLE. */ + for (i = 0; i < 2; i++) + persp_sample[i] = LLVMGetParam(func, base + i); + /* Overwrite PERSP_CENTER. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + persp_sample[i], base + 2 + i, ""); + /* Overwrite PERSP_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + persp_sample[i], base + 4 + i, ""); + /* Read LINEAR_SAMPLE. */ + for (i = 0; i < 2; i++) + linear_sample[i] = LLVMGetParam(func, base + 6 + i); + /* Overwrite LINEAR_CENTER. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + linear_sample[i], base + 8 + i, ""); + /* Overwrite LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + linear_sample[i], base + 10 + i, ""); + } + + /* Compile. */ + LLVMBuildRet(gallivm->builder, ret); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Fragment Shader Prolog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Compile the pixel shader epilog. This handles everything that must be + * emulated for pixel shader exports. (alpha-test, format conversions, etc) + */ +static bool si_compile_ps_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base; + LLVMTypeRef params[16+8*4+3]; + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + int last_array_pointer, last_sgpr, num_params, i; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_FRAGMENT; + shader.key.ps.epilog = key->ps_epilog.states; + + /* Declare input SGPRs. */ + params[SI_PARAM_RW_BUFFERS] = ctx.i64; + params[SI_PARAM_CONST_BUFFERS] = ctx.i64; + params[SI_PARAM_SAMPLERS] = ctx.i64; + params[SI_PARAM_UNUSED] = ctx.i64; + params[SI_PARAM_ALPHA_REF] = ctx.f32; + last_array_pointer = -1; + last_sgpr = SI_PARAM_ALPHA_REF; + + /* Declare input VGPRs. */ + num_params = (last_sgpr + 1) + + util_bitcount(key->ps_epilog.colors_written) * 4 + + key->ps_epilog.writes_z + + key->ps_epilog.writes_stencil + + key->ps_epilog.writes_samplemask; + + num_params = MAX2(num_params, + last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + assert(num_params <= ARRAY_SIZE(params)); + + for (i = last_sgpr + 1; i < num_params; i++) + params[i] = ctx.f32; + + /* Create the function. */ + si_create_function(&ctx, NULL, 0, params, num_params, + last_array_pointer, last_sgpr); + /* Disable elimination of unused inputs. */ + radeon_llvm_add_attribute(ctx.radeon_bld.main_fn, + "InitialPSInputAddr", 0xffffff); + + /* Process colors. */ + unsigned vgpr = last_sgpr + 1; + unsigned colors_written = key->ps_epilog.colors_written; + int last_color_export = -1; + + /* Find the last color export. */ + if (!key->ps_epilog.writes_z && + !key->ps_epilog.writes_stencil && + !key->ps_epilog.writes_samplemask) { + unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; + + /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ + if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { + /* Just set this if any of the colorbuffers are enabled. */ + if (spi_format & + ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) + last_color_export = 0; + } else { + for (i = 0; i < 8; i++) + if (colors_written & (1 << i) && + (spi_format >> (i * 4)) & 0xf) + last_color_export = i; + } + } + + while (colors_written) { + LLVMValueRef color[4]; + int mrt = u_bit_scan(&colors_written); + + for (i = 0; i < 4; i++) + color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + + si_export_mrt_color(bld_base, color, mrt, + num_params - 1, + mrt == last_color_export); + } + + /* Process depth, stencil, samplemask. */ + if (key->ps_epilog.writes_z) + depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + if (key->ps_epilog.writes_stencil) + stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + if (key->ps_epilog.writes_samplemask) + samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + + if (depth || stencil || samplemask) + si_export_mrt_z(bld_base, depth, stencil, samplemask); + else if (last_color_export == -1) + si_export_null(bld_base); + + /* Compile. */ + LLVMBuildRetVoid(gallivm->builder); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Fragment Shader Epilog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Select and compile (or reuse) pixel shader parts (prolog & epilog). + */ +static bool si_shader_select_ps_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + struct tgsi_shader_info *info = &shader->selector->info; + union si_shader_part_key prolog_key; + union si_shader_part_key epilog_key; + unsigned i; + + /* Get the prolog. */ + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.ps_prolog.states = shader->key.ps.prolog; + prolog_key.ps_prolog.colors_read = info->colors_read; + prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; + prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; + + if (info->colors_read) { + unsigned *color = shader->selector->color_attr_index; + + if (shader->key.ps.prolog.color_two_side) { + /* BCOLORs are stored after the last input. */ + prolog_key.ps_prolog.num_interp_inputs = info->num_inputs; + prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; + shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); + } + + for (i = 0; i < 2; i++) { + unsigned location = info->input_interpolate_loc[color[i]]; + + if (!(info->colors_read & (0xf << i*4))) + continue; + + prolog_key.ps_prolog.color_attr_index[i] = color[i]; + + /* Force per-sample interpolation for the colors here. */ + if (shader->key.ps.prolog.force_persample_interp) + location = TGSI_INTERPOLATE_LOC_SAMPLE; + + switch (info->input_interpolate[color[i]]) { + case TGSI_INTERPOLATE_CONSTANT: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1; + break; + case TGSI_INTERPOLATE_PERSPECTIVE: + case TGSI_INTERPOLATE_COLOR: + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_SAMPLE_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTER: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_CENTER_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_CENTROID_ENA(1); + break; + default: + assert(0); + } + break; + case TGSI_INTERPOLATE_LINEAR: + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_SAMPLE_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTER: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_CENTER_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_CENTROID_ENA(1); + break; + default: + assert(0); + } + break; + default: + assert(0); + } + } + } + + /* The prolog is a no-op if these aren't set. */ + if (prolog_key.ps_prolog.colors_read || + prolog_key.ps_prolog.states.force_persample_interp || + prolog_key.ps_prolog.states.poly_stipple) { + shader->prolog = + si_get_shader_part(sscreen, &sscreen->ps_prologs, + &prolog_key, tm, debug, + si_compile_ps_prolog); + if (!shader->prolog) + return false; + } + + /* Get the epilog. */ + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.ps_epilog.colors_written = info->colors_written; + epilog_key.ps_epilog.writes_z = info->writes_z; + epilog_key.ps_epilog.writes_stencil = info->writes_stencil; + epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask; + epilog_key.ps_epilog.states = shader->key.ps.epilog; + + shader->epilog = + si_get_shader_part(sscreen, &sscreen->ps_epilogs, + &epilog_key, tm, debug, + si_compile_ps_epilog); + if (!shader->epilog) + return false; + + /* Enable POS_FIXED_PT if polygon stippling is enabled. */ + if (shader->key.ps.prolog.poly_stipple) { + shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); + assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); + } + + /* Set up the enable bits for per-sample shading if needed. */ + if (shader->key.ps.prolog.force_persample_interp) { + if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena)) { + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); + } + if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena)) { + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); + } + } + + /* POW_W_FLOAT requires that one of the perspective weights is enabled. */ + if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && + !(shader->config.spi_ps_input_ena & 0xf)) { + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); + assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); + } + + /* At least one pair of interpolation weights must be enabled. */ + if (!(shader->config.spi_ps_input_ena & 0x7f)) { + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); + assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); + } + + /* The sample mask input is always enabled, because the API shader always + * passes it through to the epilog. Disable it here if it's unused. + */ + if (!shader->key.ps.epilog.poly_line_smoothing && + !shader->selector->info.reads_samplemask) + shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; + + return true; +} + +int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + struct si_shader *mainp = shader->selector->main_shader_part; + int r; + + /* LS and ES are always compiled on demand. */ + if (!mainp || + (shader->selector->type == PIPE_SHADER_VERTEX && + (shader->key.vs.as_es || shader->key.vs.as_ls)) || + (shader->selector->type == PIPE_SHADER_TESS_EVAL && + shader->key.tes.as_es)) { + /* Monolithic shader (compiled as a whole, has many variants, + * may take a long time to compile). + */ + r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug); + if (r) + return r; + } else { + /* The shader consists of 2-3 parts: + * + * - the middle part is the user shader, it has 1 variant only + * and it was compiled during the creation of the shader + * selector + * - the prolog part is inserted at the beginning + * - the epilog part is inserted at the end + * + * The prolog and epilog have many (but simple) variants. + */ + + /* Copy the compiled TGSI shader data over. */ + shader->is_binary_shared = true; + shader->binary = mainp->binary; + shader->config = mainp->config; + shader->info.num_input_sgprs = mainp->info.num_input_sgprs; + shader->info.num_input_vgprs = mainp->info.num_input_vgprs; + shader->info.face_vgpr_index = mainp->info.face_vgpr_index; + memcpy(shader->info.vs_output_param_offset, + mainp->info.vs_output_param_offset, + sizeof(mainp->info.vs_output_param_offset)); + shader->info.uses_instanceid = mainp->info.uses_instanceid; + shader->info.nr_pos_exports = mainp->info.nr_pos_exports; + shader->info.nr_param_exports = mainp->info.nr_param_exports; + + /* Select prologs and/or epilogs. */ + switch (shader->selector->type) { + case PIPE_SHADER_VERTEX: + if (!si_shader_select_vs_parts(sscreen, tm, shader, debug)) + return -1; + break; + case PIPE_SHADER_TESS_CTRL: + if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug)) + return -1; + break; + case PIPE_SHADER_TESS_EVAL: + if (!si_shader_select_tes_parts(sscreen, tm, shader, debug)) + return -1; + break; + case PIPE_SHADER_FRAGMENT: + if (!si_shader_select_ps_parts(sscreen, tm, shader, debug)) + return -1; + + /* Make sure we have at least as many VGPRs as there + * are allocated inputs. + */ + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, + shader->info.num_input_vgprs); + break; + } + + /* Update SGPR and VGPR counts. */ + if (shader->prolog) { + shader->config.num_sgprs = MAX2(shader->config.num_sgprs, + shader->prolog->config.num_sgprs); + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, + shader->prolog->config.num_vgprs); + } + if (shader->epilog) { + shader->config.num_sgprs = MAX2(shader->config.num_sgprs, + shader->epilog->config.num_sgprs); + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, + shader->epilog->config.num_vgprs); + } + } + + si_shader_dump(sscreen, shader, debug, shader->selector->info.processor); + + /* Upload. */ + r = si_shader_binary_upload(sscreen, shader); + if (r) { + fprintf(stderr, "LLVM failed to upload shader\n"); + return r; + } + + return 0; +} + void si_shader_destroy(struct si_shader *shader) { if (shader->gs_copy_shader) { @@ -4534,5 +5995,6 @@ void si_shader_destroy(struct si_shader *shader) r600_resource_reference(&shader->bo, NULL); - radeon_shader_binary_clean(&shader->binary); + if (!shader->is_binary_shared) + radeon_shader_binary_clean(&shader->binary); } |