8 files changed, 2152 insertions, 266 deletions
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 7370a113d3d..9f5f4c682bc 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -196,9 +196,7 @@ static unsigned compute_num_waves_for_scratch(
 }
 
 static void si_launch_grid(
-		struct pipe_context *ctx,
-		const uint *block_layout, const uint *grid_layout,
-		uint32_t pc, const void *input)
+		struct pipe_context *ctx, const struct pipe_grid_info *info)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
@@ -232,7 +230,7 @@ static void si_launch_grid(
 	pm4->compute_pkt = true;
 
 	/* Read the config information */
-	si_shader_binary_read_config(&shader->binary, &shader->config, pc);
+	si_shader_binary_read_config(&shader->binary, &shader->config, info->pc);
 
 	/* Upload the kernel arguments */
 
@@ -242,15 +240,16 @@ static void si_launch_grid(
 	kernel_args = sctx->b.ws->buffer_map(input_buffer->buf,
 			sctx->b.gfx.cs, PIPE_TRANSFER_WRITE);
 	for (i = 0; i < 3; i++) {
-		kernel_args[i] = grid_layout[i];
-		kernel_args[i + 3] = grid_layout[i] * block_layout[i];
-		kernel_args[i + 6] = block_layout[i];
+		kernel_args[i] = info->grid[i];
+		kernel_args[i + 3] = info->grid[i] * info->block[i];
+		kernel_args[i + 6] = info->block[i];
 	}
 
 	num_waves_for_scratch =	compute_num_waves_for_scratch(
-		&sctx->screen->b.info, block_layout, grid_layout);
+		&sctx->screen->b.info, info->block, info->grid);
 
-	memcpy(kernel_args + (num_work_size_bytes / 4), input, program->input_size);
+	memcpy(kernel_args + (num_work_size_bytes / 4), info->input,
+          program->input_size);
 
 	if (shader->config.scratch_bytes_per_wave > 0) {
 
@@ -291,11 +290,11 @@ static void si_launch_grid(
 	si_pm4_set_reg(pm4, R_00B818_COMPUTE_START_Z, 0);
 
 	si_pm4_set_reg(pm4, R_00B81C_COMPUTE_NUM_THREAD_X,
-				S_00B81C_NUM_THREAD_FULL(block_layout[0]));
+				S_00B81C_NUM_THREAD_FULL(info->block[0]));
 	si_pm4_set_reg(pm4, R_00B820_COMPUTE_NUM_THREAD_Y,
-				S_00B820_NUM_THREAD_FULL(block_layout[1]));
+				S_00B820_NUM_THREAD_FULL(info->block[1]));
 	si_pm4_set_reg(pm4, R_00B824_COMPUTE_NUM_THREAD_Z,
-				S_00B824_NUM_THREAD_FULL(block_layout[2]));
+				S_00B824_NUM_THREAD_FULL(info->block[2]));
 
 	/* Global buffers */
 	for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) {
@@ -323,7 +322,7 @@ static void si_launch_grid(
 	}
 
 	shader_va = shader->bo->gpu_address;
-	shader_va += pc;
+	shader_va += info->pc;
 
 	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
 				  RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
@@ -375,9 +374,9 @@ static void si_launch_grid(
 		;
 
 	si_pm4_cmd_begin(pm4, PKT3_DISPATCH_DIRECT);
-	si_pm4_cmd_add(pm4, grid_layout[0]); /* Thread groups DIM_X */
-	si_pm4_cmd_add(pm4, grid_layout[1]); /* Thread groups DIM_Y */
-	si_pm4_cmd_add(pm4, grid_layout[2]); /* Thread gropus DIM_Z */
+	si_pm4_cmd_add(pm4, info->grid[0]); /* Thread groups DIM_X */
+	si_pm4_cmd_add(pm4, info->grid[1]); /* Thread groups DIM_Y */
+	si_pm4_cmd_add(pm4, info->grid[2]); /* Thread gropus DIM_Z */
 	si_pm4_cmd_add(pm4, 1); /* DISPATCH_INITIATOR */
         si_pm4_cmd_end(pm4, false);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index e9d69d2db38..37fd4a25d59 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -22,6 +22,7 @@
  */
 
 #include "si_pipe.h"
+#include "si_shader.h"
 #include "si_public.h"
 #include "sid.h"
 
@@ -448,6 +449,10 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 		switch (param) {
 		case PIPE_SHADER_CAP_PREFERRED_IR:
 			return PIPE_SHADER_IR_NATIVE;
+
+		case PIPE_SHADER_CAP_SUPPORTED_IRS:
+			return 0;
+
 		case PIPE_SHADER_CAP_DOUBLES:
 			return HAVE_LLVM >= 0x0307;
 
@@ -511,6 +516,8 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 		return 16;
 	case PIPE_SHADER_CAP_PREFERRED_IR:
 		return PIPE_SHADER_IR_TGSI;
+	case PIPE_SHADER_CAP_SUPPORTED_IRS:
+		return 0;
 	case PIPE_SHADER_CAP_DOUBLES:
 		return HAVE_LLVM >= 0x0307;
 	case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
@@ -522,6 +529,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
 	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
 		return 0;
 	}
 	return 0;
@@ -530,6 +538,14 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 static void si_destroy_screen(struct pipe_screen* pscreen)
 {
 	struct si_screen *sscreen = (struct si_screen *)pscreen;
+	struct si_shader_part *parts[] = { /* list heads; NULL-safe so the !sscreen guard below is not preceded by a dereference */
+		sscreen ? sscreen->vs_prologs : NULL,
+		sscreen ? sscreen->vs_epilogs : NULL,
+		sscreen ? sscreen->tcs_epilogs : NULL,
+		sscreen ? sscreen->ps_prologs : NULL,
+		sscreen ? sscreen->ps_epilogs : NULL
+	};
+	unsigned i;
 
 	if (!sscreen)
 		return;
@@ -537,6 +553,18 @@ static void si_destroy_screen(struct pipe_screen* pscreen)
 	if (!sscreen->b.ws->unref(sscreen->b.ws))
 		return;
 
+	/* Free shader parts: walk each list via ->next, clean the part's binary, then free the node. */
+	for (i = 0; i < ARRAY_SIZE(parts); i++) {
+		while (parts[i]) {
+			struct si_shader_part *part = parts[i];
+
+			parts[i] = part->next;
+			radeon_shader_binary_clean(&part->binary);
+			FREE(part);
+		}
+	}
+	pipe_mutex_destroy(sscreen->shader_parts_mutex);
+	si_destroy_shader_cache(sscreen);
 	r600_destroy_common_screen(&sscreen->b);
 }
 
@@ -584,7 +612,8 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
 	sscreen->b.b.resource_create = r600_resource_create_common;
 
 	if (!r600_common_screen_init(&sscreen->b, ws) ||
-	    !si_init_gs_info(sscreen)) {
+	    !si_init_gs_info(sscreen) ||
+	    !si_init_shader_cache(sscreen)) {
 		FREE(sscreen);
 		return NULL;
 	}
@@ -594,6 +623,10 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
 
 	sscreen->b.has_cp_dma = true;
 	sscreen->b.has_streamout = true;
+	pipe_mutex_init(sscreen->shader_parts_mutex);
+	sscreen->use_monolithic_shaders =
+		HAVE_LLVM < 0x0308 ||
+		(sscreen->b.debug_flags & DBG_MONOLITHIC_SHADERS) != 0;
 
 	if (debug_get_bool_option("RADEON_DUMP_SHADERS", FALSE))
 		sscreen->b.debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | DBG_CS;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index b5790d6b564..ef860a58b83 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -80,10 +80,36 @@
 #define SI_MAX_BORDER_COLORS	4096
 
 struct si_compute;
+struct hash_table;
 
 struct si_screen {
 	struct r600_common_screen	b;
 	unsigned			gs_table_depth;
+
+	/* Whether shaders are monolithic (1-part) or separate (3-part). Set at screen creation: true when HAVE_LLVM < 0x0308 or DBG_MONOLITHIC_SHADERS is set. */
+	bool				use_monolithic_shaders;
+
+	pipe_mutex			shader_parts_mutex; /* NOTE(review): presumably guards the five part lists below - confirm */
+	struct si_shader_part		*vs_prologs; /* each of these is the head of a list chained via ->next ... */
+	struct si_shader_part		*vs_epilogs;
+	struct si_shader_part		*tcs_epilogs;
+	struct si_shader_part		*ps_prologs;
+	struct si_shader_part		*ps_epilogs; /* ... and every node is freed in si_destroy_screen */
+
+	/* Shader cache in memory.
+	 *
+	 * Design & limitations:
+	 * - The shader cache is per screen (= per process), never saved to
+	 *   disk, and skips redundant shader compilations from TGSI to bytecode.
+	 * - It can only be used with one-variant-per-shader support, in which
+	 *   case only the main (typically middle) part of shaders is cached.
+	 * - Only VS, TCS, TES, PS are cached, out of which only the hw VS
+	 *   variants of VS and TES are cached, so LS and ES aren't.
+	 * - GS and CS aren't cached, but it's certainly possible to cache
+	 *   those as well.
+	 */
+	pipe_mutex			shader_cache_mutex; /* NOTE(review): presumably serializes access to shader_cache - confirm */
+	struct hash_table		*shader_cache; /* NOTE(review): presumably set up by si_init_shader_cache and torn down by si_destroy_shader_cache - confirm */
 };
 
 struct si_blend_color {
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index baa1090e2fb..57458ae1381 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -70,6 +70,12 @@ struct si_shader_context
 
 	unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
 	bool is_gs_copy_shader;
+
+	/* Whether to generate the optimized shader variant compiled as a whole
+	 * (without a prolog and epilog)
+	 */
+	bool is_monolithic;
+
 	int param_streamout_config;
 	int param_streamout_write_index;
 	int param_streamout_offset[4];
@@ -77,6 +83,7 @@ struct si_shader_context
 	int param_rel_auto_id;
 	int param_vs_prim_id;
 	int param_instance_id;
+	int param_vertex_index0;
 	int param_tes_u;
 	int param_tes_v;
 	int param_tes_rel_patch_id;
@@ -96,14 +103,17 @@ struct si_shader_context
 	LLVMValueRef esgs_ring;
 	LLVMValueRef gsvs_ring[4];
 	LLVMValueRef gs_next_vertex[4];
+	LLVMValueRef return_value;
 
 	LLVMTypeRef voidt;
 	LLVMTypeRef i1;
 	LLVMTypeRef i8;
 	LLVMTypeRef i32;
+	LLVMTypeRef i64;
 	LLVMTypeRef i128;
 	LLVMTypeRef f32;
 	LLVMTypeRef v16i8;
+	LLVMTypeRef v2i32;
 	LLVMTypeRef v4i32;
 	LLVMTypeRef v4f32;
 	LLVMTypeRef v8i32;
@@ -118,9 +128,17 @@ static struct si_shader_context *si_shader_context(
 static void si_init_shader_ctx(struct si_shader_context *ctx,
 			       struct si_screen *sscreen,
 			       struct si_shader *shader,
-			       LLVMTargetMachineRef tm,
-			       struct tgsi_shader_info *info);
+			       LLVMTargetMachineRef tm);
 
+/* Ideally pass the sample mask input to the PS epilog as v13, which
+ * is its usual location, so that the shader doesn't have to add v_mov.
+ */
+#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
+
+/* The VS location of the PrimitiveID input is the same in the epilog,
+ * so that the main shader part doesn't have to move it.
+ */
+#define VS_EPILOG_PRIMID_LOC 2
 
 #define PERSPECTIVE_BASE 0
 #define LINEAR_BASE 9
@@ -196,6 +214,10 @@ static LLVMValueRef unpack_param(struct si_shader_context *ctx,
 	LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
 					  param);
 
+	if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
+		value = bitcast(&ctx->radeon_bld.soa.bld_base,
+				TGSI_TYPE_UNSIGNED, value);
+
 	if (rshift)
 		value = LLVMBuildLShr(gallivm->builder, value,
 				      lp_build_const_int32(gallivm, rshift), "");
@@ -375,7 +397,7 @@ static LLVMValueRef build_indexed_load_const(
 
 static LLVMValueRef get_instance_index_for_fetch(
 	struct radeon_llvm_context *radeon_bld,
-	unsigned divisor)
+	unsigned param_start_instance, unsigned divisor)
 {
 	struct si_shader_context *ctx =
 		si_shader_context(&radeon_bld->soa.bld_base);
@@ -389,8 +411,8 @@ static LLVMValueRef get_instance_index_for_fetch(
 		result = LLVMBuildUDiv(gallivm->builder, result,
 				lp_build_const_int32(gallivm, divisor), "");
 
-	return LLVMBuildAdd(gallivm->builder, result, LLVMGetParam(
-			radeon_bld->main_fn, SI_PARAM_START_INSTANCE), "");
+	return LLVMBuildAdd(gallivm->builder, result,
+			    LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
 }
 
 static void declare_input_vs(
@@ -402,7 +424,8 @@ static void declare_input_vs(
 	struct gallivm_state *gallivm = base->gallivm;
 	struct si_shader_context *ctx =
 		si_shader_context(&radeon_bld->soa.bld_base);
-	unsigned divisor = ctx->shader->key.vs.instance_divisors[input_index];
+	unsigned divisor =
+		ctx->shader->key.vs.prolog.instance_divisors[input_index];
 
 	unsigned chan;
 
@@ -424,10 +447,16 @@ static void declare_input_vs(
 	/* Build the attribute offset */
 	attribute_offset = lp_build_const_int32(gallivm, 0);
 
-	if (divisor) {
+	if (!ctx->is_monolithic) {
+		buffer_index = LLVMGetParam(radeon_bld->main_fn,
+					    ctx->param_vertex_index0 +
+					    input_index);
+	} else if (divisor) {
 		/* Build index from instance ID, start instance and divisor */
-		ctx->shader->uses_instanceid = true;
-		buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld, divisor);
+		ctx->shader->info.uses_instanceid = true;
+		buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
+							    SI_PARAM_START_INSTANCE,
+							    divisor);
 	} else {
 		/* Load the buffer index for vertices. */
 		LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
@@ -853,7 +882,8 @@ static int lookup_interp_param_index(unsigned interpolate, unsigned location)
 static unsigned select_interp_param(struct si_shader_context *ctx,
 				    unsigned param)
 {
-	if (!ctx->shader->key.ps.force_persample_interp)
+	if (!ctx->shader->key.ps.prolog.force_persample_interp ||
+	    !ctx->is_monolithic)
 		return param;
 
 	/* If the shader doesn't use center/centroid, just return the parameter.
@@ -923,7 +953,7 @@ static void interp_fs_input(struct si_shader_context *ctx,
 	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
 
 	if (semantic_name == TGSI_SEMANTIC_COLOR &&
-	    ctx->shader->key.ps.color_two_side) {
+	    ctx->shader->key.ps.prolog.color_two_side) {
 		LLVMValueRef args[4];
 		LLVMValueRef is_face_positive;
 		LLVMValueRef back_attr_number;
@@ -997,6 +1027,7 @@ static void declare_input_fs(
 	unsigned input_index,
 	const struct tgsi_full_declaration *decl)
 {
+	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
 	struct si_shader_context *ctx =
 		si_shader_context(&radeon_bld->soa.bld_base);
 	struct si_shader *shader = ctx->shader;
@@ -1004,6 +1035,26 @@ static void declare_input_fs(
 	LLVMValueRef interp_param = NULL;
 	int interp_param_idx;
 
+	/* Get colors from input VGPRs (set by the prolog). */
+	if (!ctx->is_monolithic &&
+	    decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
+		unsigned i = decl->Semantic.Index;
+		unsigned colors_read = shader->selector->info.colors_read;
+		unsigned mask = colors_read >> (i * 4);
+		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
+				  (i ? util_bitcount(colors_read & 0xf) : 0);
+
+		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
+			mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
+		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
+			mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
+		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
+			mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
+		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
+			mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
+		return;
+	}
+
 	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
 						     decl->Interp.Location);
 	if (interp_param_idx == -1)
@@ -1330,12 +1381,12 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 
 	if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
 		const union si_shader_key *key = &ctx->shader->key;
-		unsigned col_formats = key->ps.spi_shader_col_format;
+		unsigned col_formats = key->ps.epilog.spi_shader_col_format;
 		int cbuf = target - V_008DFC_SQ_EXP_MRT;
 
 		assert(cbuf >= 0 && cbuf < 8);
 		spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
-		is_int8 = (key->ps.color_is_int8 >> cbuf) & 0x1;
+		is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
 	}
 
 	args[4] = uint->zero; /* COMPR flag */
@@ -1488,13 +1539,13 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 
-	if (ctx->shader->key.ps.alpha_func != PIPE_FUNC_NEVER) {
+	if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
 		LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
 				SI_PARAM_ALPHA_REF);
 
 		LLVMValueRef alpha_pass =
 			lp_build_cmp(&bld_base->base,
-				     ctx->shader->key.ps.alpha_func,
+				     ctx->shader->key.ps.epilog.alpha_func,
 				     alpha, alpha_ref);
 		LLVMValueRef arg =
 			lp_build_select(&bld_base->base,
@@ -1511,7 +1562,8 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 }
 
 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
-						  LLVMValueRef alpha)
+						  LLVMValueRef alpha,
+						  unsigned samplemask_param)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
@@ -1519,7 +1571,7 @@ static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *
 
 	/* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
 	coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
-				SI_PARAM_SAMPLE_COVERAGE);
+				samplemask_param);
 	coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
 
 	coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
@@ -1841,7 +1893,8 @@ handle_semantic:
 		case TGSI_SEMANTIC_COLOR:
 		case TGSI_SEMANTIC_BCOLOR:
 			target = V_008DFC_SQ_EXP_PARAM + param_count;
-			shader->vs_output_param_offset[i] = param_count;
+			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
+			shader->info.vs_output_param_offset[i] = param_count;
 			param_count++;
 			break;
 		case TGSI_SEMANTIC_CLIPDIST:
@@ -1855,7 +1908,8 @@ handle_semantic:
 		case TGSI_SEMANTIC_TEXCOORD:
 		case TGSI_SEMANTIC_GENERIC:
 			target = V_008DFC_SQ_EXP_PARAM + param_count;
-			shader->vs_output_param_offset[i] = param_count;
+			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
+			shader->info.vs_output_param_offset[i] = param_count;
 			param_count++;
 			break;
 		default:
@@ -1883,7 +1937,7 @@ handle_semantic:
 		}
 	}
 
-	shader->nr_param_exports = param_count;
+	shader->info.nr_param_exports = param_count;
 
 	/* We need to add the position output manually if it's missing. */
 	if (!pos_args[0][0]) {
@@ -1945,7 +1999,7 @@ handle_semantic:
 
 	for (i = 0; i < 4; i++)
 		if (pos_args[i][0])
-			shader->nr_pos_exports++;
+			shader->info.nr_pos_exports++;
 
 	pos_idx = 0;
 	for (i = 0; i < 4; i++) {
@@ -1955,7 +2009,7 @@ handle_semantic:
 		/* Specify the target we are exporting */
 		pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);
 
-		if (pos_idx == shader->nr_pos_exports)
+		if (pos_idx == shader->info.nr_pos_exports)
 			/* Specify that this is the last export */
 			pos_args[i][2] = uint->one;
 
@@ -1989,7 +2043,7 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
 				  invocation_id, bld_base->uint_bld.zero, ""));
 
 	/* Determine the layout of one tess factor element in the buffer. */
-	switch (shader->key.tcs.prim_mode) {
+	switch (shader->key.tcs.epilog.prim_mode) {
 	case PIPE_PRIM_LINES:
 		stride = 2; /* 2 dwords, 1 vec2 store */
 		outer_comps = 2;
@@ -2061,14 +2115,51 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
-	LLVMValueRef invocation_id;
+	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
 
+	rel_patch_id = get_rel_patch_id(ctx);
 	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
+	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
 
-	si_write_tess_factors(bld_base,
-			      get_rel_patch_id(ctx),
-			      invocation_id,
-			      get_tcs_out_current_patch_data_offset(ctx));
+	if (!ctx->is_monolithic) {
+		/* Non-monolithic: tess factors are not written here; return the epilog parameters via ctx->return_value instead. */
+		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+		LLVMValueRef ret = ctx->return_value;
+		LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
+		unsigned vgpr;
+
+		/* RW_BUFFERS pointer: converted to i64, split into two i32 halves, stored in return slots 0 and 1 */
+		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
+					  SI_PARAM_RW_BUFFERS);
+		rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
+		rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
+		rw0 = LLVMBuildExtractElement(builder, rw_buffers,
+					      bld_base->uint_bld.zero, "");
+		rw1 = LLVMBuildExtractElement(builder, rw_buffers,
+					      bld_base->uint_bld.one, "");
+		ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
+		ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
+
+		/* Tess factor buffer soffset is after user SGPRs (return slot SI_TCS_NUM_USER_SGPR). */
+		tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
+					  SI_PARAM_TESS_FACTOR_OFFSET);
+		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
+					   SI_TCS_NUM_USER_SGPR, "");
+
+		/* VGPRs: bitcast to float and stored in the three return slots that follow the soffset */
+		rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
+		invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
+		tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
+
+		vgpr = SI_TCS_NUM_USER_SGPR + 1;
+		ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
+		ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
+		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
+		ctx->return_value = ret;
+		return;
+	}
+
+	si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
 }
 
 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
@@ -2214,16 +2305,26 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
 					      "");
 	}
 
-	/* Export PrimitiveID when PS needs it. */
-	if (si_vs_exports_prim_id(ctx->shader)) {
-		outputs[i].name = TGSI_SEMANTIC_PRIMID;
-		outputs[i].sid = 0;
-		outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
-					       get_primitive_id(bld_base, 0));
-		outputs[i].values[1] = bld_base->base.undef;
-		outputs[i].values[2] = bld_base->base.undef;
-		outputs[i].values[3] = bld_base->base.undef;
-		i++;
+	if (ctx->is_monolithic) {
+		/* Export PrimitiveID when PS needs it. */
+		if (si_vs_exports_prim_id(ctx->shader)) {
+			outputs[i].name = TGSI_SEMANTIC_PRIMID;
+			outputs[i].sid = 0;
+			outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+						       get_primitive_id(bld_base, 0));
+			outputs[i].values[1] = bld_base->base.undef;
+			outputs[i].values[2] = bld_base->base.undef;
+			outputs[i].values[3] = bld_base->base.undef;
+			i++;
+		}
+	} else {
+		/* Return the primitive ID from the LLVM function. */
+		ctx->return_value =
+			LLVMBuildInsertValue(gallivm->builder,
+					     ctx->return_value,
+					     bitcast(bld_base, TGSI_TYPE_FLOAT,
+						     get_primitive_id(bld_base, 0)),
+					     VS_EPILOG_PRIMID_LOC, "");
 	}
 
 	si_llvm_export_vs(bld_base, outputs, i);
@@ -2284,6 +2385,7 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
 
 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
 				LLVMValueRef *color, unsigned index,
+				unsigned samplemask_param,
 				bool is_last)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
@@ -2291,30 +2393,31 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
 	int i;
 
 	/* Clamp color */
-	if (ctx->shader->key.ps.clamp_color)
+	if (ctx->shader->key.ps.epilog.clamp_color)
 		for (i = 0; i < 4; i++)
 			color[i] = radeon_llvm_saturate(bld_base, color[i]);
 
 	/* Alpha to one */
-	if (ctx->shader->key.ps.alpha_to_one)
+	if (ctx->shader->key.ps.epilog.alpha_to_one)
 		color[3] = base->one;
 
 	/* Alpha test */
 	if (index == 0 &&
-	    ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
+	    ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
 		si_alpha_test(bld_base, color[3]);
 
 	/* Line & polygon smoothing */
-	if (ctx->shader->key.ps.poly_line_smoothing)
-		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]);
+	if (ctx->shader->key.ps.epilog.poly_line_smoothing)
+		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
+							 samplemask_param);
 
 	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
-	if (ctx->shader->key.ps.last_cbuf > 0) {
+	if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
 		LLVMValueRef args[8][9];
 		int c, last = -1;
 
 		/* Get the export arguments, also find out what the last one is. */
-		for (c = 0; c <= ctx->shader->key.ps.last_cbuf; c++) {
+		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
 			si_llvm_init_export_args(bld_base, color,
 						 V_008DFC_SQ_EXP_MRT + c, args[c]);
 			if (args[c][0] != bld_base->uint_bld.zero)
@@ -2322,7 +2425,7 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
 		}
 
 		/* Emit all exports. */
-		for (c = 0; c <= ctx->shader->key.ps.last_cbuf; c++) {
+		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
 			if (is_last && last == c) {
 				args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
 				args[c][2] = bld_base->uint_bld.one; /* DONE bit */
@@ -2385,11 +2488,11 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
 	 * Otherwise, find the last color export.
 	 */
 	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
-		unsigned spi_format = shader->key.ps.spi_shader_col_format;
+		unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;
 
 		/* Don't export NULL and return if alpha-test is enabled. */
-		if (shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS &&
-		    shader->key.ps.alpha_func != PIPE_FUNC_NEVER &&
+		if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
+		    shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
 		    (spi_format & 0xf) == 0)
 			spi_format |= V_028714_SPI_SHADER_32_AR;
 
@@ -2400,10 +2503,10 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
 				continue;
 
 			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
-			if (shader->key.ps.last_cbuf > 0) {
+			if (shader->key.ps.epilog.last_cbuf > 0) {
 				/* Just set this if any of the colorbuffers are enabled. */
 				if (spi_format &
-				    ((1llu << (4 * (shader->key.ps.last_cbuf + 1))) - 1))
+				    ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
 					last_color_export = i;
 				continue;
 			}
@@ -2445,6 +2548,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
 							 ctx->radeon_bld.soa.outputs[i][j], "");
 
 			si_export_mrt_color(bld_base, color, semantic_index,
+					    SI_PARAM_SAMPLE_COVERAGE,
 					    last_color_export == i);
 			break;
 		default:
@@ -2458,6 +2562,100 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
 		si_export_mrt_z(bld_base, depth, stencil, samplemask);
 }
 
+/**
+ * Return PS outputs through ctx->return_value, packed in this order (VGPR slots start right after SI_SGPR_ALPHA_REF):
+ *
+ * v[0:3] = xyzw of the lowest-index declared color output (undeclared colors take no slots)
+ * v[4:7] = xyzw of the next declared color output
+ * ...
+ * next slot = Depth, only if the shader declares a POSITION output
+ * next slot = Stencil, only if declared
+ * next slot = SampleMask, only if declared
+ * last slot = SampleMaskIn (used for OpenGL smoothing), never below VGPR PS_EPILOG_SAMPLEMASK_MIN_LOC
+ *
+ * The alpha-ref SGPR is returned via its original location.
+ */
+static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct si_shader *shader = ctx->shader;
+	struct lp_build_context *base = &bld_base->base;
+	struct tgsi_shader_info *info = &shader->selector->info;
+	LLVMBuilderRef builder = base->gallivm->builder;
+	unsigned i, j, first_vgpr, vgpr;
+
+	LLVMValueRef color[8][4] = {};
+	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+	LLVMValueRef ret;
+
+	/* Read the output values: load each declared output from its alloca; a NULL entry later means "not declared". */
+	for (i = 0; i < info->num_outputs; i++) {
+		unsigned semantic_name = info->output_semantic_name[i];
+		unsigned semantic_index = info->output_semantic_index[i];
+
+		switch (semantic_name) {
+		case TGSI_SEMANTIC_COLOR:
+			assert(semantic_index < 8);
+			for (j = 0; j < 4; j++) {
+				LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
+				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
+				color[semantic_index][j] = result;
+			}
+			break;
+		case TGSI_SEMANTIC_POSITION:
+			depth = LLVMBuildLoad(builder,
+					      ctx->radeon_bld.soa.outputs[i][2], "");
+			break;
+		case TGSI_SEMANTIC_STENCIL:
+			stencil = LLVMBuildLoad(builder,
+						ctx->radeon_bld.soa.outputs[i][1], "");
+			break;
+		case TGSI_SEMANTIC_SAMPLEMASK:
+			samplemask = LLVMBuildLoad(builder,
+						   ctx->radeon_bld.soa.outputs[i][0], "");
+			break;
+		default:
+			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
+				semantic_name);
+		}
+	}
+
+	/* Fill the return structure, starting from the value already held in ctx->return_value. */
+	ret = ctx->return_value;
+
+	/* Set SGPRs: pass the alpha reference through, bitcast to integer, at index SI_SGPR_ALPHA_REF. */
+	ret = LLVMBuildInsertValue(builder, ret,
+				   bitcast(bld_base, TGSI_TYPE_SIGNED,
+					   LLVMGetParam(ctx->radeon_bld.main_fn,
+							SI_PARAM_ALPHA_REF)),
+				   SI_SGPR_ALPHA_REF, "");
+
+	/* Set VGPRs: four consecutive slots per declared color, in ascending semantic index. */
+	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
+	for (i = 0; i < ARRAY_SIZE(color); i++) {
+		if (!color[i][0])
+			continue;
+
+		for (j = 0; j < 4; j++)
+			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
+	}
+	if (depth)
+		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
+	if (stencil)
+		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
+	if (samplemask)
+		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
+
+	/* Add the input sample mask for smoothing at the end; bump the slot up to the minimum location if fewer VGPRs were used. */
+	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
+		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
+	ret = LLVMBuildInsertValue(builder, ret,
+				   LLVMGetParam(ctx->radeon_bld.main_fn,
+						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
+
+	ctx->return_value = ret;
+}
+
 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
 				struct lp_build_tgsi_context *bld_base,
 				struct lp_build_emit_data *emit_data);
@@ -2536,13 +2734,12 @@ static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
 /**
  * Load an image view, fmask view. or sampler state descriptor.
  */
-static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
-				     LLVMValueRef index, enum desc_type type)
+static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
+					    LLVMValueRef list, LLVMValueRef index,
+					    enum desc_type type)
 {
 	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
-	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
-					SI_PARAM_SAMPLERS);
 
 	switch (type) {
 	case DESC_IMAGE:
@@ -2558,12 +2755,21 @@ static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
 		/* The sampler state is at [12:15]. */
 		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
 		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
-		ptr = LLVMBuildPointerCast(builder, ptr,
-					   const_array(ctx->v4i32, 0), "");
+		list = LLVMBuildPointerCast(builder, list,
+					    const_array(ctx->v4i32, 0), "");
 		break;
 	}
 
-	return build_indexed_load_const(ctx, ptr, index);
+	return build_indexed_load_const(ctx, list, index);
+}
+
+static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
+				     LLVMValueRef index, enum desc_type type)
+{
+	LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
+					 SI_PARAM_SAMPLERS);
+	/* Same as get_sampler_desc_custom, with the list taken from the SI_PARAM_SAMPLERS parameter of the main function. */
+	return get_sampler_desc_custom(ctx, list, index, type);
+}
 
 static void tex_fetch_ptrs(
@@ -3546,6 +3752,30 @@ static const struct lp_build_tgsi_action interp_action = {
 	.emit = build_interp_intrinsic,
 };
 
+static void si_create_function(struct si_shader_context *ctx,
+			       LLVMTypeRef *returns, unsigned num_returns,
+			       LLVMTypeRef *params, unsigned num_params,
+			       int last_array_pointer, int last_sgpr)
+{
+	int i;
+	/* Create the main LLVM function from the given return/param types, tag it with ctx->type, and start with an undef return value. */
+	radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
+				params, num_params);
+	radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
+	ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);
+	/* Attribute parameters 0..last_sgpr: byval up to last_array_pointer, inreg for the rest; later parameters get no attribute here. */
+	for (i = 0; i <= last_sgpr; ++i) {
+		LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
+
+		/* We tell LLVM that array inputs are passed by value so that the sinking pass
+		 * is allowed to move loads. Inputs are constant, so this is fine. */
+		if (i <= last_array_pointer)
+			LLVMAddAttribute(P, LLVMByValAttribute);
+		else
+			LLVMAddAttribute(P, LLVMInRegAttribute);
+	}
+}
+
 static void create_meta_data(struct si_shader_context *ctx)
 {
 	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
@@ -3579,15 +3809,57 @@ static void declare_streamout_params(struct si_shader_context *ctx,
 	}
 }
 
+/* Return the size in bytes of an integer, float, pointer or vector type. */
+static unsigned llvm_get_type_size(LLVMTypeRef type)
+{
+	switch (LLVMGetTypeKind(type)) {
+	case LLVMFloatTypeKind:
+		return 4;
+	case LLVMPointerTypeKind:
+		return 8; /* every pointer is counted as 8 bytes */
+	case LLVMIntegerTypeKind:
+		return LLVMGetIntTypeWidth(type) / 8;
+	case LLVMVectorTypeKind: {
+		unsigned elem_size = llvm_get_type_size(LLVMGetElementType(type));
+		return LLVMGetVectorSize(type) * elem_size;
+	}
+	default:
+		assert(0); /* any other type kind is unexpected here */
+		return 0;
+	}
+}
+
+/**
+ * Declare the "tess_lds" global in LOCAL_ADDR_SPACE and store it in ctx->lds.
+ */
+static void declare_tess_lds(struct si_shader_context *ctx)
+{
+	LLVMModuleRef module = ctx->radeon_bld.gallivm.module;
+	LLVMTypeRef dword_type = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
+
+	/* Upper bound: the maximum is 32 inputs times 32 vertices. */
+	const unsigned max_vertex_data_dw = 32 * 32 * 4;
+	const unsigned max_patch_data_dw = 32 * 4;
+	/* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
+	const unsigned num_dwords = max_vertex_data_dw * 2 + max_patch_data_dw;
+	LLVMTypeRef array_type = LLVMArrayType(dword_type, num_dwords);
+
+	/* The actual size is computed outside of the shader to reduce
+	 * the number of shader variants. */
+	ctx->lds = LLVMAddGlobalInAddressSpace(module, array_type, "tess_lds",
+						LOCAL_ADDR_SPACE);
+}
+
 static void create_function(struct si_shader_context *ctx)
 {
 	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	struct si_shader *shader = ctx->shader;
-	LLVMTypeRef params[SI_NUM_PARAMS], v2i32, v3i32;
-	unsigned i, last_array_pointer, last_sgpr, num_params;
+	LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
+	LLVMTypeRef returns[16+32*4];
+	unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs;
+	unsigned num_returns = 0;
 
-	v2i32 = LLVMVectorType(ctx->i32, 2);
 	v3i32 = LLVMVectorType(ctx->i32, 3);
 
 	params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
@@ -3630,6 +3902,20 @@ static void create_function(struct si_shader_context *ctx)
 		params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
 		params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
 		params[ctx->param_instance_id = num_params++] = ctx->i32;
+
+		if (!ctx->is_monolithic &&
+		    !ctx->is_gs_copy_shader) {
+			/* Vertex load indices. */
+			ctx->param_vertex_index0 = num_params;
+
+			for (i = 0; i < shader->selector->info.num_inputs; i++)
+				params[num_params++] = ctx->i32;
+
+			/* PrimitiveID output. */
+			if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
+				for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
+					returns[num_returns++] = ctx->f32;
+		}
 		break;
 
 	case TGSI_PROCESSOR_TESS_CTRL:
@@ -3643,6 +3929,15 @@ static void create_function(struct si_shader_context *ctx)
 		params[SI_PARAM_PATCH_ID] = ctx->i32;
 		params[SI_PARAM_REL_IDS] = ctx->i32;
 		num_params = SI_PARAM_REL_IDS+1;
+
+		if (!ctx->is_monolithic) {
+			/* PARAM_TESS_FACTOR_OFFSET is after user SGPRs. */
+			for (i = 0; i <= SI_TCS_NUM_USER_SGPR; i++)
+				returns[num_returns++] = ctx->i32; /* SGPRs */
+
+			for (i = 0; i < 3; i++)
+				returns[num_returns++] = ctx->f32; /* VGPRs */
+		}
 		break;
 
 	case TGSI_PROCESSOR_TESS_EVAL:
@@ -3663,6 +3958,11 @@ static void create_function(struct si_shader_context *ctx)
 		params[ctx->param_tes_v = num_params++] = ctx->f32;
 		params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
 		params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
+
+		/* PrimitiveID output. */
+		if (!ctx->is_monolithic && !shader->key.tes.as_es)
+			for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
+				returns[num_returns++] = ctx->f32;
 		break;
 
 	case TGSI_PROCESSOR_GEOMETRY:
@@ -3686,13 +3986,13 @@ static void create_function(struct si_shader_context *ctx)
 		params[SI_PARAM_ALPHA_REF] = ctx->f32;
 		params[SI_PARAM_PRIM_MASK] = ctx->i32;
 		last_sgpr = SI_PARAM_PRIM_MASK;
-		params[SI_PARAM_PERSP_SAMPLE] = v2i32;
-		params[SI_PARAM_PERSP_CENTER] = v2i32;
-		params[SI_PARAM_PERSP_CENTROID] = v2i32;
+		params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
+		params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
+		params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
 		params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
-		params[SI_PARAM_LINEAR_SAMPLE] = v2i32;
-		params[SI_PARAM_LINEAR_CENTER] = v2i32;
-		params[SI_PARAM_LINEAR_CENTROID] = v2i32;
+		params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
+		params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
+		params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
 		params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
 		params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
 		params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
@@ -3701,8 +4001,39 @@ static void create_function(struct si_shader_context *ctx)
 		params[SI_PARAM_FRONT_FACE] = ctx->i32;
 		params[SI_PARAM_ANCILLARY] = ctx->i32;
 		params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
-		params[SI_PARAM_POS_FIXED_PT] = ctx->f32;
+		params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
 		num_params = SI_PARAM_POS_FIXED_PT+1;
+
+		if (!ctx->is_monolithic) {
+			/* Color inputs from the prolog. */
+			if (shader->selector->info.colors_read) {
+				unsigned num_color_elements =
+					util_bitcount(shader->selector->info.colors_read);
+
+				assert(num_params + num_color_elements <= ARRAY_SIZE(params));
+				for (i = 0; i < num_color_elements; i++)
+					params[num_params++] = ctx->f32;
+			}
+
+			/* Outputs for the epilog. */
+			num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
+			num_returns =
+				num_return_sgprs +
+				util_bitcount(shader->selector->info.colors_written) * 4 +
+				shader->selector->info.writes_z +
+				shader->selector->info.writes_stencil +
+				shader->selector->info.writes_samplemask +
+				1 /* SampleMaskIn */;
+
+			num_returns = MAX2(num_returns,
+					   num_return_sgprs +
+					   PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+			for (i = 0; i < num_return_sgprs; i++)
+				returns[i] = ctx->i32;
+			for (; i < num_returns; i++)
+				returns[i] = ctx->f32;
+		}
 		break;
 
 	default:
@@ -3711,20 +4042,38 @@ static void create_function(struct si_shader_context *ctx)
 	}
 
 	assert(num_params <= Elements(params));
-	radeon_llvm_create_func(&ctx->radeon_bld, params, num_params);
-	radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
-
-	for (i = 0; i <= last_sgpr; ++i) {
-		LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
 
-		/* We tell llvm that array inputs are passed by value to allow Sinking pass
-		 * to move load. Inputs are constant so this is fine. */
-		if (i <= last_array_pointer)
-			LLVMAddAttribute(P, LLVMByValAttribute);
-		else
-			LLVMAddAttribute(P, LLVMInRegAttribute);
+	si_create_function(ctx, returns, num_returns, params,
+			   num_params, last_array_pointer, last_sgpr);
+
+	/* Reserve register locations for VGPR inputs the PS prolog may need. */
+	if (ctx->type == TGSI_PROCESSOR_FRAGMENT &&
+	    !ctx->is_monolithic) {
+		radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
+					  "InitialPSInputAddr",
+					  S_0286D0_PERSP_SAMPLE_ENA(1) |
+					  S_0286D0_PERSP_CENTER_ENA(1) |
+					  S_0286D0_PERSP_CENTROID_ENA(1) |
+					  S_0286D0_LINEAR_SAMPLE_ENA(1) |
+					  S_0286D0_LINEAR_CENTER_ENA(1) |
+					  S_0286D0_LINEAR_CENTROID_ENA(1) |
+					  S_0286D0_FRONT_FACE_ENA(1) |
+					  S_0286D0_POS_FIXED_PT_ENA(1));
 	}
 
+	shader->info.num_input_sgprs = 0;
+	shader->info.num_input_vgprs = 0;
+
+	for (i = 0; i <= last_sgpr; ++i)
+		shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
+
+	/* Unused fragment shader inputs are eliminated by the compiler,
+	 * so we don't know yet how many there will be.
+	 */
+	if (ctx->type != TGSI_PROCESSOR_FRAGMENT)
+		for (; i < num_params; ++i)
+			shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
+
 	if (bld_base->info &&
 	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
 	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
@@ -3740,22 +4089,8 @@ static void create_function(struct si_shader_context *ctx)
 
 	if ((ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) ||
 	    ctx->type == TGSI_PROCESSOR_TESS_CTRL ||
-	    ctx->type == TGSI_PROCESSOR_TESS_EVAL) {
-		/* This is the upper bound, maximum is 32 inputs times 32 vertices */
-		unsigned vertex_data_dw_size = 32*32*4;
-		unsigned patch_data_dw_size = 32*4;
-		/* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
-		unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size;
-		unsigned lds_dwords = patch_dw_size;
-
-		/* The actual size is computed outside of the shader to reduce
-		 * the number of shader variants. */
-		ctx->lds =
-			LLVMAddGlobalInAddressSpace(gallivm->module,
-						    LLVMArrayType(ctx->i32, lds_dwords),
-						    "tess_lds",
-						    LOCAL_ADDR_SPACE);
-	}
+	    ctx->type == TGSI_PROCESSOR_TESS_EVAL)
+		declare_tess_lds(ctx);
 }
 
 static void preload_constants(struct si_shader_context *ctx)
@@ -3887,6 +4222,49 @@ static void preload_ring_buffers(struct si_shader_context *ctx)
 	}
 }
 
+/* Polygon stipple: load the pattern texel at this pixel's position and pass
+ * the negated component 3 of it (as float) to llvm.AMDGPU.kill. */
+static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
+					 LLVMValueRef param_sampler_views,
+					 unsigned param_pos_fixed_pt)
+{
+	struct lp_build_tgsi_context *bld_base =
+		&ctx->radeon_bld.soa.bld_base;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	struct lp_build_emit_data result = {};
+	struct tgsi_full_instruction inst = {};
+	LLVMValueRef desc, sampler_index, address[2], pix;
+
+	/* Use the fixed-point gl_FragCoord input. The stipple pattern is 32x32
+	 * and repeats, so 5 bits per coordinate give the repeating effect. */
+	address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
+	address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
+
+	/* Load the sampler view descriptor. */
+	sampler_index = lp_build_const_int32(gallivm, SI_POLY_STIPPLE_SAMPLER);
+	desc = get_sampler_desc_custom(ctx, param_sampler_views,
+				       sampler_index, DESC_IMAGE);
+
+	/* Load the texel. */
+	inst.Instruction.Opcode = TGSI_OPCODE_TXF;
+	inst.Texture.Texture = TGSI_TEXTURE_2D_MSAA; /* = use load, not load_mip */
+	result.inst = &inst;
+	set_tex_fetch_args(ctx, &result, TGSI_OPCODE_TXF,
+			   inst.Texture.Texture,
+			   desc, NULL, address, ARRAY_SIZE(address), 0xf);
+	build_tex_intrinsic(&tex_action, bld_base, &result);
+
+	/* Kill the thread accordingly. */
+	pix = LLVMBuildExtractElement(gallivm->builder, result.output[0],
+				      lp_build_const_int32(gallivm, 3), "");
+	pix = bitcast(bld_base, TGSI_TYPE_FLOAT, pix);
+	pix = LLVMBuildFNeg(gallivm->builder, pix, "");
+
+	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
+			   LLVMVoidTypeInContext(gallivm->context),
+			   &pix, 1, 0);
+}
+
 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
 				  struct si_shader_config *conf,
 				  unsigned symbol_offset)
@@ -3972,41 +4350,70 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx,
 	}
 }
 
+/* Return the total code size of the main part plus any prolog and epilog. */
+static unsigned si_get_shader_binary_size(struct si_shader *shader)
+{
+	unsigned prolog_size =
+		shader->prolog ? shader->prolog->binary.code_size : 0;
+	unsigned epilog_size =
+		shader->epilog ? shader->epilog->binary.code_size : 0;
+
+	return shader->binary.code_size + prolog_size + epilog_size;
+}
+
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
 {
-	const struct radeon_shader_binary *binary = &shader->binary;
-	unsigned code_size = binary->code_size + binary->rodata_size;
+	const struct radeon_shader_binary *prolog =
+		shader->prolog ? &shader->prolog->binary : NULL;
+	const struct radeon_shader_binary *epilog =
+		shader->epilog ? &shader->epilog->binary : NULL;
+	const struct radeon_shader_binary *mainb = &shader->binary;
+	unsigned bo_size = si_get_shader_binary_size(shader) +
+			   (!epilog ? mainb->rodata_size : 0);
 	unsigned char *ptr;
 
+	assert(!prolog || !prolog->rodata_size);
+	assert((!prolog && !epilog) || !mainb->rodata_size);
+	assert(!epilog || !epilog->rodata_size);
+
 	r600_resource_reference(&shader->bo, NULL);
 	shader->bo = si_resource_create_custom(&sscreen->b.b,
 					       PIPE_USAGE_IMMUTABLE,
-					       code_size);
+					       bo_size);
 	if (!shader->bo)
 		return -ENOMEM;
 
+	/* Upload the parts in order: prolog, main, then epilog or rodata. */
 	ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
 					PIPE_TRANSFER_READ_WRITE);
-	util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size);
-	if (binary->rodata_size > 0) {
-		ptr += binary->code_size;
-		util_memcpy_cpu_to_le32(ptr, binary->rodata,
-					binary->rodata_size);
+
+	if (prolog) {
+		util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
+		ptr += prolog->code_size;
 	}
 
+	util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
+	ptr += mainb->code_size;
+
+	if (epilog)
+		util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
+	else if (mainb->rodata_size > 0)
+		util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
+
 	sscreen->b.ws->buffer_unmap(shader->bo->buf);
 	return 0;
 }
 
 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
-				       struct pipe_debug_callback *debug)
+				       struct pipe_debug_callback *debug,
+				       const char *name)
 {
 	char *line, *p;
 	unsigned i, count;
 
 	if (binary->disasm_string) {
-		fprintf(stderr, "\nShader Disassembly:\n\n");
-		fprintf(stderr, "%s\n", binary->disasm_string);
+		fprintf(stderr, "Shader %s disassembly:\n", name);
+		fprintf(stderr, "%s", binary->disasm_string);
 
 		if (debug && debug->debug_message) {
 			/* Very long debug messages are cut off, so send the
@@ -4036,7 +4443,7 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
 					   "Shader Disassembly End");
 		}
 	} else {
-		fprintf(stderr, "SI CODE:\n");
+		fprintf(stderr, "Shader %s binary:\n", name);
 		for (i = 0; i < binary->code_size; i += 4) {
 			fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i,
 				binary->code[i + 3], binary->code[i + 2],
@@ -4115,16 +4522,60 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
 			   max_simd_waves);
 }
 
+/* Return a human-readable name for the shader, based on the processor type
+ * and, for VS/TES/GS, on the shader key or the gs_copy_shader pointer. */
+static const char *si_get_shader_name(struct si_shader *shader,
+				      unsigned processor)
+{
+	switch (processor) {
+	case TGSI_PROCESSOR_VERTEX:
+		if (shader->key.vs.as_es)
+			return "Vertex Shader as ES";
+		if (shader->key.vs.as_ls)
+			return "Vertex Shader as LS";
+		return "Vertex Shader as VS";
+	case TGSI_PROCESSOR_TESS_CTRL:
+		return "Tessellation Control Shader";
+	case TGSI_PROCESSOR_TESS_EVAL:
+		return shader->key.tes.as_es ?
+			"Tessellation Evaluation Shader as ES" :
+			"Tessellation Evaluation Shader as VS";
+	case TGSI_PROCESSOR_GEOMETRY:
+		/* With no gs_copy_shader attached, the shader is named as
+		 * the copy shader. */
+		return shader->gs_copy_shader ? "Geometry Shader" :
+						"GS Copy Shader as VS";
+	case TGSI_PROCESSOR_FRAGMENT:
+		return "Pixel Shader";
+	case TGSI_PROCESSOR_COMPUTE:
+		return "Compute Shader";
+	default:
+		return "Unknown Shader";
+	}
+}
+
 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
 		    struct pipe_debug_callback *debug, unsigned processor)
 {
-	if (r600_can_dump_shader(&sscreen->b, processor))
-		if (!(sscreen->b.debug_flags & DBG_NO_ASM))
-			si_shader_dump_disassembly(&shader->binary, debug);
+	if (r600_can_dump_shader(&sscreen->b, processor) &&
+	    !(sscreen->b.debug_flags & DBG_NO_ASM)) {
+		fprintf(stderr, "\n%s:\n", si_get_shader_name(shader, processor));
+
+		if (shader->prolog)
+			si_shader_dump_disassembly(&shader->prolog->binary,
+						   debug, "prolog");
+
+		si_shader_dump_disassembly(&shader->binary, debug, "main");
+
+		if (shader->epilog)
+			si_shader_dump_disassembly(&shader->epilog->binary,
+						   debug, "epilog");
+		fprintf(stderr, "\n");
+	}
 
 	si_shader_dump_stats(sscreen, &shader->config,
 			     shader->selector ? shader->selector->info.num_inputs : 0,
-			     shader->binary.code_size, debug, processor);
+			     si_get_shader_binary_size(shader), debug, processor);
 }
 
 int si_compile_llvm(struct si_screen *sscreen,
@@ -4177,6 +4628,19 @@ int si_compile_llvm(struct si_screen *sscreen,
 	FREE(binary->global_symbol_offsets);
 	binary->config = NULL;
 	binary->global_symbol_offsets = NULL;
+
+	/* Some shaders can't have rodata because their binaries can be
+	 * concatenated.
+	 */
+	if (binary->rodata_size &&
+	    (processor == TGSI_PROCESSOR_VERTEX ||
+	     processor == TGSI_PROCESSOR_TESS_CTRL ||
+	     processor == TGSI_PROCESSOR_TESS_EVAL ||
+	     processor == TGSI_PROCESSOR_FRAGMENT)) {
+		fprintf(stderr, "radeonsi: The shader can't have rodata.");
+		return -EINVAL;
+	}
+
 	return r;
 }
 
@@ -4196,7 +4660,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 
 	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
 
-	si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm, gsinfo);
+	si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
 	ctx->type = TGSI_PROCESSOR_VERTEX;
 	ctx->is_gs_copy_shader = true;
 
@@ -4241,7 +4705,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 
 	si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
 
-	LLVMBuildRetVoid(bld_base->base.gallivm->builder);
+	LLVMBuildRet(gallivm->builder, ctx->return_value);
 
 	/* Dump LLVM IR before any optimization passes */
 	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
@@ -4278,35 +4742,38 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 	switch (shader) {
 	case PIPE_SHADER_VERTEX:
 		fprintf(f, "  instance_divisors = {");
-		for (i = 0; i < Elements(key->vs.instance_divisors); i++)
+		for (i = 0; i < Elements(key->vs.prolog.instance_divisors); i++)
 			fprintf(f, !i ? "%u" : ", %u",
-				key->vs.instance_divisors[i]);
+				key->vs.prolog.instance_divisors[i]);
 		fprintf(f, "}\n");
 		fprintf(f, "  as_es = %u\n", key->vs.as_es);
 		fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
-		fprintf(f, "  export_prim_id = %u\n", key->vs.export_prim_id);
+		fprintf(f, "  export_prim_id = %u\n", key->vs.epilog.export_prim_id);
 		break;
 
 	case PIPE_SHADER_TESS_CTRL:
-		fprintf(f, "  prim_mode = %u\n", key->tcs.prim_mode);
+		fprintf(f, "  prim_mode = %u\n", key->tcs.epilog.prim_mode);
 		break;
 
 	case PIPE_SHADER_TESS_EVAL:
 		fprintf(f, "  as_es = %u\n", key->tes.as_es);
-		fprintf(f, "  export_prim_id = %u\n", key->tes.export_prim_id);
+		fprintf(f, "  export_prim_id = %u\n", key->tes.epilog.export_prim_id);
 		break;
 
 	case PIPE_SHADER_GEOMETRY:
 		break;
 
 	case PIPE_SHADER_FRAGMENT:
-		fprintf(f, "  spi_shader_col_format = 0x%x\n", key->ps.spi_shader_col_format);
-		fprintf(f, "  last_cbuf = %u\n", key->ps.last_cbuf);
-		fprintf(f, "  color_two_side = %u\n", key->ps.color_two_side);
-		fprintf(f, "  alpha_func = %u\n", key->ps.alpha_func);
-		fprintf(f, "  alpha_to_one = %u\n", key->ps.alpha_to_one);
-		fprintf(f, "  poly_stipple = %u\n", key->ps.poly_stipple);
-		fprintf(f, "  clamp_color = %u\n", key->ps.clamp_color);
+		fprintf(f, "  prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
+		fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
+		fprintf(f, "  prolog.force_persample_interp = %u\n", key->ps.prolog.force_persample_interp);
+		fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
+		fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
+		fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
+		fprintf(f, "  epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
+		fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
+		fprintf(f, "  epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
+		fprintf(f, "  epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
 		break;
 
 	default:
@@ -4317,13 +4784,12 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 static void si_init_shader_ctx(struct si_shader_context *ctx,
 			       struct si_screen *sscreen,
 			       struct si_shader *shader,
-			       LLVMTargetMachineRef tm,
-			       struct tgsi_shader_info *info)
+			       LLVMTargetMachineRef tm)
 {
 	struct lp_build_tgsi_context *bld_base;
 
 	memset(ctx, 0, sizeof(*ctx));
-	radeon_llvm_context_init(&ctx->radeon_bld);
+	radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
 	ctx->tm = tm;
 	ctx->screen = sscreen;
 	if (shader && shader->selector)
@@ -4336,15 +4802,18 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
 	ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
 	ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
 	ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
-	ctx->i128 = LLVMInt128TypeInContext(ctx->radeon_bld.gallivm.context);
+	ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
+	ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
 	ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
 	ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
+	ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
 	ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
 	ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
 	ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
 
 	bld_base = &ctx->radeon_bld.soa.bld_base;
-	bld_base->info = info;
+	if (shader && shader->selector)
+		bld_base->info = &shader->selector->info;
 	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
 
 	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
@@ -4380,40 +4849,31 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
 	bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
 }
 
-int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
-		     struct si_shader *shader,
-		     struct pipe_debug_callback *debug)
+int si_compile_tgsi_shader(struct si_screen *sscreen,
+			   LLVMTargetMachineRef tm,
+			   struct si_shader *shader,
+			   bool is_monolithic,
+			   struct pipe_debug_callback *debug)
 {
 	struct si_shader_selector *sel = shader->selector;
-	struct tgsi_token *tokens = sel->tokens;
 	struct si_shader_context ctx;
 	struct lp_build_tgsi_context *bld_base;
-	struct tgsi_shader_info stipple_shader_info;
 	LLVMModuleRef mod;
 	int r = 0;
-	bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT &&
-			    shader->key.ps.poly_stipple;
-
-	if (poly_stipple) {
-		tokens = util_pstipple_create_fragment_shader(tokens, NULL,
-						SI_POLY_STIPPLE_SAMPLER,
-						TGSI_FILE_SYSTEM_VALUE);
-		tgsi_scan_shader(tokens, &stipple_shader_info);
-	}
 
 	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
 	 * conversion fails. */
 	if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
 	    !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
 		si_dump_shader_key(sel->type, &shader->key, stderr);
-		tgsi_dump(tokens, 0);
+		tgsi_dump(sel->tokens, 0);
 		si_dump_streamout(&sel->so);
 	}
 
-	si_init_shader_ctx(&ctx, sscreen, shader, tm,
-			   poly_stipple ? &stipple_shader_info : &sel->info);
+	si_init_shader_ctx(&ctx, sscreen, shader, tm);
+	ctx.is_monolithic = is_monolithic;
 
-	shader->uses_instanceid = sel->info.uses_instanceid;
+	shader->info.uses_instanceid = sel->info.uses_instanceid;
 
 	bld_base = &ctx.radeon_bld.soa.bld_base;
 	ctx.radeon_bld.load_system_value = declare_system_value;
@@ -4447,7 +4907,10 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		break;
 	case TGSI_PROCESSOR_FRAGMENT:
 		ctx.radeon_bld.load_input = declare_input_fs;
-		bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
+		if (is_monolithic)
+			bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
+		else
+			bld_base->emit_epilogue = si_llvm_return_fs_outputs;
 		break;
 	default:
 		assert(!"Unsupported shader type");
@@ -4461,6 +4924,14 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	preload_streamout_buffers(&ctx);
 	preload_ring_buffers(&ctx);
 
+	if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
+	    shader->key.ps.prolog.poly_stipple) {
+		LLVMValueRef views = LLVMGetParam(ctx.radeon_bld.main_fn,
+						  SI_PARAM_SAMPLERS);
+		si_llvm_emit_polygon_stipple(&ctx, views,
+					     SI_PARAM_POS_FIXED_PT);
+	}
+
 	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
 		int i;
 		for (i = 0; i < 4; i++) {
@@ -4470,12 +4941,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		}
 	}
 
-	if (!lp_build_tgsi_llvm(bld_base, tokens)) {
+	if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
 		fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
 		goto out;
 	}
 
-	LLVMBuildRetVoid(bld_base->base.gallivm->builder);
+	LLVMBuildRet(bld_base->base.gallivm->builder, ctx.return_value);
 	mod = bld_base->base.gallivm->module;
 
 	/* Dump LLVM IR before any optimization passes */
@@ -4492,16 +4963,49 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		goto out;
 	}
 
-	si_shader_dump(sscreen, shader, debug, ctx.type);
+	radeon_llvm_dispose(&ctx.radeon_bld);
 
-	r = si_shader_binary_upload(sscreen, shader);
-	if (r) {
-		fprintf(stderr, "LLVM failed to upload shader\n");
-		goto out;
+	/* Calculate the number of fragment input VGPRs. */
+	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
+		shader->info.num_input_vgprs = 0;
+		shader->info.face_vgpr_index = -1;
+
+		if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 2;
+		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 2;
+		if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 2;
+		if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 3;
+		if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 2;
+		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 2;
+		if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 2;
+		if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 1;
+		if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 1;
+		if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 1;
+		if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 1;
+		if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 1;
+		if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
+			shader->info.face_vgpr_index = shader->info.num_input_vgprs;
+			shader->info.num_input_vgprs += 1;
+		}
+		if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 1;
+		if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 1;
+		if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
+			shader->info.num_input_vgprs += 1;
 	}
 
-	radeon_llvm_dispose(&ctx.radeon_bld);
-
 	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
 		shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
 		shader->gs_copy_shader->selector = shader->selector;
@@ -4517,11 +5021,968 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 out:
 	for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
 		FREE(ctx.constants[i]);
-	if (poly_stipple)
-		tgsi_free_tokens(tokens);
 	return r;
 }
 
+/**
+ * Look up the shader part (prolog or epilog) matching the key in the list,
+ * or compile it and prepend it to the list (under shader_parts_mutex).
+ * \param sscreen	screen
+ * \param list		list of shader parts of the same category
+ * \param key		shader part key (compared with memcmp)
+ * \param tm		LLVM target machine
+ * \param debug		debug callback
+ * \param compile	the callback responsible for compilation
+ * \return		the part, or NULL if allocation or compilation failed
+ */
+static struct si_shader_part *
+si_get_shader_part(struct si_screen *sscreen,
+		   struct si_shader_part **list,
+		   union si_shader_part_key *key,
+		   LLVMTargetMachineRef tm,
+		   struct pipe_debug_callback *debug,
+		   bool (*compile)(struct si_screen *,
+				   LLVMTargetMachineRef,
+				   struct pipe_debug_callback *,
+				   struct si_shader_part *))
+{
+	struct si_shader_part *result;
+
+	pipe_mutex_lock(sscreen->shader_parts_mutex);
+
+	/* Find existing. */
+	for (result = *list; result; result = result->next)
+		if (memcmp(&result->key, key, sizeof(*key)) == 0)
+			goto out;
+
+	/* Compile a new one. The allocation can fail, so check it first. */
+	result = CALLOC_STRUCT(si_shader_part);
+	if (!result)
+		goto out;
+
+	result->key = *key;
+	if (compile(sscreen, tm, debug, result)) {
+		result->next = *list; /* cache it at the head of the list */
+		*list = result;
+	} else {
+		FREE(result);
+		result = NULL;
+	}
+out:
+	pipe_mutex_unlock(sscreen->shader_parts_mutex);
+	return result;
+}
+
+/**
+ * Create a vertex shader prolog. Returns true if compilation succeeded.
+ *
+ * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
+ * All inputs are returned unmodified. The vertex load indices are
+ * stored after them, which will be used by the API VS for fetching inputs.
+ *
+ * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
+ *   input_v0,
+ *   input_v1,
+ *   input_v2,
+ *   input_v3,
+ *   (VertexID + BaseVertex),
+ *   (InstanceID + StartInstance),
+ *   (InstanceID / 2 + StartInstance)
+ */
+static bool si_compile_vs_prolog(struct si_screen *sscreen,
+				 LLVMTargetMachineRef tm,
+				 struct pipe_debug_callback *debug,
+				 struct si_shader_part *out)
+{
+	union si_shader_part_key *key = &out->key;
+	struct si_shader shader = {};
+	struct si_shader_context ctx;
+	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+	LLVMTypeRef *params, *returns;
+	LLVMValueRef ret, func;
+	int last_sgpr, num_params, num_returns, i;
+	bool status = true;
+
+	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+	ctx.type = TGSI_PROCESSOR_VERTEX;
+	ctx.param_vertex_id = key->vs_prolog.num_input_sgprs; /* first VGPR param */
+	ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3; /* fourth VGPR param */
+
+	/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
+	params = alloca((key->vs_prolog.num_input_sgprs + 4) *
+			sizeof(LLVMTypeRef));
+	returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
+			  key->vs_prolog.last_input + 1) *
+			 sizeof(LLVMTypeRef));
+	num_params = 0;
+	num_returns = 0;
+
+	/* Declare input and output SGPRs. */
+	num_params = 0; /* redundant: already reset above */
+	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+		params[num_params++] = ctx.i32;
+		returns[num_returns++] = ctx.i32;
+	}
+	last_sgpr = num_params - 1;
+
+	/* 4 preloaded VGPRs (outputs must be floats) */
+	for (i = 0; i < 4; i++) {
+		params[num_params++] = ctx.i32;
+		returns[num_returns++] = ctx.f32;
+	}
+
+	/* Vertex load indices. */
+	for (i = 0; i <= key->vs_prolog.last_input; i++)
+		returns[num_returns++] = ctx.f32;
+
+	/* Create the function. */
+	si_create_function(&ctx, returns, num_returns, params,
+			   num_params, -1, last_sgpr);
+	func = ctx.radeon_bld.main_fn;
+
+	/* Copy inputs to outputs. This should be no-op, as the registers match,
+	 * but it will prevent the compiler from overwriting them unintentionally.
+	 */
+	ret = ctx.return_value;
+	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+		LLVMValueRef p = LLVMGetParam(func, i);
+		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+	}
+	for (i = num_params - 4; i < num_params; i++) { /* the 4 VGPRs */
+		LLVMValueRef p = LLVMGetParam(func, i);
+		p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, ""); /* i32 param -> f32 return */
+		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+	}
+
+	/* Compute vertex load indices from instance divisors; num_params is reused as the return slot. */
+	for (i = 0; i <= key->vs_prolog.last_input; i++) {
+		unsigned divisor = key->vs_prolog.states.instance_divisors[i];
+		LLVMValueRef index;
+
+		if (divisor) {
+			/* InstanceID / Divisor + StartInstance */
+			index = get_instance_index_for_fetch(&ctx.radeon_bld,
+							     SI_SGPR_START_INSTANCE,
+							     divisor);
+		} else {
+			/* VertexID + BaseVertex */
+			index = LLVMBuildAdd(gallivm->builder,
+					     LLVMGetParam(func, ctx.param_vertex_id),
+					     LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
+		}
+
+		index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
+		ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
+					   num_params++, "");
+	}
+
+	/* Compile. A non-zero result from si_compile_llvm is treated as failure. */
+	LLVMBuildRet(gallivm->builder, ret);
+	radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+			    gallivm->module, debug, ctx.type,
+			    "Vertex Shader Prolog"))
+		status = false;
+
+	radeon_llvm_dispose(&ctx.radeon_bld);
+	return status;
+}
+
+/**
+ * Compile the vertex shader epilog. This is also used by the tessellation
+ * evaluation shader compiled as VS. Returns true if compilation succeeded.
+ *
+ * The input is PrimitiveID.
+ *
+ * If PrimitiveID is required by the pixel shader, export it.
+ * Otherwise, do nothing.
+ */
+static bool si_compile_vs_epilog(struct si_screen *sscreen,
+				 LLVMTargetMachineRef tm,
+				 struct pipe_debug_callback *debug,
+				 struct si_shader_part *out)
+{
+	union si_shader_part_key *key = &out->key;
+	struct si_shader_context ctx;
+	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
+	LLVMTypeRef params[5];
+	int num_params, i;
+	bool status = true;
+
+	si_init_shader_ctx(&ctx, sscreen, NULL, tm); /* no si_shader is passed for this part */
+	ctx.type = TGSI_PROCESSOR_VERTEX;
+
+	/* Declare input VGPRs: none unless PrimitiveID has to be exported. */
+	num_params = key->vs_epilog.states.export_prim_id ?
+			   (VS_EPILOG_PRIMID_LOC + 1) : 0;
+	assert(num_params <= ARRAY_SIZE(params));
+
+	for (i = 0; i < num_params; i++)
+		params[i] = ctx.f32;
+
+	/* Create the function. */
+	si_create_function(&ctx, NULL, 0, params, num_params,
+			   -1, -1);
+
+	/* Emit exports. */
+	if (key->vs_epilog.states.export_prim_id) {
+		struct lp_build_context *base = &bld_base->base;
+		struct lp_build_context *uint = &bld_base->uint_bld;
+		LLVMValueRef args[9];
+
+		args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
+		args[1] = uint->zero; /* whether the EXEC mask is valid */
+		args[2] = uint->zero; /* DONE bit */
+		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
+					       key->vs_epilog.prim_id_param_offset);
+		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
+		args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
+				       VS_EPILOG_PRIMID_LOC); /* X */
+		args[6] = uint->undef; /* Y */
+		args[7] = uint->undef; /* Z */
+		args[8] = uint->undef; /* W */
+
+		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+				   LLVMVoidTypeInContext(base->gallivm->context),
+				   args, 9, 0);
+	}
+
+	/* Compile. A non-zero result from si_compile_llvm is treated as failure. */
+	LLVMBuildRet(gallivm->builder, ctx.return_value);
+	radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+			    gallivm->module, debug, ctx.type,
+			    "Vertex Shader Epilog"))
+		status = false;
+
+	radeon_llvm_dispose(&ctx.radeon_bld);
+	return status;
+}
+
+/**
+ * Get (compile or reuse) the VS epilog for "states"; a helper used by VS and TES.
+ */
+static bool si_get_vs_epilog(struct si_screen *sscreen,
+			     LLVMTargetMachineRef tm,
+		             struct si_shader *shader,
+		             struct pipe_debug_callback *debug,
+			     struct si_vs_epilog_bits *states)
+{
+	union si_shader_part_key epilog_key;
+
+	memset(&epilog_key, 0, sizeof(epilog_key)); /* the key is compared with memcmp */
+	epilog_key.vs_epilog.states = *states;
+
+	/* Set up the PrimitiveID output. Test *states, which TES callers take from key.tes. */
+	if (states->export_prim_id) {
+		unsigned index = shader->selector->info.num_outputs;
+		unsigned offset = shader->info.nr_param_exports++;
+
+		epilog_key.vs_epilog.prim_id_param_offset = offset;
+		assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
+		shader->info.vs_output_param_offset[index] = offset;
+	}
+
+	shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
+					    &epilog_key, tm, debug,
+					    si_compile_vs_epilog);
+	return shader->epilog != NULL; /* false if the epilog could not be obtained */
+}
+
+/**
+ * Select and compile (or reuse) VS parts (prolog & epilog); false on failure.
+ */
+static bool si_shader_select_vs_parts(struct si_screen *sscreen,
+				      LLVMTargetMachineRef tm,
+				      struct si_shader *shader,
+				      struct pipe_debug_callback *debug)
+{
+	struct tgsi_shader_info *info = &shader->selector->info;
+	union si_shader_part_key prolog_key;
+	unsigned i;
+
+	/* Get the prolog. The key is zeroed first because it is compared with memcmp. */
+	memset(&prolog_key, 0, sizeof(prolog_key));
+	prolog_key.vs_prolog.states = shader->key.vs.prolog;
+	prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
+	prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1; /* 0 when there are no inputs */
+
+	/* The prolog is a no-op if there are no inputs. */
+	if (info->num_inputs) {
+		shader->prolog =
+			si_get_shader_part(sscreen, &sscreen->vs_prologs,
+					   &prolog_key, tm, debug,
+					   si_compile_vs_prolog);
+		if (!shader->prolog)
+			return false;
+	}
+
+	/* Get the epilog (skipped when the VS is compiled as ES or LS). */
+	if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
+	    !si_get_vs_epilog(sscreen, tm, shader, debug,
+			      &shader->key.vs.epilog))
+		return false;
+
+	/* Set the instanceID flag if any input has a non-zero instance divisor. */
+	for (i = 0; i < info->num_inputs; i++)
+		if (prolog_key.vs_prolog.states.instance_divisors[i])
+			shader->info.uses_instanceid = true;
+
+	return true;
+}
+
+/**
+ * Select and compile (or reuse) TES parts (epilog); returns false on failure.
+ */
+static bool si_shader_select_tes_parts(struct si_screen *sscreen,
+				       LLVMTargetMachineRef tm,
+				       struct si_shader *shader,
+				       struct pipe_debug_callback *debug)
+{
+	if (shader->key.tes.as_es)
+		return true; /* no epilog is selected for a TES compiled as ES */
+
+	/* TES compiled as VS: it uses the VS epilog with the TES epilog bits. */
+	return si_get_vs_epilog(sscreen, tm, shader, debug,
+				&shader->key.tes.epilog);
+}
+
+/**
+ * Compile the TCS epilog. This writes tessellation factors to memory based on
+ * the output primitive type of the tessellator (determined by TES).
+ */
+static bool si_compile_tcs_epilog(struct si_screen *sscreen,
+				  LLVMTargetMachineRef tm,
+				  struct pipe_debug_callback *debug,
+				  struct si_shader_part *out)
+{
+	union si_shader_part_key *key = &out->key;
+	struct si_shader shader = {};
+	struct si_shader_context ctx;
+	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
+	LLVMTypeRef params[16];
+	LLVMValueRef func;
+	int last_array_pointer, last_sgpr, num_params;
+	bool status = true;
+
+	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+	ctx.type = TGSI_PROCESSOR_TESS_CTRL;
+	shader.key.tcs.epilog = key->tcs_epilog.states; /* copy the key states into the temporary shader */
+
+	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
+	params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
+	last_array_pointer = SI_PARAM_RW_BUFFERS;
+	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
+	params[SI_PARAM_SAMPLERS] = ctx.i64;
+	params[SI_PARAM_UNUSED] = ctx.i64;
+	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
+	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
+	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
+	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
+	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
+	num_params = last_sgpr + 1;
+
+	params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
+	params[num_params++] = ctx.i32; /* invocation ID within the patch */
+	params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */
+
+	/* Create the function. */
+	si_create_function(&ctx, NULL, 0, params, num_params,
+			   last_array_pointer, last_sgpr);
+	declare_tess_lds(&ctx);
+	func = ctx.radeon_bld.main_fn;
+
+	si_write_tess_factors(bld_base,
+			      LLVMGetParam(func, last_sgpr + 1),
+			      LLVMGetParam(func, last_sgpr + 2),
+			      LLVMGetParam(func, last_sgpr + 3));
+
+	/* Compile. A non-zero result from si_compile_llvm is treated as failure. */
+	LLVMBuildRet(gallivm->builder, ctx.return_value);
+	radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+			    gallivm->module, debug, ctx.type,
+			    "Tessellation Control Shader Epilog"))
+		status = false;
+
+	radeon_llvm_dispose(&ctx.radeon_bld);
+	return status;
+}
+
+/**
+ * Select and compile (or reuse) TCS parts (epilog); returns false on failure.
+ */
+static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
+				       LLVMTargetMachineRef tm,
+				       struct si_shader *shader,
+				       struct pipe_debug_callback *debug)
+{
+	union si_shader_part_key epilog_key;
+
+	/* Get the epilog. The key is zeroed first because it is compared with memcmp. */
+	memset(&epilog_key, 0, sizeof(epilog_key));
+	epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
+
+	shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
+					    &epilog_key, tm, debug,
+					    si_compile_tcs_epilog);
+	return shader->epilog != NULL;
+}
+
+/**
+ * Compile the pixel shader prolog. This handles:
+ * - two-side color selection and interpolation
+ * - overriding interpolation parameters for the API PS
+ * - polygon stippling
+ *
+ * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
+ * overridden by other states. (e.g. per-sample interpolation)
+ * Interpolated colors are stored after the preloaded VGPRs.
+ */
+static bool si_compile_ps_prolog(struct si_screen *sscreen,
+				 LLVMTargetMachineRef tm,
+				 struct pipe_debug_callback *debug,
+				 struct si_shader_part *out)
+{
+	union si_shader_part_key *key = &out->key;
+	struct si_shader shader = {};
+	struct si_shader_context ctx;
+	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+	LLVMTypeRef *params;
+	LLVMValueRef ret, func;
+	int last_sgpr, num_params, num_returns, i, num_color_channels;
+	bool status = true;
+
+	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+	ctx.type = TGSI_PROCESSOR_FRAGMENT;
+	shader.key.ps.prolog = key->ps_prolog.states;
+
+	/* Number of inputs + 8 color elements (2 colors x 4 channels). */
+	params = alloca((key->ps_prolog.num_input_sgprs +
+			 key->ps_prolog.num_input_vgprs + 8) *
+			sizeof(LLVMTypeRef));
+
+	/* Declare inputs. */
+	num_params = 0;
+	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
+		params[num_params++] = ctx.i32;
+	last_sgpr = num_params - 1;
+
+	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
+		params[num_params++] = ctx.f32;
+
+	/* Declare outputs (same as inputs + add colors if needed); params[] doubles as the return types. */
+	num_returns = num_params;
+	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
+	for (i = 0; i < num_color_channels; i++)
+		params[num_returns++] = ctx.f32;
+
+	/* Create the function. */
+	si_create_function(&ctx, params, num_returns, params,
+			   num_params, -1, last_sgpr);
+	func = ctx.radeon_bld.main_fn;
+
+	/* Copy inputs to outputs. This should be no-op, as the registers match,
+	 * but it will prevent the compiler from overwriting them unintentionally.
+	 */
+	ret = ctx.return_value;
+	for (i = 0; i < num_params; i++) {
+		LLVMValueRef p = LLVMGetParam(func, i);
+		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+	}
+
+	/* Polygon stippling. */
+	if (key->ps_prolog.states.poly_stipple) {
+		/* POS_FIXED_PT is always last. */
+		unsigned pos = key->ps_prolog.num_input_sgprs +
+			       key->ps_prolog.num_input_vgprs - 1;
+		LLVMValueRef ptr[2], views;
+
+		/* Get the pointer to sampler views (rebuilt from two consecutive SGPR params). */
+		ptr[0] = LLVMGetParam(func, SI_SGPR_SAMPLERS);
+		ptr[1] = LLVMGetParam(func, SI_SGPR_SAMPLERS+1);
+		views = lp_build_gather_values(gallivm, ptr, 2);
+		views = LLVMBuildBitCast(gallivm->builder, views, ctx.i64, "");
+		views = LLVMBuildIntToPtr(gallivm->builder, views,
+					  const_array(ctx.v8i32, SI_NUM_SAMPLERS), "");
+
+		si_llvm_emit_polygon_stipple(&ctx, views, pos);
+	}
+
+	/* Interpolate colors (COLOR0 and COLOR1; 4 bits of colors_read each). */
+	for (i = 0; i < 2; i++) {
+		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
+		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
+				     key->ps_prolog.face_vgpr_index;
+		LLVMValueRef interp[2], color[4];
+		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
+
+		if (!writemask)
+			continue;
+
+		/* If the interpolation qualifier is not CONSTANT (-1). */
+		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
+			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
+					       key->ps_prolog.color_interp_vgpr_index[i];
+
+			interp[0] = LLVMGetParam(func, interp_vgpr);
+			interp[1] = LLVMGetParam(func, interp_vgpr + 1);
+			interp_ij = lp_build_gather_values(gallivm, interp, 2);
+			interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
+						     ctx.v2i32, "");
+		}
+
+		/* Use the absolute location of the input. */
+		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
+
+		if (key->ps_prolog.states.color_two_side) {
+			face = LLVMGetParam(func, face_vgpr);
+			face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
+		}
+
+		interp_fs_input(&ctx,
+				key->ps_prolog.color_attr_index[i],
+				TGSI_SEMANTIC_COLOR, i,
+				key->ps_prolog.num_interp_inputs,
+				key->ps_prolog.colors_read, interp_ij,
+				prim_mask, face, color);
+
+		while (writemask) { /* append each read channel; num_params is reused as the return slot */
+			unsigned chan = u_bit_scan(&writemask);
+			ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
+						   num_params++, "");
+		}
+	}
+
+	/* Force per-sample interpolation. */
+	if (key->ps_prolog.states.force_persample_interp) {
+		unsigned i, base = key->ps_prolog.num_input_sgprs; /* this i shadows the outer int i */
+		LLVMValueRef persp_sample[2], linear_sample[2];
+
+		/* Read PERSP_SAMPLE. */
+		for (i = 0; i < 2; i++)
+			persp_sample[i] = LLVMGetParam(func, base + i);
+		/* Overwrite PERSP_CENTER. */
+		for (i = 0; i < 2; i++)
+			ret = LLVMBuildInsertValue(gallivm->builder, ret,
+						   persp_sample[i], base + 2 + i, "");
+		/* Overwrite PERSP_CENTROID. */
+		for (i = 0; i < 2; i++)
+			ret = LLVMBuildInsertValue(gallivm->builder, ret,
+						   persp_sample[i], base + 4 + i, "");
+		/* Read LINEAR_SAMPLE. */
+		for (i = 0; i < 2; i++)
+			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
+		/* Overwrite LINEAR_CENTER. */
+		for (i = 0; i < 2; i++)
+			ret = LLVMBuildInsertValue(gallivm->builder, ret,
+						   linear_sample[i], base + 8 + i, "");
+		/* Overwrite LINEAR_CENTROID. */
+		for (i = 0; i < 2; i++)
+			ret = LLVMBuildInsertValue(gallivm->builder, ret,
+						   linear_sample[i], base + 10 + i, "");
+	}
+
+	/* Compile. A non-zero result from si_compile_llvm is treated as failure. */
+	LLVMBuildRet(gallivm->builder, ret);
+	radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+			    gallivm->module, debug, ctx.type,
+			    "Fragment Shader Prolog"))
+		status = false;
+
+	radeon_llvm_dispose(&ctx.radeon_bld);
+	return status;
+}
+
+/**
+ * Compile the pixel shader epilog. This handles everything that must be
+ * emulated for pixel shader exports. (alpha-test, format conversions, etc)
+ */
+static bool si_compile_ps_epilog(struct si_screen *sscreen,
+				 LLVMTargetMachineRef tm,
+				 struct pipe_debug_callback *debug,
+				 struct si_shader_part *out)
+{
+	union si_shader_part_key *key = &out->key;
+	struct si_shader shader = {};
+	struct si_shader_context ctx;
+	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
+	LLVMTypeRef params[16+8*4+3]; /* presumably SGPR slots + 8 MRTs x 4 channels + Z/stencil/samplemask */
+	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+	int last_array_pointer, last_sgpr, num_params, i;
+	bool status = true;
+
+	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+	ctx.type = TGSI_PROCESSOR_FRAGMENT;
+	shader.key.ps.epilog = key->ps_epilog.states;
+
+	/* Declare input SGPRs. */
+	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
+	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
+	params[SI_PARAM_SAMPLERS] = ctx.i64;
+	params[SI_PARAM_UNUSED] = ctx.i64;
+	params[SI_PARAM_ALPHA_REF] = ctx.f32;
+	last_array_pointer = -1;
+	last_sgpr = SI_PARAM_ALPHA_REF;
+
+	/* Declare input VGPRs: 4 per written color, plus one each for Z, stencil and sample mask. */
+	num_params = (last_sgpr + 1) +
+		     util_bitcount(key->ps_epilog.colors_written) * 4 +
+		     key->ps_epilog.writes_z +
+		     key->ps_epilog.writes_stencil +
+		     key->ps_epilog.writes_samplemask;
+
+	num_params = MAX2(num_params,
+			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+	assert(num_params <= ARRAY_SIZE(params));
+
+	for (i = last_sgpr + 1; i < num_params; i++)
+		params[i] = ctx.f32;
+
+	/* Create the function. */
+	si_create_function(&ctx, NULL, 0, params, num_params,
+			   last_array_pointer, last_sgpr);
+	/* Disable elimination of unused inputs. */
+	radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
+				  "InitialPSInputAddr", 0xffffff);
+
+	/* Process colors. */
+	unsigned vgpr = last_sgpr + 1;
+	unsigned colors_written = key->ps_epilog.colors_written;
+	int last_color_export = -1;
+
+	/* Find the last color export (stays -1 when Z, stencil or sample mask is written). */
+	if (!key->ps_epilog.writes_z &&
+	    !key->ps_epilog.writes_stencil &&
+	    !key->ps_epilog.writes_samplemask) {
+		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
+
+		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
+			/* Just set this if any of the colorbuffers are enabled. */
+			if (spi_format &
+			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
+				last_color_export = 0;
+		} else {
+			for (i = 0; i < 8; i++)
+				if (colors_written & (1 << i) &&
+				    (spi_format >> (i * 4)) & 0xf)
+					last_color_export = i;
+		}
+	}
+
+	while (colors_written) { /* one iteration per written MRT, lowest index first */
+		LLVMValueRef color[4];
+		int mrt = u_bit_scan(&colors_written);
+
+		for (i = 0; i < 4; i++)
+			color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+
+		si_export_mrt_color(bld_base, color, mrt,
+				    num_params - 1,
+				    mrt == last_color_export);
+	}
+
+	/* Process depth, stencil, samplemask (their VGPRs follow the colors, in this order). */
+	if (key->ps_epilog.writes_z)
+		depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+	if (key->ps_epilog.writes_stencil)
+		stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+	if (key->ps_epilog.writes_samplemask)
+		samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+
+	if (depth || stencil || samplemask)
+		si_export_mrt_z(bld_base, depth, stencil, samplemask);
+	else if (last_color_export == -1)
+		si_export_null(bld_base); /* nothing else was selected as the last export */
+
+	/* Compile. A non-zero result from si_compile_llvm is treated as failure. */
+	LLVMBuildRetVoid(gallivm->builder);
+	radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+			    gallivm->module, debug, ctx.type,
+			    "Fragment Shader Epilog"))
+		status = false;
+
+	radeon_llvm_dispose(&ctx.radeon_bld);
+	return status;
+}
+
+/**
+ * Select and compile (or reuse) pixel shader parts (prolog & epilog); false on failure.
+ */
+static bool si_shader_select_ps_parts(struct si_screen *sscreen,
+				      LLVMTargetMachineRef tm,
+				      struct si_shader *shader,
+				      struct pipe_debug_callback *debug)
+{
+	struct tgsi_shader_info *info = &shader->selector->info;
+	union si_shader_part_key prolog_key;
+	union si_shader_part_key epilog_key;
+	unsigned i;
+
+	/* Get the prolog. The key is zeroed first because it is compared with memcmp. */
+	memset(&prolog_key, 0, sizeof(prolog_key));
+	prolog_key.ps_prolog.states = shader->key.ps.prolog;
+	prolog_key.ps_prolog.colors_read = info->colors_read;
+	prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
+	prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
+
+	if (info->colors_read) {
+		unsigned *color = shader->selector->color_attr_index;
+
+		if (shader->key.ps.prolog.color_two_side) {
+			/* BCOLORs are stored after the last input. */
+			prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
+			prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
+			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
+		}
+
+		for (i = 0; i < 2; i++) {
+			unsigned location = info->input_interpolate_loc[color[i]];
+
+			if (!(info->colors_read & (0xf << i*4)))
+				continue; /* COLOR[i] is not read at all */
+
+			prolog_key.ps_prolog.color_attr_index[i] = color[i];
+
+			/* Force per-sample interpolation for the colors here. */
+			if (shader->key.ps.prolog.force_persample_interp)
+				location = TGSI_INTERPOLATE_LOC_SAMPLE;
+
+			switch (info->input_interpolate[color[i]]) {
+			case TGSI_INTERPOLATE_CONSTANT:
+				prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1; /* -1: no (i,j) weights needed */
+				break;
+			case TGSI_INTERPOLATE_PERSPECTIVE:
+			case TGSI_INTERPOLATE_COLOR:
+				switch (location) {
+				case TGSI_INTERPOLATE_LOC_SAMPLE:
+					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
+					shader->config.spi_ps_input_ena |=
+						S_0286CC_PERSP_SAMPLE_ENA(1);
+					break;
+				case TGSI_INTERPOLATE_LOC_CENTER:
+					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
+					shader->config.spi_ps_input_ena |=
+						S_0286CC_PERSP_CENTER_ENA(1);
+					break;
+				case TGSI_INTERPOLATE_LOC_CENTROID:
+					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
+					shader->config.spi_ps_input_ena |=
+						S_0286CC_PERSP_CENTROID_ENA(1);
+					break;
+				default:
+					assert(0);
+				}
+				break;
+			case TGSI_INTERPOLATE_LINEAR:
+				switch (location) {
+				case TGSI_INTERPOLATE_LOC_SAMPLE:
+					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
+					shader->config.spi_ps_input_ena |=
+						S_0286CC_LINEAR_SAMPLE_ENA(1);
+					break;
+				case TGSI_INTERPOLATE_LOC_CENTER:
+					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
+					shader->config.spi_ps_input_ena |=
+						S_0286CC_LINEAR_CENTER_ENA(1);
+					break;
+				case TGSI_INTERPOLATE_LOC_CENTROID:
+					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
+					shader->config.spi_ps_input_ena |=
+						S_0286CC_LINEAR_CENTROID_ENA(1);
+					break;
+				default:
+					assert(0);
+				}
+				break;
+			default:
+				assert(0);
+			}
+		}
+	}
+
+	/* The prolog is a no-op if these aren't set. */
+	if (prolog_key.ps_prolog.colors_read ||
+	    prolog_key.ps_prolog.states.force_persample_interp ||
+	    prolog_key.ps_prolog.states.poly_stipple) {
+		shader->prolog =
+			si_get_shader_part(sscreen, &sscreen->ps_prologs,
+					   &prolog_key, tm, debug,
+					   si_compile_ps_prolog);
+		if (!shader->prolog)
+			return false;
+	}
+
+	/* Get the epilog (always selected, unlike the prolog). */
+	memset(&epilog_key, 0, sizeof(epilog_key));
+	epilog_key.ps_epilog.colors_written = info->colors_written;
+	epilog_key.ps_epilog.writes_z = info->writes_z;
+	epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
+	epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
+	epilog_key.ps_epilog.states = shader->key.ps.epilog;
+
+	shader->epilog =
+		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
+				   &epilog_key, tm, debug,
+				   si_compile_ps_epilog);
+	if (!shader->epilog)
+		return false;
+
+	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
+	if (shader->key.ps.prolog.poly_stipple) {
+		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
+		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
+	}
+
+	/* Set up the enable bits for per-sample shading if needed. */
+	if (shader->key.ps.prolog.force_persample_interp) {
+		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
+		    G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
+			shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
+			shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
+			shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
+		}
+		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
+		    G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
+			shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
+			shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
+			shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
+		}
+	}
+
+	/* POS_W_FLOAT requires that one of the perspective weights is enabled. */
+	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
+	    !(shader->config.spi_ps_input_ena & 0xf)) {
+		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
+		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
+	}
+
+	/* At least one pair of interpolation weights must be enabled. */
+	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
+		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
+		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
+	}
+
+	/* The sample mask input is always enabled, because the API shader always
+	 * passes it through to the epilog. Disable it here if it's unused.
+	 */
+	if (!shader->key.ps.epilog.poly_line_smoothing &&
+	    !shader->selector->info.reads_samplemask)
+		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
+
+	return true;
+}
+
+int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
+		     struct si_shader *shader,
+		     struct pipe_debug_callback *debug)
+{
+	struct si_shader *mainp = shader->selector->main_shader_part;
+	int r;
+
+	/* Build and upload a variant; returns 0 on success. LS and ES are always compiled on demand. */
+	if (!mainp ||
+	    (shader->selector->type == PIPE_SHADER_VERTEX &&
+	     (shader->key.vs.as_es || shader->key.vs.as_ls)) ||
+	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
+	     shader->key.tes.as_es)) {
+		/* Monolithic shader (compiled as a whole, has many variants,
+		 * may take a long time to compile).
+		 */
+		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
+		if (r)
+			return r;
+	} else {
+		/* The shader consists of 2-3 parts:
+		 *
+		 * - the middle part is the user shader, it has 1 variant only
+		 *   and it was compiled during the creation of the shader
+		 *   selector
+		 * - the prolog part is inserted at the beginning
+		 * - the epilog part is inserted at the end
+		 *
+		 * The prolog and epilog have many (but simple) variants.
+		 */
+
+		/* Copy the compiled TGSI shader data over (the binary is shared, not duplicated). */
+		shader->is_binary_shared = true;
+		shader->binary = mainp->binary;
+		shader->config = mainp->config;
+		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
+		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
+		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
+		memcpy(shader->info.vs_output_param_offset,
+		       mainp->info.vs_output_param_offset,
+		       sizeof(mainp->info.vs_output_param_offset));
+		shader->info.uses_instanceid = mainp->info.uses_instanceid;
+		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
+		shader->info.nr_param_exports = mainp->info.nr_param_exports;
+
+		/* Select prologs and/or epilogs; other shader types get no parts here. */
+		switch (shader->selector->type) {
+		case PIPE_SHADER_VERTEX:
+			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
+				return -1;
+			break;
+		case PIPE_SHADER_TESS_CTRL:
+			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
+				return -1;
+			break;
+		case PIPE_SHADER_TESS_EVAL:
+			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
+				return -1;
+			break;
+		case PIPE_SHADER_FRAGMENT:
+			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
+				return -1;
+
+			/* Make sure we have at least as many VGPRs as there
+			 * are allocated inputs.
+			 */
+			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+							shader->info.num_input_vgprs);
+			break;
+		}
+
+		/* Update SGPR and VGPR counts to the maximum over all selected parts. */
+		if (shader->prolog) {
+			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
+							shader->prolog->config.num_sgprs);
+			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+							shader->prolog->config.num_vgprs);
+		}
+		if (shader->epilog) {
+			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
+							shader->epilog->config.num_sgprs);
+			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+							shader->epilog->config.num_vgprs);
+		}
+	}
+
+	si_shader_dump(sscreen, shader, debug, shader->selector->info.processor);
+
+	/* Upload. A non-zero result is reported on stderr and returned to the caller. */
+	r = si_shader_binary_upload(sscreen, shader);
+	if (r) {
+		fprintf(stderr, "LLVM failed to upload shader\n");
+		return r;
+	}
+
+	return 0;
+}
+
 void si_shader_destroy(struct si_shader *shader)
 {
 	if (shader->gs_copy_shader) {
@@ -4534,5 +5995,6 @@ void si_shader_destroy(struct si_shader *shader)
 
 	r600_resource_reference(&shader->bo, NULL);
 
-	radeon_shader_binary_clean(&shader->binary);
+	if (!shader->is_binary_shared)
+		radeon_shader_binary_clean(&shader->binary);
 }
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index dc75e0330e4..ff5c24d8918 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -75,6 +75,8 @@
 struct radeon_shader_binary;
 struct radeon_shader_reloc;
 
+#define SI_MAX_VS_OUTPUTS	40
+
 #define SI_SGPR_RW_BUFFERS	0  /* rings (& stream-out, VS only) */
 #define SI_SGPR_CONST_BUFFERS	2
 #define SI_SGPR_SAMPLERS	4  /* images & sampler states interleaved */
@@ -169,7 +171,7 @@ struct radeon_shader_reloc;
 #define SI_PARAM_SAMPLE_COVERAGE	20
 #define SI_PARAM_POS_FIXED_PT		21
 
-#define SI_NUM_PARAMS (SI_PARAM_POS_FIXED_PT + 1)
+#define SI_NUM_PARAMS (SI_PARAM_POS_FIXED_PT + 9) /* +8 for COLOR[0..1] */
 
 struct si_shader;
 
@@ -181,6 +183,11 @@ struct si_shader_selector {
 	struct si_shader	*first_variant; /* immutable after the first variant */
 	struct si_shader	*last_variant; /* mutable */
 
+	/* The compiled TGSI shader expecting a prolog and/or epilog (not
+	 * uploaded to a buffer).
+	 */
+	struct si_shader	*main_shader_part;
+
 	struct tgsi_token       *tokens;
 	struct pipe_stream_output_info  so;
 	struct tgsi_shader_info		info;
@@ -199,6 +206,7 @@ struct si_shader_selector {
 	unsigned	max_gsvs_emit_size;
 
 	/* PS parameters. */
+	unsigned	color_attr_index[2];
 	unsigned	db_shader_control;
 	/* Set 0xf or 0x0 (4 bits) per each written output.
 	 * ANDed with spi_shader_col_format.
@@ -221,37 +229,103 @@ struct si_shader_selector {
  * With both:        LS | HS  | ES  | GS | VS | PS
  */
 
+/* Common VS bits between the shader key and the prolog key. */
+struct si_vs_prolog_bits {
+	unsigned	instance_divisors[SI_NUM_VERTEX_BUFFERS]; /* per vertex element */
+};
+
+/* Common VS bits between the shader key and the epilog key. */
+struct si_vs_epilog_bits {
+	unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
+	/* TODO:
+	 * - skip clipdist, culldist (including clipvertex code) exports based
+	 *   on which clip_plane_enable bits are set
+	 * - skip layer, viewport, clipdist, and culldist parameter exports
+	 *   if PS doesn't read them
+	 */
+};
+
+/* Common TCS bits between the shader key and the epilog key. */
+struct si_tcs_epilog_bits {
+	unsigned	prim_mode:3; /* TGSI_PROPERTY_TES_PRIM_MODE of the TES */
+};
+
+/* Common PS bits between the shader key and the prolog key. */
+struct si_ps_prolog_bits {
+	unsigned	color_two_side:1; /* rs->two_side and colors are read */
+	/* TODO: add a flatshade bit that skips interpolation for colors */
+	unsigned	poly_stipple:1; /* stipple enabled and drawing polygons */
+	unsigned	force_persample_interp:1; /* set in si_shader_selector_key */
+	/* TODO:
+	 * - add force_center_interp if MSAA is disabled and centroid or
+	 *   sample are present
+	 * - add force_center_interp_bc_optimize to force center interpolation
+	 *   based on the bc_optimize SGPR bit if MSAA is enabled, centroid is
+	 *   present and sample isn't present.
+	 */
+};
+
+/* Common PS bits between the shader key and the epilog key. */
+struct si_ps_epilog_bits {
+	unsigned	spi_shader_col_format; /* 4 bits per color output */
+	unsigned	color_is_int8:8; /* 1 bit per color output */
+	unsigned	last_cbuf:3; /* nr_cbufs - 1 if COLOR0 writes all cbufs */
+	unsigned	alpha_func:3; /* from si_get_alpha_test_func (PIPE_FUNC_*) */
+	unsigned	alpha_to_one:1;
+	unsigned	poly_line_smoothing:1; /* only with nr_samples <= 1 */
+	unsigned	clamp_color:1; /* rs->clamp_fragment_color */
+};
+
+union si_shader_part_key { /* the "key" member of struct si_shader_part */
+	struct {
+		struct si_vs_prolog_bits states;
+		unsigned	num_input_sgprs:5;
+		unsigned	last_input:4;
+	} vs_prolog;
+	struct {
+		struct si_vs_epilog_bits states;
+		unsigned	prim_id_param_offset:5;
+	} vs_epilog;
+	struct {
+		struct si_tcs_epilog_bits states;
+	} tcs_epilog;
+	struct {
+		struct si_ps_prolog_bits states;
+		unsigned	num_input_sgprs:5;
+		unsigned	num_input_vgprs:5;
+		/* Color interpolation and two-side color selection. */
+		unsigned	colors_read:8; /* color input components read */
+		unsigned	num_interp_inputs:5; /* BCOLOR is at this location */
+		unsigned	face_vgpr_index:5;
+		char		color_attr_index[2];
+		/* -1 == constant; explicitly signed, plain char may be unsigned */
+		signed char	color_interp_vgpr_index[2];
+	} ps_prolog;
+	struct {
+		struct si_ps_epilog_bits states;
+		unsigned	colors_written:8; /* presumably 1 bit per color output */
+		unsigned	writes_z:1;
+		unsigned	writes_stencil:1;
+		unsigned	writes_samplemask:1;
+	} ps_epilog;
+};
+
 union si_shader_key {
 	struct {
-		unsigned	spi_shader_col_format;
-		unsigned	color_is_int8:8;
-		unsigned	last_cbuf:3;
-		unsigned	color_two_side:1;
-		unsigned	alpha_func:3;
-		unsigned	alpha_to_one:1;
-		unsigned	poly_stipple:1;
-		unsigned	poly_line_smoothing:1;
-		unsigned	clamp_color:1;
-		unsigned	force_persample_interp:1;
+		struct si_ps_prolog_bits prolog;
+		struct si_ps_epilog_bits epilog;
 	} ps;
 	struct {
-		unsigned	instance_divisors[SI_NUM_VERTEX_BUFFERS];
-		/* Mask of "get_unique_index" bits - which outputs are read
-		 * by the next stage (needed by ES).
-		 * This describes how outputs are laid out in memory. */
+		struct si_vs_prolog_bits prolog;
+		struct si_vs_epilog_bits epilog;
 		unsigned	as_es:1; /* export shader */
 		unsigned	as_ls:1; /* local shader */
-		unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
 	} vs;
 	struct {
-		unsigned	prim_mode:3;
+		struct si_tcs_epilog_bits epilog;
 	} tcs; /* tessellation control shader */
 	struct {
-		/* Mask of "get_unique_index" bits - which outputs are read
-		 * by the next stage (needed by ES).
-		 * This describes how outputs are laid out in memory. */
+		struct si_vs_epilog_bits epilog; /* same as VS */
 		unsigned	as_es:1; /* export shader */
-		unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
 	} tes; /* tessellation evaluation shader */
 };
 
@@ -267,22 +341,42 @@ struct si_shader_config {
 	unsigned			rsrc2;
 };
 
+/* GCN-specific shader info. Copied as-is into cached shader binaries. */
+struct si_shader_info {
+	ubyte			vs_output_param_offset[SI_MAX_VS_OUTPUTS]; /* per VS output; PrimID uses [num_outputs] */
+	ubyte			num_input_sgprs;
+	ubyte			num_input_vgprs; /* PS: lower bound for config.num_vgprs */
+	char			face_vgpr_index;
+	bool			uses_instanceid; /* raises vgpr_comp_cnt to 3 for VS as LS/ES/VS */
+	ubyte			nr_pos_exports; /* > 1, > 2, > 3 enable POS1, POS2, POS3 */
+	ubyte			nr_param_exports; /* for VS_EXPORT_COUNT (at least 1 is used) */
+};
+
 struct si_shader {
 	struct si_shader_selector	*selector;
 	struct si_shader		*next_variant;
 
+	struct si_shader_part		*prolog;
+	struct si_shader_part		*epilog;
+
 	struct si_shader		*gs_copy_shader;
 	struct si_pm4_state		*pm4;
 	struct r600_resource		*bo;
 	struct r600_resource		*scratch_bo;
 	union si_shader_key		key;
+	bool				is_binary_shared;
+
+	/* The following data is all that's needed for binary shaders. */
 	struct radeon_shader_binary	binary;
 	struct si_shader_config		config;
+	struct si_shader_info		info;
+};
 
-	unsigned		vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS];
-	bool			uses_instanceid;
-	unsigned		nr_pos_exports;
-	unsigned		nr_param_exports;
+struct si_shader_part { /* what si_shader::prolog and ::epilog point to */
+	struct si_shader_part *next; /* presumably links the parts in a list */
+	union si_shader_part_key key;
+	struct radeon_shader_binary binary;
+	struct si_shader_config config; /* SGPR/VGPR counts are merged into the shader's */
 };
 
 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
@@ -310,14 +404,19 @@ static inline struct si_shader* si_get_vs_state(struct si_context *sctx)
 static inline bool si_vs_exports_prim_id(struct si_shader *shader)
 {
 	if (shader->selector->type == PIPE_SHADER_VERTEX)
-		return shader->key.vs.export_prim_id;
+		return shader->key.vs.epilog.export_prim_id;
 	else if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
-		return shader->key.tes.export_prim_id;
+		return shader->key.tes.epilog.export_prim_id;
 	else
 		return false;
 }
 
-/* radeonsi_shader.c */
+/* si_shader.c */
+int si_compile_tgsi_shader(struct si_screen *sscreen,
+			   LLVMTargetMachineRef tm,
+			   struct si_shader *shader,
+			   bool is_monolithic,
+			   struct pipe_debug_callback *debug);
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		     struct si_shader *shader,
 		     struct pipe_debug_callback *debug);
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index bf780777b50..2dfdbeb8d8f 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -277,7 +277,7 @@ static void si_emit_cb_render_state(struct si_context *sctx, struct r600_atom *a
 	if (sctx->b.family == CHIP_STONEY) {
 		unsigned spi_shader_col_format =
 			sctx->ps_shader.cso ?
-			sctx->ps_shader.current->key.ps.spi_shader_col_format : 0;
+			sctx->ps_shader.current->key.ps.epilog.spi_shader_col_format : 0;
 		unsigned sx_ps_downconvert = 0;
 		unsigned sx_blend_opt_epsilon = 0;
 		unsigned sx_blend_opt_control = 0;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index f64c4d45f1b..40792cbc1d5 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -280,6 +280,8 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 /* si_state_shader.c */
 bool si_update_shaders(struct si_context *sctx);
 void si_init_shader_functions(struct si_context *sctx);
+bool si_init_shader_cache(struct si_screen *sscreen);
+void si_destroy_shader_cache(struct si_screen *sscreen);
 
 /* si_state_draw.c */
 void si_emit_cache_flush(struct si_context *sctx, struct r600_atom *atom);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 77a4e47c809..a6753a7a528 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -32,10 +32,221 @@
 
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_ureg.h"
+#include "util/hash_table.h"
+#include "util/u_hash.h"
 #include "util/u_memory.h"
 #include "util/u_prim.h"
 #include "util/u_simple_shaders.h"
 
+/* SHADER_CACHE */
+
+/**
+ * Build the shader cache key: a buffer holding its own total size in the
+ * first 4 bytes, then the TGSI tokens, then the stream output info.
+ */
+static void *si_get_tgsi_binary(struct si_shader_selector *sel)
+{
+	unsigned tokens_bytes = tgsi_num_tokens(sel->tokens) *
+				sizeof(struct tgsi_token);
+	unsigned total_bytes = 4 + tokens_bytes + sizeof(sel->so);
+	char *key = (char*)MALLOC(total_bytes);
+	char *payload;
+	if (!key)
+		return NULL;
+	payload = key + 4; /* skip the size dword */
+	*(uint32_t*)key = total_bytes;
+	memcpy(payload, sel->tokens, tokens_bytes);
+	memcpy(payload + tokens_bytes, &sel->so, sizeof(sel->so));
+	return key;
+}
+
+/** Copy "data" to "ptr" (NULL is fine if size == 0); return the next dword. */
+static uint32_t *write_data(uint32_t *ptr, const void *data, unsigned size)
+{
+	if (size) /* memcpy with a NULL pointer is undefined even for 0 bytes */
+		memcpy(ptr, data, size);
+	return ptr + DIV_ROUND_UP(size, 4);
+}
+
+/** Read "size" bytes from "ptr" into "data"; return the next dword. */
+static uint32_t *read_data(uint32_t *ptr, void *data, unsigned size)
+{
+	if (size) /* "data" may be NULL then, which memcpy must not be given */
+		memcpy(data, ptr, size);
+	return ptr + DIV_ROUND_UP(size, 4);
+}
+
+/**
+ * Store "size" as a uint header, then the data itself. Return the next
+ * dword following the copied data.
+ */
+static uint32_t *write_chunk(uint32_t *ptr, const void *data, unsigned size)
+{
+	ptr[0] = size;
+	return write_data(ptr + 1, data, size);
+}
+
+/**
+ * Read the size as uint, then the data into a malloc'ed *data (which stays
+ * NULL if the chunk is empty or malloc fails). Return the next dword.
+ */
+static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size)
+{
+	*size = *ptr++;
+	assert(*data == NULL);
+	*data = *size ? malloc(*size) : NULL;
+	if (!*data) /* empty chunk or OOM: copy nothing, skip the payload */
+		return ptr + DIV_ROUND_UP(*size, 4);
+	return read_data(ptr, *data, *size);
+}
+
+/**
+ * Return the shader binary in a buffer. The first 4 bytes contain its size
+ * as integer, the next 4 bytes the CRC32 of everything that follows.
+ */
+static void *si_get_shader_binary(struct si_shader *shader)
+{
+	/* There is always a size of data followed by the data itself. */
+	unsigned relocs_size = shader->binary.reloc_count *
+			       sizeof(shader->binary.relocs[0]);
+	/* The disassembly may be missing; it's stored as an empty chunk then. */
+	unsigned disasm_size = shader->binary.disasm_string ?
+			       strlen(shader->binary.disasm_string) + 1 : 0;
+	unsigned size =
+		4 + /* total size */
+		4 + /* CRC32 of the data below */
+		align(sizeof(shader->config), 4) +
+		align(sizeof(shader->info), 4) +
+		4 + align(shader->binary.code_size, 4) +
+		4 + align(shader->binary.rodata_size, 4) +
+		4 + align(relocs_size, 4) +
+		4 + align(disasm_size, 4);
+	void *buffer = CALLOC(1, size);
+	uint32_t *ptr = (uint32_t*)buffer;
+
+	if (!buffer)
+		return NULL;
+
+	*ptr++ = size;
+	ptr++; /* CRC32 is calculated at the end. */
+	ptr = write_data(ptr, &shader->config, sizeof(shader->config));
+	ptr = write_data(ptr, &shader->info, sizeof(shader->info));
+	ptr = write_chunk(ptr, shader->binary.code, shader->binary.code_size);
+	ptr = write_chunk(ptr, shader->binary.rodata, shader->binary.rodata_size);
+	ptr = write_chunk(ptr, shader->binary.relocs, relocs_size);
+	ptr = write_chunk(ptr, shader->binary.disasm_string, disasm_size);
+	assert((char *)ptr - (char *)buffer == size);
+	/* The CRC32 covers everything after the two header dwords. */
+	ptr = (uint32_t*)buffer;
+	ptr[1] = util_hash_crc32(ptr + 2, size - 8);
+	return buffer;
+}
+
+/* Fill "shader" from a si_get_shader_binary blob. Return false on a CRC32
+ * mismatch or if code, rodata or relocs have a size but no data. */
+static bool si_load_shader_binary(struct si_shader *shader, void *binary)
+{
+	struct radeon_shader_binary *bin = &shader->binary;
+	uint32_t *ptr = (uint32_t*)binary;
+	uint32_t size = *ptr++;
+	uint32_t crc32 = *ptr++;
+	unsigned chunk_size;
+	if (util_hash_crc32(ptr, size - 8) != crc32) {
+		fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n");
+		return false;
+	}
+	ptr = read_data(ptr, &shader->config, sizeof(shader->config));
+	ptr = read_data(ptr, &shader->info, sizeof(shader->info));
+	ptr = read_chunk(ptr, (void**)&bin->code, &bin->code_size);
+	ptr = read_chunk(ptr, (void**)&bin->rodata, &bin->rodata_size);
+	ptr = read_chunk(ptr, (void**)&bin->relocs, &chunk_size);
+	bin->reloc_count = chunk_size / sizeof(bin->relocs[0]);
+	ptr = read_chunk(ptr, (void**)&bin->disasm_string, &chunk_size);
+	return (bin->code || !bin->code_size) &&
+	       (bin->rodata || !bin->rodata_size) &&
+	       (bin->relocs || !bin->reloc_count);
+}
+
+/**
+ * Add a compiled shader to the cache under the key "tgsi_binary". The shader
+ * is assumed not to be cached yet; try si_shader_cache_load_shader first.
+ *
+ * Returns true if the entry was added. On false, nothing was added and the
+ * caller should free tgsi_binary.
+ */
+static bool si_shader_cache_insert_shader(struct si_screen *sscreen,
+					  void *tgsi_binary,
+					  struct si_shader *shader)
+{
+	void *hw_binary = si_get_shader_binary(shader);
+	bool inserted = false;
+
+	if (hw_binary)
+		inserted = _mesa_hash_table_insert(sscreen->shader_cache,
+						   tgsi_binary, hw_binary) != NULL;
+
+	/* The serialized shader is only kept if it made it into the table. */
+	if (hw_binary && !inserted)
+		FREE(hw_binary);
+	return inserted;
+}
+
+/* Fill "shader" from the cache entry keyed by tgsi_binary, if there is one. */
+static bool si_shader_cache_load_shader(struct si_screen *sscreen,
+					void *tgsi_binary,
+				        struct si_shader *shader)
+{
+	struct hash_entry *hit;
+
+	hit = _mesa_hash_table_search(sscreen->shader_cache, tgsi_binary);
+	/* A miss is reported the same way as a failed load: false. */
+	return hit ? si_load_shader_binary(shader, hit->data) : false;
+}
+
+static uint32_t si_shader_cache_key_hash(const void *key)
+{
+	const uint32_t *size_dword = key; /* keys start with their byte size */
+	return util_hash_crc32(key, *size_dword);
+}
+
+/* Keys match when their size dwords and all the bytes they cover match. */
+static bool si_shader_cache_key_equals(const void *a, const void *b)
+{
+	const uint32_t size_a = *(const uint32_t*)a;
+	const uint32_t size_b = *(const uint32_t*)b;
+
+	/* Never compare past the end of the shorter key. */
+	if (size_a != size_b)
+		return false;
+	return !memcmp(a, b, size_a);
+}
+
+static void si_destroy_shader_cache_entry(struct hash_entry *entry)
+{
+	void *tgsi_key = (void*)entry->key; /* the key inserted with the blob */
+	FREE(tgsi_key);
+	FREE(entry->data); /* the blob made by si_get_shader_binary */
+}
+
+/* Init the cache mutex and table. False if the table can't be created. */
+bool si_init_shader_cache(struct si_screen *sscreen)
+{
+	pipe_mutex_init(sscreen->shader_cache_mutex);
+	sscreen->shader_cache = _mesa_hash_table_create(
+		NULL, si_shader_cache_key_hash, si_shader_cache_key_equals);
+
+	return sscreen->shader_cache ? true : false;
+}
+
+void si_destroy_shader_cache(struct si_screen *sscreen)
+{
+	if (sscreen->shader_cache != NULL) /* creation may have failed */
+		_mesa_hash_table_destroy(sscreen->shader_cache,
+					 si_destroy_shader_cache_entry);
+	pipe_mutex_destroy(sscreen->shader_cache_mutex);
+}
+
+/* SHADER STATES */
+
 static void si_set_tesseval_regs(struct si_shader *shader,
 				 struct si_pm4_state *pm4)
 {
@@ -108,7 +319,7 @@ static void si_shader_ls(struct si_shader *shader)
 
 	/* We need at least 2 components for LS.
 	 * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */
-	vgpr_comp_cnt = shader->uses_instanceid ? 3 : 1;
+	vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 1;
 
 	num_user_sgprs = SI_LS_NUM_USER_SGPR;
 	num_sgprs = shader->config.num_sgprs;
@@ -181,7 +392,7 @@ static void si_shader_es(struct si_shader *shader)
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 
 	if (shader->selector->type == PIPE_SHADER_VERTEX) {
-		vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0;
+		vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 0;
 		num_user_sgprs = SI_ES_NUM_USER_SGPR;
 	} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
 		vgpr_comp_cnt = 3; /* all components are needed for TES */
@@ -347,7 +558,7 @@ static void si_shader_vs(struct si_shader *shader, struct si_shader *gs)
 		vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */
 		num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR;
 	} else if (shader->selector->type == PIPE_SHADER_VERTEX) {
-		vgpr_comp_cnt = shader->uses_instanceid ? 3 : (enable_prim_id ? 2 : 0);
+		vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : (enable_prim_id ? 2 : 0);
 		num_user_sgprs = SI_VS_NUM_USER_SGPR;
 	} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
 		vgpr_comp_cnt = 3; /* all components are needed for TES */
@@ -363,19 +574,19 @@ static void si_shader_vs(struct si_shader *shader, struct si_shader *gs)
 	assert(num_sgprs <= 104);
 
 	/* VS is required to export at least one param. */
-	nparams = MAX2(shader->nr_param_exports, 1);
+	nparams = MAX2(shader->info.nr_param_exports, 1);
 	si_pm4_set_reg(pm4, R_0286C4_SPI_VS_OUT_CONFIG,
 		       S_0286C4_VS_EXPORT_COUNT(nparams - 1));
 
 	si_pm4_set_reg(pm4, R_02870C_SPI_SHADER_POS_FORMAT,
 		       S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
-		       S_02870C_POS1_EXPORT_FORMAT(shader->nr_pos_exports > 1 ?
+		       S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ?
 						   V_02870C_SPI_SHADER_4COMP :
 						   V_02870C_SPI_SHADER_NONE) |
-		       S_02870C_POS2_EXPORT_FORMAT(shader->nr_pos_exports > 2 ?
+		       S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ?
 						   V_02870C_SPI_SHADER_4COMP :
 						   V_02870C_SPI_SHADER_NONE) |
-		       S_02870C_POS3_EXPORT_FORMAT(shader->nr_pos_exports > 3 ?
+		       S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ?
 						   V_02870C_SPI_SHADER_4COMP :
 						   V_02870C_SPI_SHADER_NONE));
 
@@ -415,7 +626,7 @@ static unsigned si_get_ps_num_interp(struct si_shader *ps)
 	unsigned num_colors = !!(info->colors_read & 0x0f) +
 			      !!(info->colors_read & 0xf0);
 	unsigned num_interp = ps->selector->info.num_inputs +
-			      (ps->key.ps.color_two_side ? num_colors : 0);
+			      (ps->key.ps.prolog.color_two_side ? num_colors : 0);
 
 	assert(num_interp <= 32);
 	return MIN2(num_interp, 32);
@@ -423,7 +634,7 @@ static unsigned si_get_ps_num_interp(struct si_shader *ps)
 
 static unsigned si_get_spi_shader_col_format(struct si_shader *shader)
 {
-	unsigned value = shader->key.ps.spi_shader_col_format;
+	unsigned value = shader->key.ps.epilog.spi_shader_col_format;
 	unsigned i, num_targets = (util_last_bit(value) + 3) / 4;
 
 	/* If the i-th target format is set, all previous target formats must
@@ -528,7 +739,7 @@ static void si_shader_ps(struct si_shader *shader)
 	if (!spi_shader_col_format &&
 	    !info->writes_z && !info->writes_stencil && !info->writes_samplemask &&
 	    (shader->selector->info.uses_kill ||
-	     shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS))
+	     shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS))
 		spi_shader_col_format = V_028714_SPI_SHADER_32_R;
 
 	si_pm4_set_reg(pm4, R_0286CC_SPI_PS_INPUT_ENA, input_ena);
@@ -638,11 +849,13 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 
 	switch (sel->type) {
 	case PIPE_SHADER_VERTEX:
-		if (sctx->vertex_elements)
-			for (i = 0; i < sctx->vertex_elements->count; ++i)
-				key->vs.instance_divisors[i] =
+		if (sctx->vertex_elements) {
+			unsigned count = MIN2(sel->info.num_inputs,
+					      sctx->vertex_elements->count);
+			for (i = 0; i < count; ++i)
+				key->vs.prolog.instance_divisors[i] =
 					sctx->vertex_elements->elements[i].instance_divisor;
-
+		}
 		if (sctx->tes_shader.cso)
 			key->vs.as_ls = 1;
 		else if (sctx->gs_shader.cso)
@@ -650,17 +863,17 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 
 		if (!sctx->gs_shader.cso && sctx->ps_shader.cso &&
 		    sctx->ps_shader.cso->info.uses_primid)
-			key->vs.export_prim_id = 1;
+			key->vs.epilog.export_prim_id = 1;
 		break;
 	case PIPE_SHADER_TESS_CTRL:
-		key->tcs.prim_mode =
+		key->tcs.epilog.prim_mode =
 			sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 		break;
 	case PIPE_SHADER_TESS_EVAL:
 		if (sctx->gs_shader.cso)
 			key->tes.as_es = 1;
 		else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
-			key->tes.export_prim_id = 1;
+			key->tes.epilog.export_prim_id = 1;
 		break;
 	case PIPE_SHADER_GEOMETRY:
 		break;
@@ -670,13 +883,13 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 
 		if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
 		    sel->info.colors_written == 0x1)
-			key->ps.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
+			key->ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
 
 		if (blend) {
 			/* Select the shader color format based on whether
 			 * blending or alpha are needed.
 			 */
-			key->ps.spi_shader_col_format =
+			key->ps.epilog.spi_shader_col_format =
 				(blend->blend_enable_4bit & blend->need_src_alpha_4bit &
 				 sctx->framebuffer.spi_shader_col_format_blend_alpha) |
 				(blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
@@ -686,26 +899,26 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 				(~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
 				 sctx->framebuffer.spi_shader_col_format);
 		} else
-			key->ps.spi_shader_col_format = sctx->framebuffer.spi_shader_col_format;
+			key->ps.epilog.spi_shader_col_format = sctx->framebuffer.spi_shader_col_format;
 
 		/* If alpha-to-coverage is enabled, we have to export alpha
 		 * even if there is no color buffer.
 		 */
-		if (!(key->ps.spi_shader_col_format & 0xf) &&
+		if (!(key->ps.epilog.spi_shader_col_format & 0xf) &&
 		    blend && blend->alpha_to_coverage)
-			key->ps.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR;
+			key->ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR;
 
 		/* On SI and CIK except Hawaii, the CB doesn't clamp outputs
 		 * to the range supported by the type if a channel has less
 		 * than 16 bits and the export format is 16_ABGR.
 		 */
 		if (sctx->b.chip_class <= CIK && sctx->b.family != CHIP_HAWAII)
-			key->ps.color_is_int8 = sctx->framebuffer.color_is_int8;
+			key->ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8;
 
 		/* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */
-		if (!key->ps.last_cbuf) {
-			key->ps.spi_shader_col_format &= sel->colors_written_4bit;
-			key->ps.color_is_int8 &= sel->info.colors_written;
+		if (!key->ps.epilog.last_cbuf) {
+			key->ps.epilog.spi_shader_col_format &= sel->colors_written_4bit;
+			key->ps.epilog.color_is_int8 &= sel->info.colors_written;
 		}
 
 		if (rs) {
@@ -714,31 +927,32 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 				       sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES_ADJACENCY;
 			bool is_line = !is_poly && sctx->current_rast_prim != PIPE_PRIM_POINTS;
 
-			key->ps.color_two_side = rs->two_side && sel->info.colors_read;
+			key->ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
 
 			if (sctx->queued.named.blend) {
-				key->ps.alpha_to_one = sctx->queued.named.blend->alpha_to_one &&
-						       rs->multisample_enable &&
-						       !sctx->framebuffer.cb0_is_integer;
+				key->ps.epilog.alpha_to_one = sctx->queued.named.blend->alpha_to_one &&
+							      rs->multisample_enable &&
+							      !sctx->framebuffer.cb0_is_integer;
 			}
 
-			key->ps.poly_stipple = rs->poly_stipple_enable && is_poly;
-			key->ps.poly_line_smoothing = ((is_poly && rs->poly_smooth) ||
-						       (is_line && rs->line_smooth)) &&
-						      sctx->framebuffer.nr_samples <= 1;
-			key->ps.clamp_color = rs->clamp_fragment_color;
-
-			key->ps.force_persample_interp = rs->force_persample_interp &&
-							 rs->multisample_enable &&
-							 sctx->framebuffer.nr_samples > 1 &&
-							 sctx->ps_iter_samples > 1 &&
-							 (sel->info.uses_persp_center ||
-							  sel->info.uses_persp_centroid ||
-							  sel->info.uses_linear_center ||
-							  sel->info.uses_linear_centroid);
+			key->ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
+			key->ps.epilog.poly_line_smoothing = ((is_poly && rs->poly_smooth) ||
+							      (is_line && rs->line_smooth)) &&
+							     sctx->framebuffer.nr_samples <= 1;
+			key->ps.epilog.clamp_color = rs->clamp_fragment_color;
+
+			key->ps.prolog.force_persample_interp =
+				rs->force_persample_interp &&
+				rs->multisample_enable &&
+				sctx->framebuffer.nr_samples > 1 &&
+				sctx->ps_iter_samples > 1 &&
+				(sel->info.uses_persp_center ||
+				 sel->info.uses_persp_centroid ||
+				 sel->info.uses_linear_center ||
+				 sel->info.uses_linear_centroid);
 		}
 
-		key->ps.alpha_func = si_get_alpha_test_func(sctx);
+		key->ps.epilog.alpha_func = si_get_alpha_test_func(sctx);
 		break;
 	}
 	default:
@@ -821,6 +1035,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 				       const struct pipe_shader_state *state)
 {
 	struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+	struct si_context *sctx = (struct si_context*)ctx;
 	struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector);
 	int i;
 
@@ -900,6 +1115,13 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 		for (i = 0; i < 8; i++)
 			if (sel->info.colors_written & (1 << i))
 				sel->colors_written_4bit |= 0xf << (4 * i);
+
+		for (i = 0; i < sel->info.num_inputs; i++) {
+			if (sel->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR) {
+				int index = sel->info.input_semantic_index[i];
+				sel->color_attr_index[index] = i;
+			}
+		}
 		break;
 	}
 
@@ -921,6 +1143,44 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 		break;
 	}
 
+	/* Compile the main shader part for use with a prolog and/or epilog. */
+	if (sel->type != PIPE_SHADER_GEOMETRY &&
+	    !sscreen->use_monolithic_shaders) {
+		struct si_shader *shader = CALLOC_STRUCT(si_shader);
+		void *tgsi_binary;
+
+		if (!shader)
+			goto error;
+
+		shader->selector = sel;
+
+		tgsi_binary = si_get_tgsi_binary(sel);
+
+		/* Try to load the shader from the shader cache. */
+		pipe_mutex_lock(sscreen->shader_cache_mutex);
+
+		if (tgsi_binary &&
+		    si_shader_cache_load_shader(sscreen, tgsi_binary, shader)) {
+			FREE(tgsi_binary);
+		} else {
+			/* Compile the shader if it hasn't been loaded from the cache. */
+			if (si_compile_tgsi_shader(sscreen, sctx->tm, shader, false,
+						   &sctx->b.debug) != 0) {
+				FREE(shader);
+				FREE(tgsi_binary);
+				pipe_mutex_unlock(sscreen->shader_cache_mutex);
+				goto error;
+			}
+
+			if (tgsi_binary &&
+			    !si_shader_cache_insert_shader(sscreen, tgsi_binary, shader))
+				FREE(tgsi_binary);
+		}
+		pipe_mutex_unlock(sscreen->shader_cache_mutex);
+
+		sel->main_shader_part = shader;
+	}
+
 	/* Pre-compilation. */
 	if (sel->type == PIPE_SHADER_GEOMETRY ||
 	    sscreen->b.debug_flags & DBG_PRECOMPILE) {
@@ -934,27 +1194,29 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 		 */
 		switch (sel->type) {
 		case PIPE_SHADER_TESS_CTRL:
-			key.tcs.prim_mode = PIPE_PRIM_TRIANGLES;
+			key.tcs.epilog.prim_mode = PIPE_PRIM_TRIANGLES;
 			break;
 		case PIPE_SHADER_FRAGMENT:
-			key.ps.alpha_func = PIPE_FUNC_ALWAYS;
+			key.ps.epilog.alpha_func = PIPE_FUNC_ALWAYS;
 			for (i = 0; i < 8; i++)
 				if (sel->info.colors_written & (1 << i))
-					key.ps.spi_shader_col_format |=
+					key.ps.epilog.spi_shader_col_format |=
 						V_028710_SPI_SHADER_FP16_ABGR << (i * 4);
 			break;
 		}
 
-		if (si_shader_select_with_key(ctx, &state, &key)) {
-			fprintf(stderr, "radeonsi: can't create a shader\n");
-			tgsi_free_tokens(sel->tokens);
-			FREE(sel);
-			return NULL;
-		}
+		if (si_shader_select_with_key(ctx, &state, &key))
+			goto error;
 	}
 
 	pipe_mutex_init(sel->mutex);
 	return sel;
+
+error:
+	fprintf(stderr, "radeonsi: can't create a shader\n");
+	tgsi_free_tokens(sel->tokens);
+	FREE(sel);
+	return NULL;
 }
 
 /**
@@ -1119,6 +1381,9 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 		p = c;
 	}
 
+	if (sel->main_shader_part)
+		si_delete_shader(sctx, sel->main_shader_part);
+
 	pipe_mutex_destroy(sel->mutex);
 	free(sel->tokens);
 	free(sel);
@@ -1144,14 +1409,14 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx,
 	for (j = 0; j < vsinfo->num_outputs; j++) {
 		if (name == vsinfo->output_semantic_name[j] &&
 		    index == vsinfo->output_semantic_index[j]) {
-			ps_input_cntl |= S_028644_OFFSET(vs->vs_output_param_offset[j]);
+			ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[j]);
 			break;
 		}
 	}
 
 	if (name == TGSI_SEMANTIC_PRIMID)
 		/* PrimID is written after the last output. */
-		ps_input_cntl |= S_028644_OFFSET(vs->vs_output_param_offset[vsinfo->num_outputs]);
+		ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]);
 	else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
 		/* No corresponding output found, load defaults into input.
 		 * Don't set any other bits.
@@ -1191,7 +1456,7 @@ static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
 		}
 	}
 
-	if (ps->key.ps.color_two_side) {
+	if (ps->key.ps.prolog.color_two_side) {
 		unsigned bcol = TGSI_SEMANTIC_BCOLOR;
 
 		for (i = 0; i < 2; i++) {
@@ -1745,8 +2010,8 @@ bool si_update_shaders(struct si_context *sctx)
 			si_mark_atom_dirty(sctx, &sctx->db_render_state);
 		}
 
-		if (sctx->smoothing_enabled != sctx->ps_shader.current->key.ps.poly_line_smoothing) {
-			sctx->smoothing_enabled = sctx->ps_shader.current->key.ps.poly_line_smoothing;
+		if (sctx->smoothing_enabled != sctx->ps_shader.current->key.ps.epilog.poly_line_smoothing) {
+			sctx->smoothing_enabled = sctx->ps_shader.current->key.ps.epilog.poly_line_smoothing;
 			si_mark_atom_dirty(sctx, &sctx->msaa_config);
 
 			if (sctx->b.chip_class == SI)