Diffstat (limited to 'src/gallium/drivers/radeonsi')
-rw-r--r--   src/gallium/drivers/radeonsi/si_blit.c            |   8
-rw-r--r--   src/gallium/drivers/radeonsi/si_compute.c         |   4
-rw-r--r--   src/gallium/drivers/radeonsi/si_debug.c           |   2
-rw-r--r--   src/gallium/drivers/radeonsi/si_descriptors.c     |  41
-rw-r--r--   src/gallium/drivers/radeonsi/si_pipe.c            |   2
-rw-r--r--   src/gallium/drivers/radeonsi/si_pipe.h            |   6
-rw-r--r--   src/gallium/drivers/radeonsi/si_shader.c          | 271
-rw-r--r--   src/gallium/drivers/radeonsi/si_shader.h          |   8
-rw-r--r--   src/gallium/drivers/radeonsi/si_state.c           | 290
-rw-r--r--   src/gallium/drivers/radeonsi/si_state.h           |   6
-rw-r--r--   src/gallium/drivers/radeonsi/si_state_shaders.c   | 199
-rw-r--r--   src/gallium/drivers/radeonsi/sid.h                |  20
12 files changed, 669 insertions, 188 deletions
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 75a9d56d110..a93887ec271 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -680,6 +680,14 @@ static bool do_hardware_msaa_resolve(struct pipe_context *ctx, enum pipe_format format = int_to_norm_format(info->dst.format); unsigned sample_mask = ~0; + /* Hardware MSAA resolve doesn't work if SPI format = NORM16_ABGR and + * the format is R16G16. Use R16A16, which does work. + */ + if (format == PIPE_FORMAT_R16G16_UNORM) + format = PIPE_FORMAT_R16A16_UNORM; + if (format == PIPE_FORMAT_R16G16_SNORM) + format = PIPE_FORMAT_R16A16_SNORM; + if (info->src.resource->nr_samples > 1 && info->dst.resource->nr_samples <= 1 && util_max_layer(info->src.resource, 0) == 0 && diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 5a08cbfb198..6ef6eeec178 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -61,7 +61,7 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog /* Compute the scratch buffer size using the maximum number of waves. * This way we don't need to recompute it for each kernel launch. */ - unsigned scratch_waves = 32 * sctx->screen->b.info.max_compute_units; + unsigned scratch_waves = 32 * sctx->screen->b.info.num_good_compute_units; for (i = 0; i < program->shader.binary.global_symbol_count; i++) { unsigned offset = program->shader.binary.global_symbol_offsets[i]; @@ -402,7 +402,7 @@ static void si_launch_grid( num_waves_for_scratch = MIN2(num_waves_for_scratch, - 32 * sctx->screen->b.info.max_compute_units); + 32 * sctx->screen->b.info.num_good_compute_units); si_pm4_set_reg(pm4, R_00B860_COMPUTE_TMPRING_SIZE, /* The maximum value for WAVES is 32 * num CU. 
* If you program this value incorrectly, the GPU will hang if diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index a07b1c56579..e16ebbdef3e 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -771,7 +771,7 @@ void si_check_vm_faults(struct si_context *sctx) if (!si_vm_fault_occured(sctx, &addr)) return; - f = dd_get_debug_file(); + f = dd_get_debug_file(false); if (!f) return; diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index d157a9ffb00..6c796731a18 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -138,6 +138,22 @@ static void si_release_sampler_views(struct si_sampler_views *views) si_release_descriptors(&views->desc); } +static void si_sampler_view_add_buffers(struct si_context *sctx, + struct si_sampler_view *rview) +{ + if (rview->resource) { + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, + rview->resource, RADEON_USAGE_READ, + r600_get_sampler_view_priority(rview->resource)); + } + + if (rview->dcc_buffer && rview->dcc_buffer != rview->resource) { + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, + rview->dcc_buffer, RADEON_USAGE_READ, + RADEON_PRIO_DCC); + } +} + static void si_sampler_views_begin_new_cs(struct si_context *sctx, struct si_sampler_views *views) { @@ -149,12 +165,7 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx, struct si_sampler_view *rview = (struct si_sampler_view*)views->views[i]; - if (!rview->resource) - continue; - - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - rview->resource, RADEON_USAGE_READ, - r600_get_sampler_view_priority(rview->resource)); + si_sampler_view_add_buffers(sctx, rview); } if (!views->desc.buffer) @@ -176,15 +187,7 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader, struct si_sampler_view *rview = (struct si_sampler_view*)view; - if (rview->resource) - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - rview->resource, RADEON_USAGE_READ, - r600_get_sampler_view_priority(rview->resource)); - - if (rview->dcc_buffer && rview->dcc_buffer != rview->resource) - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - rview->dcc_buffer, RADEON_USAGE_READ, - RADEON_PRIO_DCC); + si_sampler_view_add_buffers(sctx, rview); pipe_sampler_view_reference(&views->views[slot], view); memcpy(views->desc.list + slot*8, view_desc, 8*4); @@ -978,9 +981,11 @@ void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom) si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, vs_base, true); si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, vs_base, true); - /* The TESSEVAL shader needs this for the ESGS ring buffer. */ - si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, - R_00B330_SPI_SHADER_USER_DATA_ES_0, true); + if (sctx->tes_shader.cso) { + /* The TESSEVAL shader needs this for the ESGS ring buffer. */ + si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, + R_00B330_SPI_SHADER_USER_DATA_ES_0, true); + } } else if (sctx->tes_shader.cso) { /* The TESSEVAL shader needs this for streamout. 
*/ si_emit_shader_pointer(sctx, &sctx->rw_buffers[PIPE_SHADER_VERTEX].desc, diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 3e20c3b81fa..0c1ae90f9da 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -208,7 +208,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, * this for non-cs shaders. Using the wrong value here can result in * GPU lockups, but the maximum value seems to always work. */ - sctx->scratch_waves = 32 * sscreen->b.info.max_compute_units; + sctx->scratch_waves = 32 * sscreen->b.info.num_good_compute_units; #if HAVE_LLVM >= 0x0306 /* Initialize LLVM TargetMachine */ diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index f83cb024f0e..e2725fe3679 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -125,7 +125,11 @@ struct si_framebuffer { unsigned log_samples; unsigned cb0_is_integer; unsigned compressed_cb_mask; - unsigned export_16bpc; + unsigned spi_shader_col_format; + unsigned spi_shader_col_format_alpha; + unsigned spi_shader_col_format_blend; + unsigned spi_shader_col_format_blend_alpha; + unsigned color_is_int8; /* bitmask */ unsigned dirty_cbufs; bool dirty_zsbuf; }; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 2de7def8dd2..94c1129c88d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -68,6 +68,7 @@ struct si_shader_context struct si_shader *shader; struct si_screen *screen; unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */ + bool is_gs_copy_shader; int param_streamout_config; int param_streamout_write_index; int param_streamout_offset[4]; @@ -1119,9 +1120,20 @@ static void declare_system_value( value = get_sample_id(radeon_bld); break; - case TGSI_SEMANTIC_SAMPLEPOS: - value = load_sample_position(radeon_bld, get_sample_id(radeon_bld)); + case TGSI_SEMANTIC_SAMPLEPOS: { + LLVMValueRef pos[4] = { + LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT), + LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT), + lp_build_const_float(gallivm, 0), + lp_build_const_float(gallivm, 0) + }; + pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, + TGSI_OPCODE_FRC, pos[0]); + pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, + TGSI_OPCODE_FRC, pos[1]); + value = lp_build_gather_values(gallivm, pos, 4); break; + } case TGSI_SEMANTIC_SAMPLEMASK: /* Smoothing isn't MSAA in GL, but it's MSAA in hardware. @@ -1255,6 +1267,28 @@ static LLVMValueRef fetch_constant( return result; } +/* Upper 16 bits must be zero. */ +static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm, + LLVMValueRef val[2]) +{ + return LLVMBuildOr(gallivm->builder, val[0], + LLVMBuildShl(gallivm->builder, val[1], + lp_build_const_int32(gallivm, 16), + ""), ""); +} + +/* Upper 16 bits are ignored and will be dropped. 
*/ +static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm, + LLVMValueRef val[2]) +{ + LLVMValueRef v[2] = { + LLVMBuildAnd(gallivm->builder, val[0], + lp_build_const_int32(gallivm, 0xffff), ""), + val[1], + }; + return si_llvm_pack_two_int16(gallivm, v); +} + /* Initialize arguments for the shader export intrinsic */ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, LLVMValueRef *values, @@ -1265,16 +1299,15 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, struct lp_build_context *uint = &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld; struct lp_build_context *base = &bld_base->base; - unsigned compressed = 0; + struct gallivm_state *gallivm = base->gallivm; + LLVMBuilderRef builder = base->gallivm->builder; + LLVMValueRef val[4]; + unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR; unsigned chan; + bool is_int8; - /* XXX: This controls which components of the output - * registers actually get exported. (e.g bit 0 means export - * X component, bit 1 means export Y component, etc.) I'm - * hard coding this to 0xf for now. In the future, we might - * want to do something else. - */ - args[0] = lp_build_const_int32(base->gallivm, 0xf); + /* Default is 0xf. Adjusted below depending on the format. */ + args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */ /* Specify whether the EXEC mask represents the valid mask */ args[1] = uint->zero; @@ -1286,17 +1319,47 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, args[3] = lp_build_const_int32(base->gallivm, target); if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) { + const union si_shader_key *key = &si_shader_ctx->shader->key; + unsigned col_formats = key->ps.spi_shader_col_format; int cbuf = target - V_008DFC_SQ_EXP_MRT; - if (cbuf >= 0 && cbuf < 8) - compressed = (si_shader_ctx->shader->key.ps.export_16bpc >> cbuf) & 0x1; + assert(cbuf >= 0 && cbuf < 8); + spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; + is_int8 = (key->ps.color_is_int8 >> cbuf) & 0x1; } - /* Set COMPR flag */ - args[4] = compressed ? 
uint->one : uint->zero; + args[4] = uint->zero; /* COMPR flag */ + args[5] = base->undef; + args[6] = base->undef; + args[7] = base->undef; + args[8] = base->undef; + + switch (spi_shader_col_format) { + case V_028714_SPI_SHADER_ZERO: + args[0] = uint->zero; /* writemask */ + args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL); + break; + + case V_028714_SPI_SHADER_32_R: + args[0] = uint->one; /* writemask */ + args[5] = values[0]; + break; + + case V_028714_SPI_SHADER_32_GR: + args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */ + args[5] = values[0]; + args[6] = values[1]; + break; + + case V_028714_SPI_SHADER_32_AR: + args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */ + args[5] = values[0]; + args[8] = values[3]; + break; + + case V_028714_SPI_SHADER_FP16_ABGR: + args[4] = uint->one; /* COMPR flag */ - if (compressed) { - /* Pixel shader needs to pack output values before export */ for (chan = 0; chan < 2; chan++) { LLVMValueRef pack_args[2] = { values[2 * chan], @@ -1306,18 +1369,107 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, packed = lp_build_intrinsic(base->gallivm->builder, "llvm.SI.packf16", - LLVMInt32TypeInContext(base->gallivm->context), - pack_args, 2, + uint->elem_type, pack_args, 2, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); args[chan + 5] = LLVMBuildBitCast(base->gallivm->builder, - packed, - LLVMFloatTypeInContext(base->gallivm->context), - ""); - args[chan + 7] = base->undef; + packed, base->elem_type, ""); } - } else + break; + + case V_028714_SPI_SHADER_UNORM16_ABGR: + for (chan = 0; chan < 4; chan++) { + val[chan] = radeon_llvm_saturate(bld_base, values[chan]); + val[chan] = LLVMBuildFMul(builder, val[chan], + lp_build_const_float(gallivm, 65535), ""); + val[chan] = LLVMBuildFAdd(builder, val[chan], + lp_build_const_float(gallivm, 0.5), ""); + val[chan] = LLVMBuildFPToUI(builder, val[chan], + uint->elem_type, ""); + } + + args[4] = uint->one; /* COMPR flag */ + args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int16(gallivm, val)); + args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int16(gallivm, val+2)); + break; + + case V_028714_SPI_SHADER_SNORM16_ABGR: + for (chan = 0; chan < 4; chan++) { + /* Clamp between [-1, 1]. */ + val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN, + values[chan], + lp_build_const_float(gallivm, 1)); + val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX, + val[chan], + lp_build_const_float(gallivm, -1)); + /* Convert to a signed integer in [-32767, 32767]. */ + val[chan] = LLVMBuildFMul(builder, val[chan], + lp_build_const_float(gallivm, 32767), ""); + /* If positive, add 0.5, else add -0.5. */ + val[chan] = LLVMBuildFAdd(builder, val[chan], + LLVMBuildSelect(builder, + LLVMBuildFCmp(builder, LLVMRealOGE, + val[chan], base->zero, ""), + lp_build_const_float(gallivm, 0.5), + lp_build_const_float(gallivm, -0.5), ""), ""); + val[chan] = LLVMBuildFPToSI(builder, val[chan], uint->elem_type, ""); + } + + args[4] = uint->one; /* COMPR flag */ + args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int32_as_int16(gallivm, val)); + args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int32_as_int16(gallivm, val+2)); + break; + + case V_028714_SPI_SHADER_UINT16_ABGR: { + LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ? + 255 : 65535); + /* Clamp. 
*/ + for (chan = 0; chan < 4; chan++) { + val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]); + val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN, + val[chan], max); + } + + args[4] = uint->one; /* COMPR flag */ + args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int16(gallivm, val)); + args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int16(gallivm, val+2)); + break; + } + + case V_028714_SPI_SHADER_SINT16_ABGR: { + LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ? + 127 : 32767); + LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ? + -128 : -32768); + /* Clamp. */ + for (chan = 0; chan < 4; chan++) { + val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]); + val[chan] = lp_build_emit_llvm_binary(bld_base, + TGSI_OPCODE_IMIN, + val[chan], max); + val[chan] = lp_build_emit_llvm_binary(bld_base, + TGSI_OPCODE_IMAX, + val[chan], min); + } + + args[4] = uint->one; /* COMPR flag */ + args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int32_as_int16(gallivm, val)); + args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int32_as_int16(gallivm, val+2)); + break; + } + + case V_028714_SPI_SHADER_32_ABGR: memcpy(&args[5], values, sizeof(values[0]) * 4); + break; + } } static void si_alpha_test(struct lp_build_tgsi_context *bld_base, @@ -2000,6 +2152,8 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base) struct si_shader_output_values *outputs = NULL; int i,j; + assert(!si_shader_ctx->is_gs_copy_shader); + outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); /* Vertex color clamping. @@ -2008,8 +2162,7 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base) * an IF statement is added that clamps all colors if the constant * is true. */ - if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && - !si_shader_ctx->shader->is_gs_copy_shader) { + if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX) { struct lp_build_if_state if_ctx; LLVMValueRef cond = NULL; LLVMValueRef addr, val; @@ -3312,7 +3465,9 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, { struct gallivm_state *gallivm = bld_base->base.gallivm; - lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.barrier.local", + lp_build_intrinsic(gallivm->builder, + HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier" + : "llvm.AMDGPU.barrier.local", LLVMVoidTypeInContext(gallivm->context), NULL, 0, LLVMNoUnwindAttribute); } @@ -3403,7 +3558,7 @@ static void create_function(struct si_shader_context *si_shader_ctx) params[SI_PARAM_LS_OUT_LAYOUT] = i32; num_params = SI_PARAM_LS_OUT_LAYOUT+1; } else { - if (shader->is_gs_copy_shader) { + if (si_shader_ctx->is_gs_copy_shader) { last_array_pointer = SI_PARAM_CONST_BUFFERS; num_params = SI_PARAM_CONST_BUFFERS+1; } else { @@ -3676,7 +3831,7 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx) build_indexed_load_const(si_shader_ctx, buf_ptr, offset); } - if (si_shader_ctx->shader->is_gs_copy_shader) { + if (si_shader_ctx->is_gs_copy_shader) { LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS); si_shader_ctx->gsvs_ring[0] = @@ -3850,22 +4005,65 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader_config *conf, + unsigned num_inputs, unsigned code_size, struct pipe_debug_callback *debug, unsigned processor) { + unsigned lds_increment = sscreen->b.chip_class >= CIK ? 
512 : 256; + unsigned lds_per_wave = 0; + unsigned max_simd_waves = 10; + + /* Compute LDS usage for PS. */ + if (processor == TGSI_PROCESSOR_FRAGMENT) { + /* The minimum usage per wave is (num_inputs * 36). The maximum + * usage is (num_inputs * 36 * 16). + * We can get anything in between and it varies between waves. + * + * Other stages don't know the size at compile time or don't + * allocate LDS per wave, but instead they do it per thread group. + */ + lds_per_wave = conf->lds_size * lds_increment + + align(num_inputs * 36, lds_increment); + } + + /* Compute the per-SIMD wave counts. */ + if (conf->num_sgprs) { + if (sscreen->b.chip_class >= VI) + max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs); + else + max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs); + } + + if (conf->num_vgprs) + max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs); + + /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD + * that PS can use. + */ + if (lds_per_wave) + max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave); + if (r600_can_dump_shader(&sscreen->b, processor)) { fprintf(stderr, "*** SHADER STATS ***\n" - "SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n" - "Scratch: %d bytes per wave\n********************\n", + "SGPRS: %d\n" + "VGPRS: %d\n" + "Code Size: %d bytes\n" + "LDS: %d blocks\n" + "Scratch: %d bytes per wave\n" + "Max Waves: %d\n" + "********************\n", conf->num_sgprs, conf->num_vgprs, code_size, - conf->lds_size, conf->scratch_bytes_per_wave); + conf->lds_size, conf->scratch_bytes_per_wave, + max_simd_waves); } pipe_debug_message(debug, SHADER_INFO, - "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d", + "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " + "LDS: %d Scratch: %d Max Waves: %d", conf->num_sgprs, conf->num_vgprs, code_size, - conf->lds_size, conf->scratch_bytes_per_wave); + conf->lds_size, conf->scratch_bytes_per_wave, + max_simd_waves); } void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, @@ -3876,6 +4074,7 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, si_shader_dump_disassembly(&shader->binary, debug); si_shader_dump_stats(sscreen, &shader->config, + shader->selector->info.num_inputs, shader->binary.code_size, debug, processor); } @@ -3924,7 +4123,6 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base; struct lp_build_context *base = &bld_base->base; struct lp_build_context *uint = &bld_base->uint_bld; - struct si_shader *shader = si_shader_ctx->shader; struct si_shader_output_values *outputs; struct tgsi_shader_info *gsinfo = &gs->selector->info; LLVMValueRef args[9]; @@ -3933,7 +4131,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0])); si_shader_ctx->type = TGSI_PROCESSOR_VERTEX; - shader->is_gs_copy_shader = true; + si_shader_ctx->is_gs_copy_shader = true; radeon_llvm_context_init(&si_shader_ctx->radeon_bld); @@ -4031,7 +4229,7 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) break; case PIPE_SHADER_FRAGMENT: - fprintf(f, " export_16bpc = 0x%X\n", key->ps.export_16bpc); + fprintf(f, " spi_shader_col_format = 0x%x\n", key->ps.spi_shader_col_format); fprintf(f, " last_cbuf = %u\n", key->ps.last_cbuf); fprintf(f, " color_two_side = %u\n", key->ps.color_two_side); fprintf(f, " alpha_func = %u\n", key->ps.alpha_func); @@ -4208,7 +4406,6 @@ 
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) { shader->gs_copy_shader = CALLOC_STRUCT(si_shader); shader->gs_copy_shader->selector = shader->selector; - shader->gs_copy_shader->key = shader->key; si_shader_ctx.shader = shader->gs_copy_shader; if ((r = si_generate_gs_copy_shader(sscreen, &si_shader_ctx, shader, dump, debug))) { diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 1635358d505..c1512078a18 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -213,6 +213,10 @@ struct si_shader_selector { /* PS parameters. */ unsigned db_shader_control; + /* Set 0xf or 0x0 (4 bits) per each written output. + * ANDed with spi_shader_col_format. + */ + unsigned colors_written_4bit; /* masks of "get_unique_index" bits */ uint64_t outputs_written; @@ -232,7 +236,8 @@ struct si_shader_selector { union si_shader_key { struct { - unsigned export_16bpc:8; + unsigned spi_shader_col_format; + unsigned color_is_int8:8; unsigned last_cbuf:3; unsigned color_two_side:1; unsigned alpha_func:3; @@ -292,7 +297,6 @@ struct si_shader { bool uses_instanceid; unsigned nr_pos_exports; unsigned nr_param_exports; - bool is_gs_copy_shader; bool dx10_clamp_mode; /* convert NaNs to 0 */ }; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 2a6d2c6ff36..9e0ccfc5dde 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -403,6 +403,7 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, if (!blend) return NULL; + blend->alpha_to_coverage = state->alpha_to_coverage; blend->alpha_to_one = state->alpha_to_one; blend->dual_src_blend = util_blend_state_is_dual(state, 0); @@ -419,6 +420,9 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, S_028B70_ALPHA_TO_MASK_OFFSET2(2) | S_028B70_ALPHA_TO_MASK_OFFSET3(2)); + if (state->alpha_to_coverage) + blend->need_src_alpha_4bit |= 0xf; + blend->cb_target_mask = 0; for (int i = 0; i < 8; i++) { /* state->rt entries > 0 only written if independent blending */ @@ -433,6 +437,9 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, unsigned blend_cntl = 0; + if (!state->rt[j].colormask) + continue; + /* we pretend 8 buffer are used, CB_SHADER_MASK will disable unused one */ blend->cb_target_mask |= state->rt[j].colormask << (4 * i); @@ -453,6 +460,17 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA)); } si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); + + blend->blend_enable_4bit |= 0xf << (i * 4); + + /* This is only important for formats without alpha. 
*/ + if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA || + dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || + srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || + dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || + srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA || + dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA) + blend->need_src_alpha_4bit |= 0xf << (i * 4); } if (blend->cb_target_mask) { @@ -1266,53 +1284,6 @@ static uint32_t si_colorformat_endian_swap(uint32_t colorformat) } } -/* Returns the size in bits of the widest component of a CB format */ -static unsigned si_colorformat_max_comp_size(uint32_t colorformat) -{ - switch(colorformat) { - case V_028C70_COLOR_4_4_4_4: - return 4; - - case V_028C70_COLOR_1_5_5_5: - case V_028C70_COLOR_5_5_5_1: - return 5; - - case V_028C70_COLOR_5_6_5: - return 6; - - case V_028C70_COLOR_8: - case V_028C70_COLOR_8_8: - case V_028C70_COLOR_8_8_8_8: - return 8; - - case V_028C70_COLOR_10_10_10_2: - case V_028C70_COLOR_2_10_10_10: - return 10; - - case V_028C70_COLOR_10_11_11: - case V_028C70_COLOR_11_11_10: - return 11; - - case V_028C70_COLOR_16: - case V_028C70_COLOR_16_16: - case V_028C70_COLOR_16_16_16_16: - return 16; - - case V_028C70_COLOR_8_24: - case V_028C70_COLOR_24_8: - return 24; - - case V_028C70_COLOR_32: - case V_028C70_COLOR_32_32: - case V_028C70_COLOR_32_32_32_32: - case V_028C70_COLOR_X24_8_32_FLOAT: - return 32; - } - - assert(!"Unknown maximum component size"); - return 0; -} - static uint32_t si_translate_dbformat(enum pipe_format format) { switch (format) { @@ -1405,6 +1376,30 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, } } + if (desc->layout == UTIL_FORMAT_LAYOUT_ETC && + sscreen->b.family >= CHIP_STONEY) { + switch (format) { + case PIPE_FORMAT_ETC1_RGB8: + case PIPE_FORMAT_ETC2_RGB8: + case PIPE_FORMAT_ETC2_SRGB8: + return V_008F14_IMG_DATA_FORMAT_ETC2_RGB; + case PIPE_FORMAT_ETC2_RGB8A1: + case PIPE_FORMAT_ETC2_SRGB8A1: + return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1; + case PIPE_FORMAT_ETC2_RGBA8: + case PIPE_FORMAT_ETC2_SRGBA8: + return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA; + case PIPE_FORMAT_ETC2_R11_UNORM: + case PIPE_FORMAT_ETC2_R11_SNORM: + return V_008F14_IMG_DATA_FORMAT_ETC2_R; + case PIPE_FORMAT_ETC2_RG11_UNORM: + case PIPE_FORMAT_ETC2_RG11_SNORM: + return V_008F14_IMG_DATA_FORMAT_ETC2_RG; + default: + goto out_unknown; + } + } + if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) { if (!enable_compressed_formats) goto out_unknown; @@ -1880,6 +1875,123 @@ unsigned si_tile_mode_index(struct r600_texture *rtex, unsigned level, bool sten * framebuffer handling */ +static void si_choose_spi_color_formats(struct r600_surface *surf, + unsigned format, unsigned swap, + unsigned ntype, bool is_depth) +{ + /* Alpha is needed for alpha-to-coverage. + * Blending may be with or without alpha. + */ + unsigned normal = 0; /* most optimal, may not support blending or export alpha */ + unsigned alpha = 0; /* exports alpha, but may not support blending */ + unsigned blend = 0; /* supports blending, but may not export alpha */ + unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */ + + /* Choose the SPI color formats. These are required values for Stoney/RB+. + * Other chips have multiple choices, though they are not necessarily better. 
+ */ + switch (format) { + case V_028C70_COLOR_5_6_5: + case V_028C70_COLOR_1_5_5_5: + case V_028C70_COLOR_5_5_5_1: + case V_028C70_COLOR_4_4_4_4: + case V_028C70_COLOR_10_11_11: + case V_028C70_COLOR_11_11_10: + case V_028C70_COLOR_8: + case V_028C70_COLOR_8_8: + case V_028C70_COLOR_8_8_8_8: + case V_028C70_COLOR_10_10_10_2: + case V_028C70_COLOR_2_10_10_10: + if (ntype == V_028C70_NUMBER_UINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; + else if (ntype == V_028C70_NUMBER_SINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; + else + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; + break; + + case V_028C70_COLOR_16: + case V_028C70_COLOR_16_16: + case V_028C70_COLOR_16_16_16_16: + if (ntype == V_028C70_NUMBER_UNORM || + ntype == V_028C70_NUMBER_SNORM) { + /* UNORM16 and SNORM16 don't support blending */ + if (ntype == V_028C70_NUMBER_UNORM) + normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR; + else + normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR; + + /* Use 32 bits per channel for blending. */ + if (format == V_028C70_COLOR_16) { + if (swap == V_028C70_SWAP_STD) { /* R */ + blend = V_028714_SPI_SHADER_32_R; + blend_alpha = V_028714_SPI_SHADER_32_AR; + } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ + blend = blend_alpha = V_028714_SPI_SHADER_32_AR; + else + assert(0); + } else if (format == V_028C70_COLOR_16_16) { + if (swap == V_028C70_SWAP_STD) { /* RG */ + blend = V_028714_SPI_SHADER_32_GR; + blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (swap == V_028C70_SWAP_ALT) /* RA */ + blend = blend_alpha = V_028714_SPI_SHADER_32_AR; + else + assert(0); + } else /* 16_16_16_16 */ + blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (ntype == V_028C70_NUMBER_UINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; + else if (ntype == V_028C70_NUMBER_SINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; + else if (ntype == V_028C70_NUMBER_FLOAT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; + else + assert(0); + break; + + case V_028C70_COLOR_32: + if (swap == V_028C70_SWAP_STD) { /* R */ + blend = normal = V_028714_SPI_SHADER_32_R; + alpha = blend_alpha = V_028714_SPI_SHADER_32_AR; + } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; + else + assert(0); + break; + + case V_028C70_COLOR_32_32: + if (swap == V_028C70_SWAP_STD) { /* RG */ + blend = normal = V_028714_SPI_SHADER_32_GR; + alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (swap == V_028C70_SWAP_ALT) /* RA */ + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; + else + assert(0); + break; + + case V_028C70_COLOR_32_32_32_32: + case V_028C70_COLOR_8_24: + case V_028C70_COLOR_24_8: + case V_028C70_COLOR_X24_8_32_FLOAT: + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; + break; + + default: + assert(0); + return; + } + + /* The DB->CB copy needs 32_ABGR. 
*/ + if (is_depth) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; + + surf->spi_shader_col_format = normal; + surf->spi_shader_col_format_alpha = alpha; + surf->spi_shader_col_format_blend = blend; + surf->spi_shader_col_format_blend_alpha = blend_alpha; +} + static void si_initialize_color_surface(struct si_context *sctx, struct r600_surface *surf) { @@ -1893,7 +2005,6 @@ static void si_initialize_color_surface(struct si_context *sctx, const struct util_format_description *desc; int i; unsigned blend_clamp = 0, blend_bypass = 0; - unsigned max_comp_size; /* Layered rendering doesn't work with LINEAR_GENERAL. * (LINEAR_ALIGNED and others work) */ @@ -1971,6 +2082,12 @@ static void si_initialize_color_surface(struct si_context *sctx, blend_bypass = 1; } + if ((ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT) && + (format == V_028C70_COLOR_8 || + format == V_028C70_COLOR_8_8 || + format == V_028C70_COLOR_8_8_8_8)) + surf->color_is_int8 = true; + color_info = S_028C70_FORMAT(format) | S_028C70_COMP_SWAP(swap) | S_028C70_BLEND_CLAMP(blend_clamp) | @@ -2050,13 +2167,7 @@ static void si_initialize_color_surface(struct si_context *sctx, } /* Determine pixel shader export format */ - max_comp_size = si_colorformat_max_comp_size(format); - if (ntype == V_028C70_NUMBER_SRGB || - ((ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) && - max_comp_size <= 10) || - (ntype == V_028C70_NUMBER_FLOAT && max_comp_size <= 16)) { - surf->export_16bpc = true; - } + si_choose_spi_color_formats(surf, format, swap, ntype, rtex->is_depth); if (sctx->b.family == CHIP_STONEY && !(sctx->screen->b.debug_flags & DBG_NO_RB_PLUS)) { @@ -2283,7 +2394,12 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, util_copy_framebuffer_state(&sctx->framebuffer.state, state); - sctx->framebuffer.export_16bpc = 0; + sctx->framebuffer.spi_shader_col_format = 0; + sctx->framebuffer.spi_shader_col_format_alpha = 0; + sctx->framebuffer.spi_shader_col_format_blend = 0; + sctx->framebuffer.spi_shader_col_format_blend_alpha = 0; + sctx->framebuffer.color_is_int8 = 0; + sctx->framebuffer.compressed_cb_mask = 0; sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state); sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples); @@ -2304,22 +2420,35 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_initialize_color_surface(sctx, surf); } - if (surf->export_16bpc) { - sctx->framebuffer.export_16bpc |= 1 << i; - } + sctx->framebuffer.spi_shader_col_format |= + surf->spi_shader_col_format << (i * 4); + sctx->framebuffer.spi_shader_col_format_alpha |= + surf->spi_shader_col_format_alpha << (i * 4); + sctx->framebuffer.spi_shader_col_format_blend |= + surf->spi_shader_col_format_blend << (i * 4); + sctx->framebuffer.spi_shader_col_format_blend_alpha |= + surf->spi_shader_col_format_blend_alpha << (i * 4); + + if (surf->color_is_int8) + sctx->framebuffer.color_is_int8 |= 1 << i; if (rtex->fmask.size && rtex->cmask.size) { sctx->framebuffer.compressed_cb_mask |= 1 << i; } r600_context_add_resource_size(ctx, surf->base.texture); } - /* Set the 16BPC export for possible dual-src blending. */ - if (i == 1 && surf && surf->export_16bpc) { - sctx->framebuffer.export_16bpc |= 1 << 1; + /* Set the second SPI format for possible dual-src blending. 
*/ + if (i == 1 && surf) { + sctx->framebuffer.spi_shader_col_format |= + surf->spi_shader_col_format << (i * 4); + sctx->framebuffer.spi_shader_col_format_alpha |= + surf->spi_shader_col_format_alpha << (i * 4); + sctx->framebuffer.spi_shader_col_format_blend |= + surf->spi_shader_col_format_blend << (i * 4); + sctx->framebuffer.spi_shader_col_format_blend_alpha |= + surf->spi_shader_col_format_blend_alpha << (i * 4); } - assert(!(sctx->framebuffer.export_16bpc & ~0xff)); - if (state->zsbuf) { surf = (struct r600_surface*)state->zsbuf; @@ -2703,12 +2832,17 @@ si_create_sampler_view_custom(struct pipe_context *ctx, case PIPE_FORMAT_DXT3_SRGBA: case PIPE_FORMAT_DXT5_SRGBA: case PIPE_FORMAT_BPTC_SRGBA: + case PIPE_FORMAT_ETC2_SRGB8: + case PIPE_FORMAT_ETC2_SRGB8A1: + case PIPE_FORMAT_ETC2_SRGBA8: num_format = V_008F14_IMG_NUM_FORMAT_SRGB; break; case PIPE_FORMAT_RGTC1_SNORM: case PIPE_FORMAT_LATC1_SNORM: case PIPE_FORMAT_RGTC2_SNORM: case PIPE_FORMAT_LATC2_SNORM: + case PIPE_FORMAT_ETC2_R11_SNORM: + case PIPE_FORMAT_ETC2_RG11_SNORM: /* implies float, so use SNORM/UNORM to determine whether data is signed or not */ case PIPE_FORMAT_BPTC_RGB_FLOAT: @@ -3596,12 +3730,32 @@ static void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); if (sctx->b.chip_class >= CIK) { - si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xfffc)); si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0); - si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xfffe)); + si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xffff)); si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff)); - si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xffff)); - si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(0)); + + if (sscreen->b.info.num_good_compute_units / + (sscreen->b.info.max_se * sscreen->b.info.max_sh_per_se) <= 4) { + /* Too few available compute units per SH. Disallowing + * VS to run on CU0 could hurt us more than late VS + * allocation would help. + * + * LATE_ALLOC_VS = 2 is the highest safe number. + */ + si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff)); + si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xffff)); + si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(2)); + } else { + /* Set LATE_ALLOC_VS == 31. It should be less than + * the number of scratch waves. Limitations: + * - VS can't execute on CU0. + * - If HS writes outputs to LDS, LS can't execute on CU0. + */ + si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xfffe)); + si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xfffe)); + si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(31)); + } + si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff)); } diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index f5ca661f8d7..be3488e6dba 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -39,8 +39,14 @@ struct si_shader; struct si_state_blend { struct si_pm4_state pm4; uint32_t cb_target_mask; + bool alpha_to_coverage; bool alpha_to_one; bool dual_src_blend; + /* Set 0xf or 0x0 (4 bits) per render target if the following is + * true. ANDed with spi_shader_col_format. 
+ */ + unsigned blend_enable_4bit; + unsigned need_src_alpha_4bit; }; struct si_state_rasterizer { diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 8ff70b44d45..36174eb5a94 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -212,13 +212,37 @@ static void si_shader_es(struct si_shader *shader) si_set_tesseval_regs(shader, pm4); } +/** + * Calculate the appropriate setting of VGT_GS_MODE when \p shader is a + * geometry shader. + */ +static uint32_t si_vgt_gs_mode(struct si_shader *shader) +{ + unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices; + unsigned cut_mode; + + if (gs_max_vert_out <= 128) { + cut_mode = V_028A40_GS_CUT_128; + } else if (gs_max_vert_out <= 256) { + cut_mode = V_028A40_GS_CUT_256; + } else if (gs_max_vert_out <= 512) { + cut_mode = V_028A40_GS_CUT_512; + } else { + assert(gs_max_vert_out <= 1024); + cut_mode = V_028A40_GS_CUT_1024; + } + + return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | + S_028A40_CUT_MODE(cut_mode)| + S_028A40_ES_WRITE_OPTIMIZE(1) | + S_028A40_GS_WRITE_OPTIMIZE(1); +} + static void si_shader_gs(struct si_shader *shader) { unsigned gs_vert_itemsize = shader->selector->gsvs_vertex_size; - unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices; unsigned gsvs_itemsize = shader->selector->max_gsvs_emit_size >> 2; unsigned gs_num_invocations = shader->selector->gs_num_invocations; - unsigned cut_mode; struct si_pm4_state *pm4; unsigned num_sgprs, num_user_sgprs; uint64_t va; @@ -232,22 +256,7 @@ static void si_shader_gs(struct si_shader *shader) if (!pm4) return; - if (gs_max_vert_out <= 128) { - cut_mode = V_028A40_GS_CUT_128; - } else if (gs_max_vert_out <= 256) { - cut_mode = V_028A40_GS_CUT_256; - } else if (gs_max_vert_out <= 512) { - cut_mode = V_028A40_GS_CUT_512; - } else { - assert(gs_max_vert_out <= 1024); - cut_mode = V_028A40_GS_CUT_1024; - } - - si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, - S_028A40_MODE(V_028A40_GS_SCENARIO_G) | - S_028A40_CUT_MODE(cut_mode)| - S_028A40_ES_WRITE_OPTIMIZE(1) | - S_028A40_GS_WRITE_OPTIMIZE(1)); + si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, si_vgt_gs_mode(shader)); si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize); si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1)); @@ -255,7 +264,7 @@ static void si_shader_gs(struct si_shader *shader) si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1)); - si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out); + si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, shader->selector->gs_max_out_vertices); si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize >> 2); si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? gs_vert_itemsize >> 2 : 0); @@ -289,7 +298,14 @@ static void si_shader_gs(struct si_shader *shader) S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); } -static void si_shader_vs(struct si_shader *shader) +/** + * Compute the state for \p shader, which will run as a vertex shader on the + * hardware. + * + * If \p gs is non-NULL, it points to the geometry shader for which this shader + * is the copy shader. 
+ */ +static void si_shader_vs(struct si_shader *shader, struct si_shader *gs) { struct si_pm4_state *pm4; unsigned num_sgprs, num_user_sgprs; @@ -304,20 +320,26 @@ static void si_shader_vs(struct si_shader *shader) if (!pm4) return; - /* If this is the GS copy shader, the GS state writes this register. - * Otherwise, the VS state writes it. + /* We always write VGT_GS_MODE in the VS state, because every switch + * between different shader pipelines involving a different GS or no + * GS at all involves a switch of the VS (different GS use different + * copy shaders). On the other hand, when the API switches from a GS to + * no GS and then back to the same GS used originally, the GS state is + * not sent again. */ - if (!shader->is_gs_copy_shader) { + if (!gs) { si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, S_028A40_MODE(enable_prim_id ? V_028A40_GS_SCENARIO_A : 0)); si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, enable_prim_id); - } else + } else { + si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, si_vgt_gs_mode(gs)); si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0); + } va = shader->bo->gpu_address; si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER); - if (shader->is_gs_copy_shader) { + if (gs) { vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */ num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR; } else if (shader->selector->type == PIPE_SHADER_VERTEX) { @@ -382,13 +404,58 @@ static void si_shader_vs(struct si_shader *shader) si_set_tesseval_regs(shader, pm4); } +static unsigned si_get_spi_shader_col_format(struct si_shader *shader) +{ + unsigned value = shader->key.ps.spi_shader_col_format; + unsigned i, num_targets = (util_last_bit(value) + 3) / 4; + + /* If the i-th target format is set, all previous target formats must + * be non-zero to avoid hangs. + */ + for (i = 0; i < num_targets; i++) + if (!(value & (0xf << (i * 4)))) + value |= V_028714_SPI_SHADER_32_R << (i * 4); + + return value; +} + +static unsigned si_get_cb_shader_mask(unsigned spi_shader_col_format) +{ + unsigned i, cb_shader_mask = 0; + + for (i = 0; i < 8; i++) { + switch ((spi_shader_col_format >> (i * 4)) & 0xf) { + case V_028714_SPI_SHADER_ZERO: + break; + case V_028714_SPI_SHADER_32_R: + cb_shader_mask |= 0x1 << (i * 4); + break; + case V_028714_SPI_SHADER_32_GR: + cb_shader_mask |= 0x3 << (i * 4); + break; + case V_028714_SPI_SHADER_32_AR: + cb_shader_mask |= 0x9 << (i * 4); + break; + case V_028714_SPI_SHADER_FP16_ABGR: + case V_028714_SPI_SHADER_UNORM16_ABGR: + case V_028714_SPI_SHADER_SNORM16_ABGR: + case V_028714_SPI_SHADER_UINT16_ABGR: + case V_028714_SPI_SHADER_SINT16_ABGR: + case V_028714_SPI_SHADER_32_ABGR: + cb_shader_mask |= 0xf << (i * 4); + break; + default: + assert(0); + } + } + return cb_shader_mask; +} + static void si_shader_ps(struct si_shader *shader) { struct tgsi_shader_info *info = &shader->selector->info; struct si_pm4_state *pm4; - unsigned i, spi_ps_in_control; - unsigned spi_shader_col_format = 0, cb_shader_mask = 0; - unsigned colors_written, export_16bpc; + unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask; unsigned num_sgprs, num_user_sgprs; unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1); uint64_t va; @@ -423,23 +490,18 @@ static void si_shader_ps(struct si_shader *shader) TGSI_FS_COORD_PIXEL_CENTER_INTEGER) spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1); - /* Find out what SPI_SHADER_COL_FORMAT and CB_SHADER_MASK should be. 
*/ - colors_written = info->colors_written; - export_16bpc = shader->key.ps.export_16bpc; + spi_shader_col_format = si_get_spi_shader_col_format(shader); + cb_shader_mask = si_get_cb_shader_mask(spi_shader_col_format); - if (info->colors_written == 0x1 && - info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) { - colors_written |= (1 << (shader->key.ps.last_cbuf + 1)) - 1; - } - - while (colors_written) { - i = u_bit_scan(&colors_written); - if (export_16bpc & (1 << i)) - spi_shader_col_format |= V_028714_SPI_SHADER_FP16_ABGR << (4 * i); - else - spi_shader_col_format |= V_028714_SPI_SHADER_32_ABGR << (4 * i); - cb_shader_mask |= 0xf << (4 * i); - } + /* This must be non-zero for alpha-test/kill to work. + * The hardware ignores the EXEC mask if no export memory is allocated. + * Don't add this to CB_SHADER_MASK. + */ + if (!spi_shader_col_format && + !info->writes_z && !info->writes_stencil && !info->writes_samplemask && + (shader->selector->info.uses_kill || + shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)) + spi_shader_col_format = V_028714_SPI_SHADER_32_R; /* Set interpolation controls. */ has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena) || @@ -498,7 +560,7 @@ static void si_shader_init_pm4_state(struct si_shader *shader) else if (shader->key.vs.as_es) si_shader_es(shader); else - si_shader_vs(shader); + si_shader_vs(shader, NULL); break; case PIPE_SHADER_TESS_CTRL: si_shader_hs(shader); @@ -507,11 +569,11 @@ static void si_shader_init_pm4_state(struct si_shader *shader) if (shader->key.tes.as_es) si_shader_es(shader); else - si_shader_vs(shader); + si_shader_vs(shader, NULL); break; case PIPE_SHADER_GEOMETRY: si_shader_gs(shader); - si_shader_vs(shader->gs_copy_shader); + si_shader_vs(shader->gs_copy_shader, shader); break; case PIPE_SHADER_FRAGMENT: si_shader_ps(shader); @@ -571,12 +633,47 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, break; case PIPE_SHADER_FRAGMENT: { struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_state_blend *blend = sctx->queued.named.blend; if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && sel->info.colors_written == 0x1) key->ps.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; - key->ps.export_16bpc = sctx->framebuffer.export_16bpc; + if (blend) { + /* Select the shader color format based on whether + * blending or alpha are needed. + */ + key->ps.spi_shader_col_format = + (blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend_alpha) | + (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend) | + (~blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_alpha) | + (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format); + } else + key->ps.spi_shader_col_format = sctx->framebuffer.spi_shader_col_format; + + /* If alpha-to-coverage is enabled, we have to export alpha + * even if there is no color buffer. + */ + if (!(key->ps.spi_shader_col_format & 0xf) && + blend && blend->alpha_to_coverage) + key->ps.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; + + /* On SI and CIK except Hawaii, the CB doesn't clamp outputs + * to the range supported by the type if a channel has less + * than 16 bits and the export format is 16_ABGR. 
+ */ + if (sctx->b.chip_class <= CIK && sctx->b.family != CHIP_HAWAII) + key->ps.color_is_int8 = sctx->framebuffer.color_is_int8; + + /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ + if (!key->ps.last_cbuf) { + key->ps.spi_shader_col_format &= sel->colors_written_4bit; + key->ps.color_is_int8 &= sel->info.colors_written; + } if (rs) { bool is_poly = (sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES && @@ -762,6 +859,12 @@ static void *si_create_shader_selector(struct pipe_context *ctx, } sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16; break; + + case PIPE_SHADER_FRAGMENT: + for (i = 0; i < 8; i++) + if (sel->info.colors_written & (1 << i)) + sel->colors_written_4bit |= 0xf << (4 * i); + break; } /* DB_SHADER_CONTROL */ diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h index 573ab78b482..9e1e158219f 100644 --- a/src/gallium/drivers/radeonsi/sid.h +++ b/src/gallium/drivers/radeonsi/sid.h @@ -2062,12 +2062,12 @@ #define V_008F14_IMG_DATA_FORMAT_8_24 0x14 #define V_008F14_IMG_DATA_FORMAT_24_8 0x15 #define V_008F14_IMG_DATA_FORMAT_X24_8_32 0x16 -#define V_008F14_IMG_DATA_FORMAT_RESERVED_23 0x17 -#define V_008F14_IMG_DATA_FORMAT_RESERVED_24 0x18 -#define V_008F14_IMG_DATA_FORMAT_RESERVED_25 0x19 -#define V_008F14_IMG_DATA_FORMAT_RESERVED_26 0x1A -#define V_008F14_IMG_DATA_FORMAT_RESERVED_27 0x1B -#define V_008F14_IMG_DATA_FORMAT_RESERVED_28 0x1C +#define V_008F14_IMG_DATA_FORMAT_8_AS_8_8_8_8 0x17 /* stoney+ */ +#define V_008F14_IMG_DATA_FORMAT_ETC2_RGB 0x18 /* stoney+ */ +#define V_008F14_IMG_DATA_FORMAT_ETC2_RGBA 0x19 /* stoney+ */ +#define V_008F14_IMG_DATA_FORMAT_ETC2_R 0x1A /* stoney+ */ +#define V_008F14_IMG_DATA_FORMAT_ETC2_RG 0x1B /* stoney+ */ +#define V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1 0x1C /* stoney+ */ #define V_008F14_IMG_DATA_FORMAT_RESERVED_29 0x1D #define V_008F14_IMG_DATA_FORMAT_RESERVED_30 0x1E #define V_008F14_IMG_DATA_FORMAT_RESERVED_31 0x1F @@ -2081,8 +2081,8 @@ #define V_008F14_IMG_DATA_FORMAT_BC5 0x27 #define V_008F14_IMG_DATA_FORMAT_BC6 0x28 #define V_008F14_IMG_DATA_FORMAT_BC7 0x29 -#define V_008F14_IMG_DATA_FORMAT_RESERVED_42 0x2A -#define V_008F14_IMG_DATA_FORMAT_RESERVED_43 0x2B +#define V_008F14_IMG_DATA_FORMAT_16_AS_16_16_16_16 0x2A /* stoney+ */ +#define V_008F14_IMG_DATA_FORMAT_16_AS_32_32_32_32 0x2B /* stoney+ */ #define V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1 0x2C #define V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1 0x2D #define V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1 0x2E @@ -2100,8 +2100,8 @@ #define V_008F14_IMG_DATA_FORMAT_6_5_5 0x3A #define V_008F14_IMG_DATA_FORMAT_1 0x3B #define V_008F14_IMG_DATA_FORMAT_1_REVERSED 0x3C -#define V_008F14_IMG_DATA_FORMAT_32_AS_8 0x3D -#define V_008F14_IMG_DATA_FORMAT_32_AS_8_8 0x3E +#define V_008F14_IMG_DATA_FORMAT_32_AS_8 0x3D /* not on stoney */ +#define V_008F14_IMG_DATA_FORMAT_32_AS_8_8 0x3E /* not on stoney */ #define V_008F14_IMG_DATA_FORMAT_32_AS_32_32_32_32 0x3F #define S_008F14_NUM_FORMAT(x) (((x) & 0x0F) << 26) #define G_008F14_NUM_FORMAT(x) (((x) >> 26) & 0x0F) |
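The new FP16/UNORM16/SNORM16/UINT16/SINT16 export paths in si_llvm_init_export_args() all funnel through the two packing helpers added to si_shader.c. The following is a host-side C sketch of the same arithmetic for the UNORM16_ABGR case; the function names are illustrative only, since the driver emits these operations as LLVM IR rather than C.

#include <stdint.h>

/* Equivalent of si_llvm_pack_two_int16(): both inputs must already
 * have their upper 16 bits cleared.
 */
static uint32_t pack_two_int16(uint32_t lo, uint32_t hi)
{
        return lo | (hi << 16);
}

/* float in [0, 1] -> UNORM16, mirroring the saturate/FMul/FAdd/FPToUI
 * sequence in the V_028714_SPI_SHADER_UNORM16_ABGR case.
 */
static uint32_t float_to_unorm16(float f)
{
        if (f < 0.0f) f = 0.0f;
        if (f > 1.0f) f = 1.0f;
        return (uint32_t)(f * 65535.0f + 0.5f);
}

/* Two channels become one 32-bit dword of the compressed export. */
static uint32_t pack_unorm16_pair(float a, float b)
{
        return pack_two_int16(float_to_unorm16(a), float_to_unorm16(b));
}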
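The SNORM16_ABGR path needs signed rounding and a masked pack, because the converted values can be negative. A corresponding host-side sketch, again with illustrative names:

#include <stdint.h>

/* float in [-1, 1] -> SNORM16: clamp, scale by 32767 and round half
 * away from zero before the float-to-int conversion, as done in the
 * V_028714_SPI_SHADER_SNORM16_ABGR case.
 */
static int32_t float_to_snorm16(float f)
{
        if (f > 1.0f)  f = 1.0f;
        if (f < -1.0f) f = -1.0f;
        f *= 32767.0f;
        f += (f >= 0.0f) ? 0.5f : -0.5f;
        return (int32_t)f;
}

/* Equivalent of si_llvm_pack_two_int32_as_int16(): the low word is
 * masked because it may be sign-extended; the upper bits of 'hi' are
 * simply shifted out.
 */
static uint32_t pack_two_int32_as_int16(int32_t lo, int32_t hi)
{
        return ((uint32_t)lo & 0xffff) | ((uint32_t)hi << 16);
}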
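si_shader_dump_stats() now also reports the wave-occupancy limit implied by SGPR, VGPR and (for pixel shaders) LDS usage. Below is a standalone restatement of that estimate using the limits from the patch (10 waves per SIMD, 512 SGPRs on SI/CIK vs. 800 on VI, 256 VGPRs, 16 KB of LDS per SIMD usable by PS); the helper name and the bool parameter are assumptions made for the sketch.

#include <stdbool.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))   /* Mesa already provides this */

static unsigned max_waves_per_simd(unsigned num_sgprs, unsigned num_vgprs,
                                   unsigned lds_per_wave /* bytes */,
                                   bool gfx8_or_later)
{
        unsigned max_simd_waves = 10;

        if (num_sgprs)
                max_simd_waves = MIN2(max_simd_waves,
                                      (gfx8_or_later ? 800 : 512) / num_sgprs);
        if (num_vgprs)
                max_simd_waves = MIN2(max_simd_waves, 256 / num_vgprs);

        /* LDS is 64 KB per CU (4 SIMDs), so a PS wave can use at most
         * 16 KB of it per SIMD.
         */
        if (lds_per_wave)
                max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

        return max_simd_waves;
}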
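The si_init_config() change replaces the fixed CU masks with a heuristic based on how many compute units each shader array has. A sketch of that decision follows; the struct and helper name are made up here and stand in for the actual si_pm4_set_reg() calls.

struct late_alloc_cfg {
        unsigned ls_cu_en;      /* SPI_SHADER_PGM_RSRC3_LS.CU_EN */
        unsigned vs_cu_en;      /* SPI_SHADER_PGM_RSRC3_VS.CU_EN */
        unsigned late_alloc_vs; /* SPI_SHADER_LATE_ALLOC_VS.LIMIT */
};

static struct late_alloc_cfg choose_late_alloc_vs(unsigned num_good_cu,
                                                  unsigned max_se,
                                                  unsigned max_sh_per_se)
{
        struct late_alloc_cfg cfg;

        if (num_good_cu / (max_se * max_sh_per_se) <= 4) {
                /* Too few CUs per SH: keeping CU0 usable for VS is worth
                 * more than a large late-alloc limit.
                 */
                cfg.ls_cu_en = 0xffff;
                cfg.vs_cu_en = 0xffff;
                cfg.late_alloc_vs = 2;
        } else {
                /* Enough CUs: keep VS (and LS, when HS writes LDS) off
                 * CU0 and allow up to 31 late-allocated VS waves.
                 */
                cfg.ls_cu_en = 0xfffe;
                cfg.vs_cu_en = 0xfffe;
                cfg.late_alloc_vs = 31;
        }
        return cfg;
}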
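On the shader-key side, every new mask uses 4 bits per render target, which is what lets si_shader_selector_key() pick between the four precomputed framebuffer format sets with pure bitwise operations. Reduced to a single render target, the selection amounts to the sketch below; the driver evaluates all eight RTs at once on the 32-bit masks.

#include <stdbool.h>

/* Pick the SPI color export format for one RT, given whether blending
 * is enabled and whether the blend factors (or alpha-to-coverage)
 * read the source alpha.
 */
static unsigned choose_rt_col_format(bool blend_enabled, bool needs_src_alpha,
                                     unsigned fmt_normal, unsigned fmt_alpha,
                                     unsigned fmt_blend, unsigned fmt_blend_alpha)
{
        if (blend_enabled)
                return needs_src_alpha ? fmt_blend_alpha : fmt_blend;
        else
                return needs_src_alpha ? fmt_alpha : fmt_normal;
}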