1 files changed, 234 insertions, 37 deletions
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 2de7def8dd2..94c1129c88d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -68,6 +68,7 @@ struct si_shader_context
 	struct si_shader *shader;
 	struct si_screen *screen;
 	unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
+	bool is_gs_copy_shader;
 	int param_streamout_config;
 	int param_streamout_write_index;
 	int param_streamout_offset[4];
@@ -1119,9 +1120,20 @@ static void declare_system_value(
 		value = get_sample_id(radeon_bld);
 		break;
 
-	case TGSI_SEMANTIC_SAMPLEPOS:
-		value = load_sample_position(radeon_bld, get_sample_id(radeon_bld));
+	case TGSI_SEMANTIC_SAMPLEPOS: {
+		LLVMValueRef pos[4] = {
+			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
+			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
+			lp_build_const_float(gallivm, 0),
+			lp_build_const_float(gallivm, 0)
+		};
+		pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
+						  TGSI_OPCODE_FRC, pos[0]);
+		pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
+						  TGSI_OPCODE_FRC, pos[1]);
+		value = lp_build_gather_values(gallivm, pos, 4);
 		break;
+	}
 
 	case TGSI_SEMANTIC_SAMPLEMASK:
 		/* Smoothing isn't MSAA in GL, but it's MSAA in hardware.
@@ -1255,6 +1267,28 @@ static LLVMValueRef fetch_constant(
 	return result;
 }
 
+/* Pack two values into one dword: val[0] | (val[1] << 16). The upper 16 bits of val[0] must be zero. */
+static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
+					   LLVMValueRef val[2])
+{
+	return LLVMBuildOr(gallivm->builder, val[0],
+			   LLVMBuildShl(gallivm->builder, val[1],
+					lp_build_const_int32(gallivm, 16),
+					""), "");
+}
+
+/* Pack the low 16 bits of val[0] and val[1] into one dword; the upper 16 bits of both inputs are dropped. */
+static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
+						    LLVMValueRef val[2])
+{
+	LLVMValueRef v[2] = {
+		LLVMBuildAnd(gallivm->builder, val[0],
+			     lp_build_const_int32(gallivm, 0xffff), ""),
+		val[1], /* not masked: si_llvm_pack_two_int16 shifts it left by 16 */
+	};
+	return si_llvm_pack_two_int16(gallivm, v);
+}
+
 /* Initialize arguments for the shader export intrinsic */
 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 				     LLVMValueRef *values,
@@ -1265,16 +1299,15 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 	struct lp_build_context *uint =
 				&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
 	struct lp_build_context *base = &bld_base->base;
-	unsigned compressed = 0;
+	struct gallivm_state *gallivm = base->gallivm;
+	LLVMBuilderRef builder = base->gallivm->builder;
+	LLVMValueRef val[4];
+	unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
 	unsigned chan;
+	bool is_int8;
 
-	/* XXX: This controls which components of the output
-	 * registers actually get exported. (e.g bit 0 means export
-	 * X component, bit 1 means export Y component, etc.)  I'm
-	 * hard coding this to 0xf for now.  In the future, we might
-	 * want to do something else.
-	 */
-	args[0] = lp_build_const_int32(base->gallivm, 0xf);
+	/* Default is 0xf. Adjusted below depending on the format. */
+	args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
 
 	/* Specify whether the EXEC mask represents the valid mask */
 	args[1] = uint->zero;
@@ -1286,17 +1319,47 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 	args[3] = lp_build_const_int32(base->gallivm, target);
 
 	if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+		const union si_shader_key *key = &si_shader_ctx->shader->key;
+		unsigned col_formats = key->ps.spi_shader_col_format;
 		int cbuf = target - V_008DFC_SQ_EXP_MRT;
 
-		if (cbuf >= 0 && cbuf < 8)
-			compressed = (si_shader_ctx->shader->key.ps.export_16bpc >> cbuf) & 0x1;
+		assert(cbuf >= 0 && cbuf < 8);
+		spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
+		is_int8 = (key->ps.color_is_int8 >> cbuf) & 0x1;
 	}
 
-	/* Set COMPR flag */
-	args[4] = compressed ? uint->one : uint->zero;
+	args[4] = uint->zero; /* COMPR flag */
+	args[5] = base->undef;
+	args[6] = base->undef;
+	args[7] = base->undef;
+	args[8] = base->undef;
+
+	switch (spi_shader_col_format) {
+	case V_028714_SPI_SHADER_ZERO:
+		args[0] = uint->zero; /* writemask */
+		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
+		break;
+
+	case V_028714_SPI_SHADER_32_R:
+		args[0] = uint->one; /* writemask */
+		args[5] = values[0];
+		break;
+
+	case V_028714_SPI_SHADER_32_GR:
+		args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
+		args[5] = values[0];
+		args[6] = values[1];
+		break;
+
+	case V_028714_SPI_SHADER_32_AR:
+		args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
+		args[5] = values[0];
+		args[8] = values[3];
+		break;
+
+	case V_028714_SPI_SHADER_FP16_ABGR:
+		args[4] = uint->one; /* COMPR flag */
 
-	if (compressed) {
-		/* Pixel shader needs to pack output values before export */
 		for (chan = 0; chan < 2; chan++) {
 			LLVMValueRef pack_args[2] = {
 				values[2 * chan],
@@ -1306,18 +1369,107 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 
 			packed = lp_build_intrinsic(base->gallivm->builder,
 						    "llvm.SI.packf16",
-						    LLVMInt32TypeInContext(base->gallivm->context),
-						    pack_args, 2,
+						    uint->elem_type, pack_args, 2,
 						    LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 			args[chan + 5] =
 				LLVMBuildBitCast(base->gallivm->builder,
-						 packed,
-						 LLVMFloatTypeInContext(base->gallivm->context),
-						 "");
-			args[chan + 7] = base->undef;
+						 packed, base->elem_type, "");
 		}
-	} else
+		break;
+
+	case V_028714_SPI_SHADER_UNORM16_ABGR:
+		for (chan = 0; chan < 4; chan++) {
+			val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
+			val[chan] = LLVMBuildFMul(builder, val[chan],
+						  lp_build_const_float(gallivm, 65535), "");
+			val[chan] = LLVMBuildFAdd(builder, val[chan],
+						  lp_build_const_float(gallivm, 0.5), "");
+			val[chan] = LLVMBuildFPToUI(builder, val[chan],
+						    uint->elem_type, "");
+		}
+
+		args[4] = uint->one; /* COMPR flag */
+		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int16(gallivm, val));
+		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int16(gallivm, val+2));
+		break;
+
+	case V_028714_SPI_SHADER_SNORM16_ABGR:
+		for (chan = 0; chan < 4; chan++) {
+			/* Clamp between [-1, 1]. */
+			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
+							      values[chan],
+							      lp_build_const_float(gallivm, 1));
+			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
+							      val[chan],
+							      lp_build_const_float(gallivm, -1));
+			/* Convert to a signed integer in [-32767, 32767]. */
+			val[chan] = LLVMBuildFMul(builder, val[chan],
+						  lp_build_const_float(gallivm, 32767), "");
+			/* If positive, add 0.5, else add -0.5. */
+			val[chan] = LLVMBuildFAdd(builder, val[chan],
+					LLVMBuildSelect(builder,
+						LLVMBuildFCmp(builder, LLVMRealOGE,
+							      val[chan], base->zero, ""),
+						lp_build_const_float(gallivm, 0.5),
+						lp_build_const_float(gallivm, -0.5), ""), "");
+			val[chan] = LLVMBuildFPToSI(builder, val[chan], uint->elem_type, "");
+		}
+
+		args[4] = uint->one; /* COMPR flag */
+		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int32_as_int16(gallivm, val));
+		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
+		break;
+
+	case V_028714_SPI_SHADER_UINT16_ABGR: {
+		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
+							255 : 65535);
+		/* Clamp. */
+		for (chan = 0; chan < 4; chan++) {
+			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
+			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
+							      val[chan], max);
+		}
+
+		args[4] = uint->one; /* COMPR flag */
+		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int16(gallivm, val));
+		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int16(gallivm, val+2));
+		break;
+	}
+
+	case V_028714_SPI_SHADER_SINT16_ABGR: {
+		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
+							127 : 32767);
+		LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
+							-128 : -32768);
+		/* Clamp. */
+		for (chan = 0; chan < 4; chan++) {
+			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
+			val[chan] = lp_build_emit_llvm_binary(bld_base,
+							      TGSI_OPCODE_IMIN,
+							      val[chan], max);
+			val[chan] = lp_build_emit_llvm_binary(bld_base,
+							      TGSI_OPCODE_IMAX,
+							      val[chan], min);
+		}
+
+		args[4] = uint->one; /* COMPR flag */
+		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int32_as_int16(gallivm, val));
+		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
+		break;
+	}
+
+	case V_028714_SPI_SHADER_32_ABGR:
 		memcpy(&args[5], values, sizeof(values[0]) * 4);
+		break;
+	}
 }
 
 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
@@ -2000,6 +2152,8 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
 	struct si_shader_output_values *outputs = NULL;
 	int i,j;
 
+	assert(!si_shader_ctx->is_gs_copy_shader);
+
 	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
 
 	/* Vertex color clamping.
@@ -2008,8 +2162,7 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
 	 * an IF statement is added that clamps all colors if the constant
 	 * is true.
 	 */
-	if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX &&
-	    !si_shader_ctx->shader->is_gs_copy_shader) {
+	if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX) {
 		struct lp_build_if_state if_ctx;
 		LLVMValueRef cond = NULL;
 		LLVMValueRef addr, val;
@@ -3312,7 +3465,9 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
 {
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 
-	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.barrier.local",
+	lp_build_intrinsic(gallivm->builder,
+			HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
+					    : "llvm.AMDGPU.barrier.local",
 			LLVMVoidTypeInContext(gallivm->context), NULL, 0,
 			LLVMNoUnwindAttribute);
 }
@@ -3403,7 +3558,7 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 			params[SI_PARAM_LS_OUT_LAYOUT] = i32;
 			num_params = SI_PARAM_LS_OUT_LAYOUT+1;
 		} else {
-			if (shader->is_gs_copy_shader) {
+			if (si_shader_ctx->is_gs_copy_shader) {
 				last_array_pointer = SI_PARAM_CONST_BUFFERS;
 				num_params = SI_PARAM_CONST_BUFFERS+1;
 			} else {
@@ -3676,7 +3831,7 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
 			build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
 	}
 
-	if (si_shader_ctx->shader->is_gs_copy_shader) {
+	if (si_shader_ctx->is_gs_copy_shader) {
 		LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS);
 
 		si_shader_ctx->gsvs_ring[0] =
@@ -3850,22 +4005,65 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
 
 static void si_shader_dump_stats(struct si_screen *sscreen,
 			         struct si_shader_config *conf,
+				 unsigned num_inputs,
 				 unsigned code_size,
 			         struct pipe_debug_callback *debug,
 			         unsigned processor)
 {
+	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256; /* multiplier applied to conf->lds_size (reported as "blocks") */
+	unsigned lds_per_wave = 0;
+	unsigned max_simd_waves = 10; /* starting value; only ever lowered by the MIN2 limits below */
+
+	/* Compute per-wave LDS usage; only done for fragment shaders. */
+	if (processor == TGSI_PROCESSOR_FRAGMENT) {
+		/* The minimum usage per wave is (num_inputs * 36). The maximum
+		 * usage is (num_inputs * 36 * 16).
+		 * We can get anything in between and it varies between waves.
+		 *
+		 * Other stages don't know the size at compile time or don't
+		 * allocate LDS per wave, but instead they do it per thread group.
+		 */
+		lds_per_wave = conf->lds_size * lds_increment +
+			       align(num_inputs * 36, lds_increment);
+	}
+
+	/* Limit the per-SIMD wave count by SGPR use (800 on VI+, else 512, divided by num_sgprs); skipped when num_sgprs is 0. */
+	if (conf->num_sgprs) {
+		if (sscreen->b.chip_class >= VI)
+			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
+		else
+			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
+	}
+
+	if (conf->num_vgprs) /* VGPR limit: 256 / num_vgprs; skipped when num_vgprs is 0 */
+		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
+
+	/* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
+	 * that PS can use.
+	 */
+	if (lds_per_wave)
+		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
+
 	if (r600_can_dump_shader(&sscreen->b, processor)) {
 		fprintf(stderr, "*** SHADER STATS ***\n"
-			"SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n"
-			"Scratch: %d bytes per wave\n********************\n",
+			"SGPRS: %d\n"
+			"VGPRS: %d\n"
+			"Code Size: %d bytes\n"
+			"LDS: %d blocks\n"
+			"Scratch: %d bytes per wave\n"
+			"Max Waves: %d\n"
+			"********************\n",
 			conf->num_sgprs, conf->num_vgprs, code_size,
-			conf->lds_size, conf->scratch_bytes_per_wave);
+			conf->lds_size, conf->scratch_bytes_per_wave,
+			max_simd_waves);
 	}
 
 	pipe_debug_message(debug, SHADER_INFO,
-			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d",
+			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
+			   "LDS: %d Scratch: %d Max Waves: %d",
 			   conf->num_sgprs, conf->num_vgprs, code_size,
-			   conf->lds_size, conf->scratch_bytes_per_wave);
+			   conf->lds_size, conf->scratch_bytes_per_wave,
+			   max_simd_waves);
 }
 
 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
@@ -3876,6 +4074,7 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
 			si_shader_dump_disassembly(&shader->binary, debug);
 
 	si_shader_dump_stats(sscreen, &shader->config,
+                            shader->selector->info.num_inputs,
 			     shader->binary.code_size, debug, processor);
 }
 
@@ -3924,7 +4123,6 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
 	struct lp_build_context *base = &bld_base->base;
 	struct lp_build_context *uint = &bld_base->uint_bld;
-	struct si_shader *shader = si_shader_ctx->shader;
 	struct si_shader_output_values *outputs;
 	struct tgsi_shader_info *gsinfo = &gs->selector->info;
 	LLVMValueRef args[9];
@@ -3933,7 +4131,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
 
 	si_shader_ctx->type = TGSI_PROCESSOR_VERTEX;
-	shader->is_gs_copy_shader = true;
+	si_shader_ctx->is_gs_copy_shader = true;
 
 	radeon_llvm_context_init(&si_shader_ctx->radeon_bld);
 
@@ -4031,7 +4229,7 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 		break;
 
 	case PIPE_SHADER_FRAGMENT:
-		fprintf(f, "  export_16bpc = 0x%X\n", key->ps.export_16bpc);
+		fprintf(f, "  spi_shader_col_format = 0x%x\n", key->ps.spi_shader_col_format);
 		fprintf(f, "  last_cbuf = %u\n", key->ps.last_cbuf);
 		fprintf(f, "  color_two_side = %u\n", key->ps.color_two_side);
 		fprintf(f, "  alpha_func = %u\n", key->ps.alpha_func);
@@ -4208,7 +4406,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
 		shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
 		shader->gs_copy_shader->selector = shader->selector;
-		shader->gs_copy_shader->key = shader->key;
 		si_shader_ctx.shader = shader->gs_copy_shader;
 		if ((r = si_generate_gs_copy_shader(sscreen, &si_shader_ctx,
 						    shader, dump, debug))) {