radv: reduce the number of loaded channels for vertex input fetches

It's unnecessary to load more channels than the vertex attribute format. The remaining channels are filled with 0 for y and z, and 1 for w. 29077 shaders in 15096 tests Totals: SGPRS: 1321605 -> 1318869 (-0.21 %) VGPRS: 935236 -> 932252 (-0.32 %) Spilled SGPRs: 24860 -> 24776 (-0.34 %) Code Size: 49832348 -> 49819464 (-0.03 %) bytes Max Waves: 242101 -> 242611 (0.21 %) Totals from affected shaders: SGPRS: 93675 -> 90939 (-2.92 %) VGPRS: 58016 -> 55032 (-5.14 %) Spilled SGPRs: 172 -> 88 (-48.84 %) Code Size: 2862740 -> 2849856 (-0.45 %) bytes Max Waves: 15474 -> 15984 (3.30 %) This mostly helps Croteam games (Talos/Sam2017). Signed-off-by: Samuel Pitoiset <[email protected]> Reviewed-by: Bas Nieuwenhuizen <[email protected]>
author: Samuel Pitoiset <[email protected]> 2019-02-12 15:09:32 +0100
committer: Samuel Pitoiset <[email protected]> 2019-02-14 09:10:56 +0100
commit: 4b3549c0846f451119ee883a33205b0caa1cf16e (patch)
tree: ab6a66296ebc72150aafb11e61fe3a0377b8ffd2 /src/amd/vulkan
parent: 210aec3612bc02a6a02035188937a4755da5a252 (diff)
1 files changed, 79 insertions, 2 deletions
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index a0ce569d409..08ab64971ab 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2029,6 +2029,72 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
 	return alpha;
 }
 
+static unsigned
+get_num_channels_from_data_format(unsigned data_format)
+{
+	switch (data_format) {
+	case V_008F0C_BUF_DATA_FORMAT_8:
+	case V_008F0C_BUF_DATA_FORMAT_16:
+	case V_008F0C_BUF_DATA_FORMAT_32:
+		return 1;
+	case V_008F0C_BUF_DATA_FORMAT_8_8:
+	case V_008F0C_BUF_DATA_FORMAT_16_16:
+	case V_008F0C_BUF_DATA_FORMAT_32_32:
+		return 2;
+	case V_008F0C_BUF_DATA_FORMAT_10_11_11:
+	case V_008F0C_BUF_DATA_FORMAT_11_11_10:
+	case V_008F0C_BUF_DATA_FORMAT_32_32_32:
+		return 3;
+	case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
+	case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
+	case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
+	case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
+	case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
+		return 4;
+	default:
+		break;
+	}
+
+	return 4;
+}
+
+static LLVMValueRef
+radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx,
+				LLVMValueRef value,
+				unsigned num_channels,
+				bool is_float)
+{
+	LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
+	LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
+	LLVMTypeRef elemtype;
+	LLVMValueRef chan[4];
+
+	if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
+		unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
+
+		if (num_channels == vec_size)
+			return value;
+
+		num_channels = MIN2(num_channels, vec_size);
+
+		for (unsigned i = 0; i < num_channels; i++)
+			chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
+
+		elemtype = LLVMGetElementType(LLVMTypeOf(value));
+	} else {
+		if (num_channels) {
+			assert(num_channels == 1);
+			chan[0] = value;
+		}
+		elemtype = LLVMTypeOf(value);
+	}
+
+	for (unsigned i = num_channels; i < 4; i++)
+		chan[i] = i == 3 ? one : zero;
+
+	return ac_build_gather_values(&ctx->ac, chan, 4);
+}
+
 static void
 handle_vs_input_decl(struct radv_shader_context *ctx,
 		     struct nir_variable *variable)
@@ -2041,7 +2107,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 	unsigned attrib_count = glsl_count_attribute_slots(variable->type, true);
 	uint8_t input_usage_mask =
 		ctx->shader_info->info.vs.input_usage_mask[variable->data.location];
-	unsigned num_channels = util_last_bit(input_usage_mask);
+	unsigned num_input_channels = util_last_bit(input_usage_mask);
 
 	variable->data.driver_location = variable->data.location * 4;
 
@@ -2049,6 +2115,10 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 	for (unsigned i = 0; i < attrib_count; ++i) {
 		LLVMValueRef output[4];
 		unsigned attrib_index = variable->data.location + i - VERT_ATTRIB_GENERIC0;
+		unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[attrib_index];
+		unsigned data_format = attrib_format & 0x0f;
+		unsigned num_format = (attrib_format >> 4) & 0x07;
+		bool is_float = num_format == V_008F0C_BUF_NUM_FORMAT_FLOAT;
 
 		if (ctx->options->key.vs.instance_rate_inputs & (1u << attrib_index)) {
 			uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[attrib_index];
@@ -2080,12 +2150,19 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 
 		t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
 
+		/* Adjust the number of channels to load based on the vertex
+		 * attribute format.
+		 */
+		unsigned num_format_channels = get_num_channels_from_data_format(data_format);
+		unsigned num_channels = MIN2(num_input_channels, num_format_channels);
+
 		input = ac_build_buffer_load_format(&ctx->ac, t_list,
 						    buffer_index,
 						    ctx->ac.i32_0,
 						    num_channels, false, true);
 
-		input = ac_build_expand_to_vec4(&ctx->ac, input, num_channels);
+		input = radv_fixup_vertex_input_fetches(ctx, input, num_channels,
+							is_float);
 
 		for (unsigned chan = 0; chan < 4; chan++) {
 			LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
author	Samuel Pitoiset <[email protected]>	2019-02-12 15:09:32 +0100
committer	Samuel Pitoiset <[email protected]>	2019-02-14 09:10:56 +0100
commit	4b3549c0846f451119ee883a33205b0caa1cf16e (patch)
tree	ab6a66296ebc72150aafb11e61fe3a0377b8ffd2 /src/amd/vulkan
parent	210aec3612bc02a6a02035188937a4755da5a252 (diff)