7 files changed, 42 insertions, 23 deletions
diff --git a/src/amd/common/ac_binary.c b/src/amd/common/ac_binary.c
index 01cf000d9be..9c66a821c21 100644
--- a/src/amd/common/ac_binary.c
+++ b/src/amd/common/ac_binary.c
@@ -212,23 +212,28 @@ static const char *scratch_rsrc_dword1_symbol =
 
 void ac_shader_binary_read_config(struct ac_shader_binary *binary,
 				  struct ac_shader_config *conf,
-				  unsigned symbol_offset)
+				  unsigned symbol_offset,
+				  bool supports_spill)
 {
 	unsigned i;
 	const unsigned char *config =
 		ac_shader_binary_config_start(binary, symbol_offset);
 	bool really_needs_scratch = false;
-
+	uint32_t wavesize = 0;
 	/* LLVM adds SGPR spills to the scratch size.
 	 * Find out if we really need the scratch buffer.
 	 */
-	for (i = 0; i < binary->reloc_count; i++) {
-		const struct ac_shader_reloc *reloc = &binary->relocs[i];
+	if (supports_spill) {
+		really_needs_scratch = true;
+	} else {
+		for (i = 0; i < binary->reloc_count; i++) {
+			const struct ac_shader_reloc *reloc = &binary->relocs[i];
 
-		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
-		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
-			really_needs_scratch = true;
-			break;
+			if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
+			    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
+				really_needs_scratch = true;
+				break;
+			}
 		}
 	}
 
@@ -259,9 +264,7 @@ void ac_shader_binary_read_config(struct ac_shader_binary *binary,
 		case R_0286E8_SPI_TMPRING_SIZE:
 		case R_00B860_COMPUTE_TMPRING_SIZE:
 			/* WAVESIZE is in units of 256 dwords. */
-			if (really_needs_scratch)
-				conf->scratch_bytes_per_wave =
-					G_00B860_WAVESIZE(value) * 256 * 4;
+			wavesize = value;
 			break;
 		case SPILLED_SGPRS:
 			conf->spilled_sgprs = value;
@@ -285,4 +288,9 @@ void ac_shader_binary_read_config(struct ac_shader_binary *binary,
 		if (!conf->spi_ps_input_addr)
 			conf->spi_ps_input_addr = conf->spi_ps_input_ena;
 	}
+
+	if (really_needs_scratch) {
+		/* SGPR spills don't count as real scratch usage. */
+	        conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(wavesize) * 256 * 4;
+	}
 }
diff --git a/src/amd/common/ac_binary.h b/src/amd/common/ac_binary.h
index 282f33d22b9..06fd855f948 100644
--- a/src/amd/common/ac_binary.h
+++ b/src/amd/common/ac_binary.h
@@ -27,6 +27,7 @@
 #pragma once
 
 #include <stdint.h>
+#include <stdbool.h>
 
 struct ac_shader_reloc {
 	char name[32];
@@ -85,4 +86,5 @@ void ac_elf_read(const char *elf_data, unsigned elf_size,
 
 void ac_shader_binary_read_config(struct ac_shader_binary *binary,
 				  struct ac_shader_config *conf,
-				  unsigned symbol_offset);
+				  unsigned symbol_offset,
+				  bool supports_spill);
diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c
index 7317db76baa..f3cab921ba1 100644
--- a/src/amd/common/ac_llvm_util.c
+++ b/src/amd/common/ac_llvm_util.c
@@ -126,11 +126,11 @@ static const char *ac_get_llvm_processor_name(enum radeon_family family)
 	}
 }
 
-LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family)
+LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, bool supports_spill)
 {
 	assert(family >= CHIP_TAHITI);
 
-	const char *triple = "amdgcn--";
+	const char *triple = supports_spill ? "amdgcn-mesa-mesa3d" : "amdgcn--";
 	LLVMTargetRef target = ac_get_llvm_target(triple);
 	LLVMTargetMachineRef tm = LLVMCreateTargetMachine(
 	                             target,
diff --git a/src/amd/common/ac_llvm_util.h b/src/amd/common/ac_llvm_util.h
index 2d301c93575..c07f67ab8b1 100644
--- a/src/amd/common/ac_llvm_util.h
+++ b/src/amd/common/ac_llvm_util.h
@@ -56,7 +56,7 @@ struct ac_llvm_context {
 	LLVMValueRef fpmath_md_2p5_ulp;
 };
 
-LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family);
+LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, bool supports_spill);
 
 void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes);
 bool ac_is_sgpr_param(LLVMValueRef param);
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index e83c7a2e488..dedea656892 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -458,10 +458,10 @@ static void create_function(struct nir_to_llvm_context *ctx)
 	    arg_idx, array_params_mask, sgpr_count, ctx->options->unsafe_math);
 	set_llvm_calling_convention(ctx->main_function, ctx->stage);
 
-
 	ctx->shader_info->num_input_sgprs = 0;
 	ctx->shader_info->num_input_vgprs = 0;
 
+	ctx->shader_info->num_user_sgprs = ctx->options->supports_spill ? 2 : 0;
 	for (i = 0; i < user_sgpr_count; i++)
 		ctx->shader_info->num_user_sgprs += llvm_get_type_size(arg_types[i]) / 4;
 
@@ -475,6 +475,12 @@ static void create_function(struct nir_to_llvm_context *ctx)
 
 	arg_idx = 0;
 	user_sgpr_idx = 0;
+
+	if (ctx->options->supports_spill) {
+		set_userdata_location_shader(ctx, AC_UD_SCRATCH, user_sgpr_idx, 2);
+		user_sgpr_idx += 2;
+	}
+
 	for (unsigned i = 0; i < num_sets; ++i) {
 		if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
 			set_userdata_location(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], user_sgpr_idx, 2);
@@ -4432,7 +4438,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
 
 	memset(shader_info, 0, sizeof(*shader_info));
 
-	LLVMSetTarget(ctx.module, "amdgcn--");
+	LLVMSetTarget(ctx.module, options->supports_spill ? "amdgcn-mesa-mesa3d" : "amdgcn--");
 	setup_types(&ctx);
 
 	ctx.builder = LLVMCreateBuilderInContext(ctx.context);
@@ -4566,7 +4572,7 @@ static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
 				   struct ac_shader_config *config,
 				   struct ac_shader_variant_info *shader_info,
 				   gl_shader_stage stage,
-				   bool dump_shader)
+				   bool dump_shader, bool supports_spill)
 {
 	if (dump_shader)
 		ac_dump_module(llvm_module);
@@ -4580,7 +4586,7 @@ static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
 	if (dump_shader)
 		fprintf(stderr, "disasm:\n%s\n", binary->disasm_string);
 
-	ac_shader_binary_read_config(binary, config, 0);
+	ac_shader_binary_read_config(binary, config, 0, supports_spill);
 
 	LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
 	LLVMDisposeModule(llvm_module);
@@ -4640,7 +4646,7 @@ void ac_compile_nir_shader(LLVMTargetMachineRef tm,
 	LLVMModuleRef llvm_module = ac_translate_nir_to_llvm(tm, nir, shader_info,
 	                                                     options);
 
-	ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info, nir->stage, dump_shader);
+	ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info, nir->stage, dump_shader, options->supports_spill);
 	switch (nir->stage) {
 	case MESA_SHADER_COMPUTE:
 		for (int i = 0; i < 3; ++i)
diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
index a57558e38ff..9d66f940b52 100644
--- a/src/amd/common/ac_nir_to_llvm.h
+++ b/src/amd/common/ac_nir_to_llvm.h
@@ -52,6 +52,7 @@ struct ac_nir_compiler_options {
 	struct radv_pipeline_layout *layout;
 	union ac_shader_variant_key key;
 	bool unsafe_math;
+	bool supports_spill;
 	enum radeon_family family;
 	enum chip_class chip_class;
 };
@@ -64,8 +65,9 @@ struct ac_userdata_info {
 };
 
 enum ac_ud_index {
-	AC_UD_PUSH_CONSTANTS = 0,
-	AC_UD_SHADER_START = 1,
+	AC_UD_SCRATCH = 0,
+	AC_UD_PUSH_CONSTANTS = 1,
+	AC_UD_SHADER_START = 2,
 	AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
 	AC_UD_VS_BASE_VERTEX_START_INSTANCE,
 	AC_UD_VS_MAX_UD,
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 360b5196551..4d88ed77f93 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -424,7 +424,8 @@ static struct radv_shader_variant *radv_shader_variant_create(struct radv_device
 	options.unsafe_math = !!(device->debug_flags & RADV_DEBUG_UNSAFE_MATH);
 	options.family = chip_family;
 	options.chip_class = device->physical_device->rad_info.chip_class;
-	tm = ac_create_target_machine(chip_family);
+	options.supports_spill = false;
+	tm = ac_create_target_machine(chip_family, false);
 	ac_compile_nir_shader(tm, &binary, &variant->config,
 			      &variant->info, shader, &options, dump);
 	LLVMDisposeTargetMachine(tm);