summary | refs | log | tree | commit | diff | stats
path: root/src/amd
diff options
context:
space:
mode:
author: Timur Kristóf <[email protected]> 2019-10-24 17:34:37 +0200
committer: Rhys Perry <[email protected]> 2019-10-28 23:52:50 +0000
commit: c52ebbcea4f63e2da68de56c3839f6a72e816f46 (patch)
tree: 336c94bd2f215bb838bd6494641c2d63402d771f /src/amd
parent: d59f702e268004fd43a0b781f39671be66728d46 (diff)
aco: Introduce vgpr_limit to keep track of available VGPRs.
Signed-off-by: Timur Kristóf <[email protected]>
Reviewed-by: Daniel Schürmann <[email protected]>
Diffstat (limited to 'src/amd')
-rw-r--r--  src/amd/compiler/aco_instruction_selection.cpp        | 5
-rw-r--r--  src/amd/compiler/aco_instruction_selection_setup.cpp  | 2
-rw-r--r--  src/amd/compiler/aco_ir.h                             | 2
-rw-r--r--  src/amd/compiler/aco_live_var_analysis.cpp            | 2
-rw-r--r--  src/amd/compiler/aco_lower_to_hw_instr.cpp            | 3
-rw-r--r--  src/amd/compiler/aco_register_allocation.cpp          | 3
6 files changed, 12 insertions, 5 deletions
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 768860a2c9b..7ae6fb2d9a1 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -162,6 +162,11 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data
* properly support subgroup shuffle like older generations (or wave32 mode), so we
* emulate it here.
*/
+ if (!ctx->has_gfx10_wave64_bpermute) {
+ ctx->has_gfx10_wave64_bpermute = true;
+ ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
+ ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
+ }
Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u));
lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1), lane_id);
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 0104fd36f49..b65628c8521 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -79,6 +79,7 @@ struct isel_context {
std::unique_ptr<Temp[]> allocated;
std::unordered_map<unsigned, std::array<Temp,4>> allocated_vec;
Stage stage; /* Stage */
+ bool has_gfx10_wave64_bpermute = false;
struct {
bool has_branch;
uint16_t loop_nest_depth = 0;
@@ -1255,6 +1256,7 @@ setup_isel_context(Program* program,
program->lds_alloc_granule = options->chip_class >= GFX7 ? 512 : 256;
program->lds_limit = options->chip_class >= GFX7 ? 65536 : 32768;
+ program->vgpr_limit = 256;
if (options->chip_class >= GFX10) {
program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 58d67ef293b..29aefef26cf 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1080,6 +1080,8 @@ public:
uint16_t lds_alloc_granule;
uint32_t lds_limit; /* in bytes */
+ uint16_t vgpr_limit;
+
uint16_t physical_sgprs;
uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
uint16_t sgpr_limit;
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index 4d689db7070..a4a2e5c49bb 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -244,7 +244,7 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
const int16_t vgpr_alloc = std::max<int16_t>(4, (new_demand.vgpr + 3) & ~3);
/* this won't compile, register pressure reduction necessary */
- if (new_demand.vgpr > 256 || new_demand.sgpr > program->sgpr_limit) {
+ if (new_demand.vgpr > program->vgpr_limit || new_demand.sgpr > program->sgpr_limit) {
program->num_waves = 0;
program->max_reg_demand = new_demand;
} else {
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 2fe865e2a90..3d01e59fef7 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -823,9 +823,6 @@ void lower_to_hw_instr(Program* program)
assert(instr->operands[2].regClass() == v1); /* Indices x4 */
assert(instr->operands[3].regClass() == v1); /* Input data */
- /* Shared VGPRs are allocated in groups of 8 */
- program->config->num_shared_vgprs = 8;
-
PhysReg shared_vgpr_reg_lo = PhysReg(align(program->config->num_vgprs, 4) + 256);
PhysReg shared_vgpr_reg_hi = PhysReg(shared_vgpr_reg_lo + 1);
Operand compare = instr->operands[0];
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index 965fe15964a..621bc1f7636 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -668,7 +668,8 @@ PhysReg get_reg(ra_ctx& ctx,
/* try using more registers */
uint16_t max_addressible_sgpr = ctx.program->sgpr_limit;
- if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < 256) {
+ uint16_t max_addressible_vgpr = ctx.program->vgpr_limit;
+ if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
return get_reg(ctx, reg_file, rc, parallelcopies, instr);
} else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {