aco: restrict scheduling depending on max_waves

Previously, we allowed the scheduler to reduce max_waves to as low as 5 for all shaders. Restricting this for shaders with low register demand increases the total number of waves, while the VMEM def-use distances hardly change. This patch also changes the maximum number of move operations per MEM instruction.

Reviewed-by: Rhys Perry <[email protected]>
author: Daniel Schürmann <[email protected]> 2019-08-29 17:17:32 +0200
committer: Rhys Perry <[email protected]> 2019-10-30 16:12:10 +0000
commit: 703ce617ca4045a9e4d3e05b8e6ed607d89fd338 (patch)
tree: 05ec910ae663d0ac3864c5363b5297a160f01b6d /src/amd/compiler
parent: beca63c6c07f7263a56a3517ba0fec6fe6335325 (diff)
1 files changed, 15 insertions, 9 deletions
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index 1601545dcfe..f1b4472a942 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -32,8 +32,8 @@
 #define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
 #define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
 #define POS_EXP_WINDOW_SIZE 512
-#define SMEM_MAX_MOVES (80 - ctx.num_waves * 8)
-#define VMEM_MAX_MOVES (128 - ctx.num_waves * 4)
+#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
+#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
 #define POS_EXP_MAX_MOVES 512
 
 namespace aco {
@@ -802,13 +802,19 @@ void schedule_program(Program *program, live& live_vars)
    /* Allowing the scheduler to reduce the number of waves to as low as 5
     * improves performance of Thrones of Britannia significantly and doesn't
     * seem to hurt anything else. */
-   //TODO: maybe use some sort of heuristic instead
-   //TODO: this also increases window-size/max-moves? did I realize that at the time?
-   ctx.num_waves = std::min<uint16_t>(program->num_waves, 5);
-   assert(ctx.num_waves);
-   uint16_t total_sgpr_regs = program->physical_sgprs;
-   uint16_t max_addressible_sgpr = program->sgpr_limit;
-   ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), std::min<int16_t>(((total_sgpr_regs / ctx.num_waves) & ~program->sgpr_alloc_granule) - 2, max_addressible_sgpr)};
+   if (program->num_waves <= 5)
+      ctx.num_waves = program->num_waves;
+   else if (program->max_reg_demand.vgpr >= 32)
+      ctx.num_waves = 5;
+   else if (program->max_reg_demand.vgpr >= 28)
+      ctx.num_waves = 6;
+   else if (program->max_reg_demand.vgpr >= 24)
+      ctx.num_waves = 7;
+   else
+      ctx.num_waves = 8;
+
+   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
+   ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};
 
    for (Block& block : program->blocks)
       schedule_block(ctx, program, &block, live_vars);
author	Daniel Schürmann <[email protected]>	2019-08-29 17:17:32 +0200
committer	Rhys Perry <[email protected]>	2019-10-30 16:12:10 +0000
commit	703ce617ca4045a9e4d3e05b8e6ed607d89fd338 (patch)
tree	05ec910ae663d0ac3864c5363b5297a160f01b6d /src/amd/compiler
parent	beca63c6c07f7263a56a3517ba0fec6fe6335325 (diff)