From bbefb15e01e1c16af69646898918982ae00f8c92 Mon Sep 17 00:00:00 2001
From: Kristian Høgsberg <krh@bitplanet.net>
Date: Mon, 7 Jul 2014 23:32:35 -0700
Subject: i965: Extend compute-to-mrf pass to understand blocks of MOVs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current compute-to-mrf pass doesn't handle blocks of MOVs.  Shaders
that end with a texture fetch followed by an fb write are left like this:

0x00000000: pln(8)          g6<1>F          g4<0,1,0>F      g2<8,8,1>F      { align1 WE_normal 1Q compacted };
0x00000008: pln(8)          g7<1>F          g4.4<0,1,0>F    g2<8,8,1>F      { align1 WE_normal 1Q compacted };
0x00000010: send(8)         g2<1>UW         g6<8,8,1>F
                            sampler (1, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
0x00000020: mov(8)          g113<1>F        g2<8,8,1>F                      { align1 WE_normal 1Q compacted };
0x00000028: mov(8)          g114<1>F        g3<8,8,1>F                      { align1 WE_normal 1Q compacted };
0x00000030: mov(8)          g115<1>F        g4<8,8,1>F                      { align1 WE_normal 1Q compacted };
0x00000038: mov(8)          g116<1>F        g5<8,8,1>F                      { align1 WE_normal 1Q compacted };
0x00000040: sendc(8)        null            g113<8,8,1>F
                            render ( RT write, 0, 4, 12) mlen 4 rlen 0      { align1 WE_normal 1Q EOT };

This patch lets compute-to-mrf recognize blocks of MOVs and match them to
instructions (typically SEND) that write multiple registers.  With this,
the above shader becomes:

0x00000000: pln(8)          g6<1>F          g4<0,1,0>F      g2<8,8,1>F      { align1 WE_normal 1Q compacted };
0x00000008: pln(8)          g7<1>F          g4.4<0,1,0>F    g2<8,8,1>F      { align1 WE_normal 1Q compacted };
0x00000010: send(8)         g113<1>UW       g6<8,8,1>F
                            sampler (1, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
0x00000020: sendc(8)        null            g113<8,8,1>F
                            render ( RT write, 0, 20, 12) mlen 4 rlen 0     { align1 WE_normal 1Q EOT };

which is the bulk of the shader db results:

total instructions in shared programs: 987040 -> 986720 (-0.03%)
instructions in affected programs:     844 -> 524 (-37.91%)
GAINED:                                0
LOST:                                  0

The optimization also applies to MRT shaders that write the same
color value to multiple RTs, in which case we can eliminate four MOVs in
a similar fashion.  See fbo-drawbuffers2-blend in piglit for an example.

No measurable performance impact.  No piglit regressions.

Signed-off-by: Kristian Høgsberg <krh@bitplanet.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 63 ++++++++++++++++++++++++++++++------
 1 file changed, 53 insertions(+), 10 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index ccd9ac1434a..a3ad3756fd5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2039,7 +2039,8 @@ bool
 fs_visitor::compute_to_mrf()
 {
    bool progress = false;
-   int next_ip = 0;
+   int next_ip = 0, block_size = 0, step = dispatch_width / 8;
+   fs_inst *block_start = NULL, *block_end = NULL;
 
    calculate_live_intervals();
 
@@ -2053,8 +2054,27 @@ fs_visitor::compute_to_mrf()
 	  inst->dst.type != inst->src[0].type ||
 	  inst->src[0].abs || inst->src[0].negate ||
           !inst->src[0].is_contiguous() ||
-          inst->src[0].subreg_offset)
+          inst->src[0].subreg_offset) {
+         block_start = NULL;
 	 continue;
+      }
+
+      /* We're trying to identify a block of GRF-to-MRF MOVs for the purpose
+       * of rewriting the send that assigned the GRFs to just return in the
+       * MRFs directly.  send can't saturate, so if any of the MOVs do that,
+       * cancel the block.
+       */
+      if (inst->saturate) {
+         block_start = NULL;
+      } else if (block_start && inst->dst.reg == block_end->dst.reg + step &&
+                 inst->src[0].reg == block_end->src[0].reg &&
+                 inst->src[0].reg_offset == block_end->src[0].reg_offset + 1) {
+         block_size++;
+         block_end = inst;
+      } else if (inst->src[0].reg_offset == 0) {
+         block_size = 1;
+         block_start = block_end = inst;
+      }
 
       /* Work out which hardware MRF registers are written by this
        * instruction.
@@ -2097,14 +2117,8 @@ fs_visitor::compute_to_mrf()
 	    if (scan_inst->is_partial_write())
 	       break;
 
-            /* Things returning more than one register would need us to
-             * understand coalescing out more than one MOV at a time.
-             */
-            if (scan_inst->regs_written > 1)
-               break;
-
-	    /* SEND instructions can't have MRF as a destination. */
-	    if (scan_inst->mlen)
+	    /* SEND instructions can't have MRF as a destination before Gen7. */
+	    if (brw->gen < 7 && scan_inst->mlen)
 	       break;
 
 	    if (brw->gen == 6) {
@@ -2116,6 +2130,35 @@ fs_visitor::compute_to_mrf()
 	       }
 	    }
 
+            /* We have a contiguous block of mov to MRF that aligns with the
+             * return registers of a send instruction.  Modify the send
+             * instruction to just return in the MRFs.
+             */
+            if (scan_inst->mlen > 0 &&
+                scan_inst->regs_written == block_size && block_size > 1) {
+               int i = 0;
+
+               scan_inst->dst.file = MRF;
+               scan_inst->dst.reg = block_start->dst.reg;
+               assert(!block_start->saturate);
+
+               for (fs_inst *next, *mov = block_start;
+                    i < block_size;
+                    mov = next, i++) {
+                  next = (fs_inst *) mov->next;
+                  mov->remove();
+               }
+
+               progress = true;
+               break;
+            }
+
+            /* If the block size we've tracked doesn't match the regs_written
+             * of the instruction, we can't do anything.
+             */
+            if (scan_inst->regs_written > 1)
+               break;
+
 	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
 	       /* Found the creator of our MRF's source value. */
 	       scan_inst->dst.file = MRF;
-- 
cgit v1.2.3