diff options
author | Kristian Høgsberg <[email protected]> | 2014-07-07 23:32:35 -0700 |
---|---|---|
committer | Kristian Høgsberg <[email protected]> | 2014-07-07 23:39:40 -0700 |
commit | bbefb15e01e1c16af69646898918982ae00f8c92 (patch) | |
tree | d1dcbd24ea77be6fe454f31fb710b9224e5e9d69 /src | |
parent | 8aa34dc9cb1f4b1b17e49da98e54066832afc98e (diff) |
i965: Extend compute-to-mrf pass to understand blocks of MOVs
The current compute-to-mrf pass doesn't handle blocks of MOVs. Shaders
that end with a texture fetch followed by an fb write are left like this:
0x00000000: pln(8) g6<1>F g4<0,1,0>F g2<8,8,1>F { align1 WE_normal 1Q compacted };
0x00000008: pln(8) g7<1>F g4.4<0,1,0>F g2<8,8,1>F { align1 WE_normal 1Q compacted };
0x00000010: send(8) g2<1>UW g6<8,8,1>F
sampler (1, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
0x00000020: mov(8) g113<1>F g2<8,8,1>F { align1 WE_normal 1Q compacted };
0x00000028: mov(8) g114<1>F g3<8,8,1>F { align1 WE_normal 1Q compacted };
0x00000030: mov(8) g115<1>F g4<8,8,1>F { align1 WE_normal 1Q compacted };
0x00000038: mov(8) g116<1>F g5<8,8,1>F { align1 WE_normal 1Q compacted };
0x00000040: sendc(8) null g113<8,8,1>F
render ( RT write, 0, 4, 12) mlen 4 rlen 0 { align1 WE_normal 1Q EOT };
This patch lets compute-to-mrf recognize blocks of MOVs and match them to
instructions (typically SEND) that write multiple registers. With this,
the above shader becomes:
0x00000000: pln(8) g6<1>F g4<0,1,0>F g2<8,8,1>F { align1 WE_normal 1Q compacted };
0x00000008: pln(8) g7<1>F g4.4<0,1,0>F g2<8,8,1>F { align1 WE_normal 1Q compacted };
0x00000010: send(8) g113<1>UW g6<8,8,1>F
sampler (1, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
0x00000020: sendc(8) null g113<8,8,1>F
render ( RT write, 0, 20, 12) mlen 4 rlen 0 { align1 WE_normal 1Q EOT };
which is the bulk of the shader db results:
total instructions in shared programs: 987040 -> 986720 (-0.03%)
instructions in affected programs: 844 -> 524 (-37.91%)
GAINED: 0
LOST: 0
The optimization also applies to MRT shaders that write the same
color value to multiple RTs, in which case we can eliminate four MOVs in
a similar fashion. See fbo-drawbuffers2-blend in piglit for an example.
No measurable performance impact. No piglit regressions.
Signed-off-by: Kristian Høgsberg <[email protected]>
Diffstat (limited to 'src')
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 63 |
1 files changed, 53 insertions, 10 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index ccd9ac1434a..a3ad3756fd5 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2039,7 +2039,8 @@ bool fs_visitor::compute_to_mrf() { bool progress = false; - int next_ip = 0; + int next_ip = 0, block_size = 0, step = dispatch_width / 8; + fs_inst *block_start = NULL, *block_end = NULL; calculate_live_intervals(); @@ -2053,8 +2054,27 @@ fs_visitor::compute_to_mrf() inst->dst.type != inst->src[0].type || inst->src[0].abs || inst->src[0].negate || !inst->src[0].is_contiguous() || - inst->src[0].subreg_offset) + inst->src[0].subreg_offset) { + block_start = NULL; continue; + } + + /* We're trying to identify a block of GRF-to-MRF MOVs for the purpose + * of rewriting the send that assigned the GRFs to just return in the + * MRFs directly. send can't saturate, so if any of the MOVs do that, + * cancel the block. + */ + if (inst->saturate) { + block_start = NULL; + } else if (block_start && inst->dst.reg == block_end->dst.reg + step && + inst->src[0].reg == block_end->src[0].reg && + inst->src[0].reg_offset == block_end->src[0].reg_offset + 1) { + block_size++; + block_end = inst; + } else if (inst->src[0].reg_offset == 0) { + block_size = 1; + block_start = block_end = inst; + } /* Work out which hardware MRF registers are written by this * instruction. @@ -2097,14 +2117,8 @@ fs_visitor::compute_to_mrf() if (scan_inst->is_partial_write()) break; - /* Things returning more than one register would need us to - * understand coalescing out more than one MOV at a time. - */ - if (scan_inst->regs_written > 1) - break; - - /* SEND instructions can't have MRF as a destination. */ - if (scan_inst->mlen) + /* SEND instructions can't have MRF as a destination before Gen7. 
*/ + if (brw->gen < 7 && scan_inst->mlen) break; if (brw->gen == 6) { @@ -2116,6 +2130,35 @@ fs_visitor::compute_to_mrf() } } + /* We have a contiguous block of mov to MRF that aligns with the + * return registers of a send instruction. Modify the send + * instruction to just return in the MRFs. + */ + if (scan_inst->mlen > 0 && + scan_inst->regs_written == block_size && block_size > 1) { + int i = 0; + + scan_inst->dst.file = MRF; + scan_inst->dst.reg = block_start->dst.reg; + assert(!block_start->saturate); + + for (fs_inst *next, *mov = block_start; + i < block_size; + mov = next, i++) { + next = (fs_inst *) mov->next; + mov->remove(); + } + + progress = true; + break; + } + + /* If the block size we've tracked doesn't match the regs_written + * of the instruction, we can't do anything. + */ + if (scan_inst->regs_written > 1) + break; + if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { /* Found the creator of our MRF's source value. */ scan_inst->dst.file = MRF; |