diff options
Diffstat (limited to 'src/intel/compiler/brw_fs.cpp')
-rw-r--r-- | src/intel/compiler/brw_fs.cpp | 37 |
1 files changed, 35 insertions, 2 deletions
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index ee0d1967ecc..fcba0bb449f 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -5675,16 +5675,49 @@ fs_visitor::lower_simd_width() * after \p inst, inst->next is a moving target and we need to save * it off here so that we insert the zip instructions in the right * place. + * + * Since we're inserting split instructions after after_inst, the + * instructions will end up in the reverse order that we insert them. + * However, certain render target writes require that the low group + * instructions come before the high group. From the Ivy Bridge PRM + * Vol. 4, Pt. 1, Section 3.9.11: + * + * "If multiple SIMD8 Dual Source messages are delivered by the + * pixel shader thread, each SIMD8_DUALSRC_LO message must be + * issued before the SIMD8_DUALSRC_HI message with the same Slot + * Group Select setting." + * + * And, from Section 3.9.11.1 of the same PRM: + * + * "When SIMD32 or SIMD16 PS threads send render target writes + * with multiple SIMD8 and SIMD16 messages, the following must + * hold: + * + * All the slots (as described above) must have a corresponding + * render target write irrespective of the slot's validity. A slot + * is considered valid when at least one sample is enabled. For + * example, a SIMD16 PS thread must send two SIMD8 render target + * writes to cover all the slots. + * + * PS thread must send SIMD render target write messages with + * increasing slot numbers. For example, SIMD16 thread has + * Slot[15:0] and if two SIMD8 render target writes are used, the + * first SIMD8 render target write must send Slot[7:0] and the + * next one must send Slot[15:8]." + * + * In order to make low group instructions come before high group + * instructions (this is required for some render target writes), we + * split from the highest group to lowest. */ exec_node *const after_inst = inst->next; - for (unsigned i = 0; i < n; i++) { + for (int i = n - 1; i >= 0; i--) { /* Emit a copy of the original instruction with the lowered width. * If the EOT flag was set throw it away except for the last * instruction to avoid killing the thread prematurely. */ fs_inst split_inst = *inst; split_inst.exec_size = lower_width; - split_inst.eot = inst->eot && i == 0; + split_inst.eot = inst->eot && i == n - 1; /* Select the correct channel enables for the i-th group, then * transform the sources and destination and emit the lowered |