summary | refs | log | tree | commit | diff | stats
path: root/src/intel/compiler
diff options
context:
space:
mode:
Diffstat (limited to 'src/intel/compiler')
-rw-r--r-- src/intel/compiler/brw_fs.cpp | 37
1 files changed, 35 insertions, 2 deletions
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index ee0d1967ecc..fcba0bb449f 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5675,16 +5675,49 @@ fs_visitor::lower_simd_width()
* after \p inst, inst->next is a moving target and we need to save
* it off here so that we insert the zip instructions in the right
* place.
+ *
+ * Since we're inserting split instructions after after_inst, the
+ * instructions will end up in the reverse order that we insert them.
+ * However, certain render target writes require that the low group
+ * instructions come before the high group. From the Ivy Bridge PRM
+ * Vol. 4, Pt. 1, Section 3.9.11:
+ *
+ * "If multiple SIMD8 Dual Source messages are delivered by the
+ * pixel shader thread, each SIMD8_DUALSRC_LO message must be
+ * issued before the SIMD8_DUALSRC_HI message with the same Slot
+ * Group Select setting."
+ *
+ * And, from Section 3.9.11.1 of the same PRM:
+ *
+ * "When SIMD32 or SIMD16 PS threads send render target writes
+ * with multiple SIMD8 and SIMD16 messages, the following must
+ * hold:
+ *
+ * All the slots (as described above) must have a corresponding
+ * render target write irrespective of the slot's validity. A slot
+ * is considered valid when at least one sample is enabled. For
+ * example, a SIMD16 PS thread must send two SIMD8 render target
+ * writes to cover all the slots.
+ *
+ * PS thread must send SIMD render target write messages with
+ * increasing slot numbers. For example, SIMD16 thread has
+ * Slot[15:0] and if two SIMD8 render target writes are used, the
+ * first SIMD8 render target write must send Slot[7:0] and the
+ * next one must send Slot[15:8]."
+ *
+ * In order to make low group instructions come before high group
+ * instructions (this is required for some render target writes), we
+ * split from the highest group to lowest.
*/
exec_node *const after_inst = inst->next;
- for (unsigned i = 0; i < n; i++) {
+ for (int i = n - 1; i >= 0; i--) {
/* Emit a copy of the original instruction with the lowered width.
* If the EOT flag was set, throw it away except for the last
* instruction to avoid killing the thread prematurely.
*/
fs_inst split_inst = *inst;
split_inst.exec_size = lower_width;
- split_inst.eot = inst->eot && i == 0;
+ split_inst.eot = inst->eot && i == n - 1;
/* Select the correct channel enables for the i-th group, then
* transform the sources and destination and emit the lowered