Diffstat (limited to 'src/intel')
-rw-r--r--  src/intel/compiler/brw_fs.cpp            6
-rw-r--r--  src/intel/compiler/brw_fs_generator.cpp  83
2 files changed, 65 insertions, 24 deletions
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 89a6a7f6974..6d7435c5f3e 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6091,9 +6091,6 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
    case FS_OPCODE_LINTERP:
    case SHADER_OPCODE_GET_BUFFER_SIZE:
-   case FS_OPCODE_DDX_COARSE:
-   case FS_OPCODE_DDX_FINE:
-   case FS_OPCODE_DDY_COARSE:
    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
    case FS_OPCODE_PACK_HALF_2x16_SPLIT:
    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
@@ -6110,6 +6107,9 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
        */
       return (devinfo->gen == 4 ? 16 : MIN2(16, inst->exec_size));
 
+   case FS_OPCODE_DDX_COARSE:
+   case FS_OPCODE_DDX_FINE:
+   case FS_OPCODE_DDY_COARSE:
    case FS_OPCODE_DDY_FINE:
       /* The implementation of this virtual opcode may require emitting
        * compressed Align16 instructions, which are severely limited on some
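
Note on the brw_fs.cpp hunks above: they only move the three coarse/fine derivative cases so that they share the FS_OPCODE_DDY_FINE handling, whose comment notes that the opcode may be emitted as compressed Align16 instructions, which some generations restrict. The net effect is a generation-dependent cap on the SIMD width for all derivative opcodes. As a rough sketch of that idea only (the helper name and the flag below are invented for illustration; this is not the Mesa code):

/* Hypothetical sketch, not the Mesa implementation: a derivative that may
 * be emitted as an Align16 instruction can only run 16-wide if the hardware
 * allows compressed Align16 instructions; otherwise the width is capped at
 * 8 and the wider instruction is split by the regular lowering pass.
 */
static unsigned
illustrative_derivative_simd_width(int compressed_align16_ok,
                                   unsigned exec_size)
{
   unsigned max_width = compressed_align16_ok ? 16 : 8;
   return exec_size < max_width ? exec_size : max_width;
}

A SIMD16 derivative on a platform where compressed Align16 is disallowed would then be split into two SIMD8 instructions before code generation.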
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 14cfdd77641..62d19719c39 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1207,27 +1207,50 @@ fs_generator::generate_ddx(const fs_inst *inst,
                            struct brw_reg dst, struct brw_reg src)
 {
    unsigned vstride, width;
-   if (inst->opcode == FS_OPCODE_DDX_FINE) {
-      /* produce accurate derivatives */
-      vstride = BRW_VERTICAL_STRIDE_2;
-      width = BRW_WIDTH_2;
-   } else {
-      /* replicate the derivative at the top-left pixel to other pixels */
-      vstride = BRW_VERTICAL_STRIDE_4;
-      width = BRW_WIDTH_4;
-   }
+   if (devinfo->gen >= 8) {
+      if (inst->opcode == FS_OPCODE_DDX_FINE) {
+         /* produce accurate derivatives */
+         vstride = BRW_VERTICAL_STRIDE_2;
+         width = BRW_WIDTH_2;
+      } else {
+         /* replicate the derivative at the top-left pixel to other pixels */
+         vstride = BRW_VERTICAL_STRIDE_4;
+         width = BRW_WIDTH_4;
+      }
+
+      struct brw_reg src0 = byte_offset(src, type_sz(src.type));;
+      struct brw_reg src1 = src;
 
-   struct brw_reg src0 = byte_offset(src, type_sz(src.type));;
-   struct brw_reg src1 = src;
+      src0.vstride = vstride;
+      src0.width = width;
+      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
+      src1.vstride = vstride;
+      src1.width = width;
+      src1.hstride = BRW_HORIZONTAL_STRIDE_0;
 
-   src0.vstride = vstride;
-   src0.width = width;
-   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
-   src1.vstride = vstride;
-   src1.width = width;
-   src1.hstride = BRW_HORIZONTAL_STRIDE_0;
+      brw_ADD(p, dst, src0, negate(src1));
+   } else {
+      /* On Haswell and earlier, the region used above appears to not work
+       * correctly for compressed instructions. At least on Haswell and
+       * Iron Lake, compressed ALIGN16 instructions do work. Since we
+       * would have to split to SIMD8 no matter which method we choose, we
+       * may as well use ALIGN16 on all platforms gen7 and earlier.
+       */
+      struct brw_reg src0 = stride(src, 4, 4, 1);
+      struct brw_reg src1 = stride(src, 4, 4, 1);
+      if (inst->opcode == FS_OPCODE_DDX_FINE) {
+         src0.swizzle = BRW_SWIZZLE_XXZZ;
+         src1.swizzle = BRW_SWIZZLE_YYWW;
+      } else {
+         src0.swizzle = BRW_SWIZZLE_XXXX;
+         src1.swizzle = BRW_SWIZZLE_YYYY;
+      }
 
-   brw_ADD(p, dst, src0, negate(src1));
+      brw_push_insn_state(p);
+      brw_set_default_access_mode(p, BRW_ALIGN_16);
+      brw_ADD(p, dst, negate(src0), src1);
+      brw_pop_insn_state(p);
+   }
 }
 
 /* The negate_value boolean is used to negate the derivative computation for
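
As a side note on what the new gen7-and-earlier path in generate_ddx computes, here is a small scalar model (an illustration, not part of the commit) of one 2x2 subspan, assuming the usual channel order top-left, top-right, bottom-left, bottom-right; it mirrors the swizzle pairs used above together with the emitted dst = src1 - src0:

/* Scalar model (illustration only, not Mesa code) of the fine and coarse
 * ddx produced by the Align16 swizzles above for one 2x2 subspan.
 * Assumed channel order: q[0]=top-left, q[1]=top-right,
 * q[2]=bottom-left, q[3]=bottom-right.
 */
static void
model_ddx(const float q[4], float fine[4], float coarse[4])
{
   /* src0.swizzle = XXZZ, src1.swizzle = YYWW, dst = src1 - src0 */
   fine[0] = fine[1] = q[1] - q[0];   /* top row:    TR - TL */
   fine[2] = fine[3] = q[3] - q[2];   /* bottom row: BR - BL */

   /* src0.swizzle = XXXX, src1.swizzle = YYYY */
   coarse[0] = coarse[1] = coarse[2] = coarse[3] = q[1] - q[0];
}

The gen8+ Align1 regions in the same hunk should produce the same values per channel; only the instruction encoding differs.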
@@ -1280,10 +1303,28 @@ fs_generator::generate_ddy(const fs_inst *inst,
       }
    } else {
       /* replicate the derivative at the top-left pixel to other pixels */
-      struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
-      struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);
+      if (devinfo->gen >= 8) {
+         struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
+         struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);
 
-      brw_ADD(p, dst, negate(src0), src1);
+         brw_ADD(p, dst, negate(src0), src1);
+      } else {
+         /* On Haswell and earlier, the region used above appears to not work
+          * correctly for compressed instructions. At least on Haswell and
+          * Iron Lake, compressed ALIGN16 instructions do work. Since we
+          * would have to split to SIMD8 no matter which method we choose, we
+          * may as well use ALIGN16 on all platforms gen7 and earlier.
+          */
+         struct brw_reg src0 = stride(src, 4, 4, 1);
+         struct brw_reg src1 = stride(src, 4, 4, 1);
+         src0.swizzle = BRW_SWIZZLE_XXXX;
+         src1.swizzle = BRW_SWIZZLE_ZZZZ;
+
+         brw_push_insn_state(p);
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         brw_ADD(p, dst, negate(src0), src1);
+         brw_pop_insn_state(p);
+      }
    }
 }
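
The same kind of scalar model applies to the coarse-ddy path added above (again an illustration with an assumed channel order, not Mesa code): with src0.swizzle = XXXX and src1.swizzle = ZZZZ, the emitted ADD computes bottom-left minus top-left and broadcasts it across the subspan.

/* Scalar model (illustration only) of the coarse ddy computed by the new
 * Align16 path above. Assumed channel order: q[0]=top-left, q[1]=top-right,
 * q[2]=bottom-left, q[3]=bottom-right.
 */
static void
model_ddy_coarse(const float q[4], float coarse[4])
{
   /* src0.swizzle = XXXX, src1.swizzle = ZZZZ, dst = src1 - src0 */
   coarse[0] = coarse[1] = coarse[2] = coarse[3] = q[2] - q[0];
}

The fine-ddy (FS_OPCODE_DDY_FINE) branch of generate_ddy is not touched by this hunk, so it is not modeled here.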