diff options
Diffstat (limited to 'src/intel')
-rw-r--r--   src/intel/compiler/brw_fs.cpp           |  6 +++---
-rw-r--r--   src/intel/compiler/brw_fs_generator.cpp | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
2 files changed, 65 insertions(+), 24 deletions(-)
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 89a6a7f6974..6d7435c5f3e 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -6091,9 +6091,6 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, case FS_OPCODE_LINTERP: case SHADER_OPCODE_GET_BUFFER_SIZE: - case FS_OPCODE_DDX_COARSE: - case FS_OPCODE_DDX_FINE: - case FS_OPCODE_DDY_COARSE: case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: case FS_OPCODE_PACK_HALF_2x16_SPLIT: case FS_OPCODE_INTERPOLATE_AT_SAMPLE: @@ -6110,6 +6107,9 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, */ return (devinfo->gen == 4 ? 16 : MIN2(16, inst->exec_size)); + case FS_OPCODE_DDX_COARSE: + case FS_OPCODE_DDX_FINE: + case FS_OPCODE_DDY_COARSE: case FS_OPCODE_DDY_FINE: /* The implementation of this virtual opcode may require emitting * compressed Align16 instructions, which are severely limited on some diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 14cfdd77641..62d19719c39 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -1207,27 +1207,50 @@ fs_generator::generate_ddx(const fs_inst *inst, { unsigned vstride, width; - if (inst->opcode == FS_OPCODE_DDX_FINE) { - /* produce accurate derivatives */ - vstride = BRW_VERTICAL_STRIDE_2; - width = BRW_WIDTH_2; - } else { - /* replicate the derivative at the top-left pixel to other pixels */ - vstride = BRW_VERTICAL_STRIDE_4; - width = BRW_WIDTH_4; - } + if (devinfo->gen >= 8) { + if (inst->opcode == FS_OPCODE_DDX_FINE) { + /* produce accurate derivatives */ + vstride = BRW_VERTICAL_STRIDE_2; + width = BRW_WIDTH_2; + } else { + /* replicate the derivative at the top-left pixel to other pixels */ + vstride = BRW_VERTICAL_STRIDE_4; + width = BRW_WIDTH_4; + } + + struct brw_reg src0 = byte_offset(src, type_sz(src.type));; + struct brw_reg src1 = src; - struct brw_reg src0 = byte_offset(src, type_sz(src.type));; - 
struct brw_reg src1 = src; + src0.vstride = vstride; + src0.width = width; + src0.hstride = BRW_HORIZONTAL_STRIDE_0; + src1.vstride = vstride; + src1.width = width; + src1.hstride = BRW_HORIZONTAL_STRIDE_0; - src0.vstride = vstride; - src0.width = width; - src0.hstride = BRW_HORIZONTAL_STRIDE_0; - src1.vstride = vstride; - src1.width = width; - src1.hstride = BRW_HORIZONTAL_STRIDE_0; + brw_ADD(p, dst, src0, negate(src1)); + } else { + /* On Haswell and earlier, the region used above appears to not work + * correctly for compressed instructions. At least on Haswell and + * Iron Lake, compressed ALIGN16 instructions do work. Since we + * would have to split to SIMD8 no matter which method we choose, we + * may as well use ALIGN16 on all platforms gen7 and earlier. + */ + struct brw_reg src0 = stride(src, 4, 4, 1); + struct brw_reg src1 = stride(src, 4, 4, 1); + if (inst->opcode == FS_OPCODE_DDX_FINE) { + src0.swizzle = BRW_SWIZZLE_XXZZ; + src1.swizzle = BRW_SWIZZLE_YYWW; + } else { + src0.swizzle = BRW_SWIZZLE_XXXX; + src1.swizzle = BRW_SWIZZLE_YYYY; + } - brw_ADD(p, dst, src0, negate(src1)); + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_ADD(p, dst, negate(src0), src1); + brw_pop_insn_state(p); + } } /* The negate_value boolean is used to negate the derivative computation for @@ -1280,10 +1303,28 @@ fs_generator::generate_ddy(const fs_inst *inst, } } else { /* replicate the derivative at the top-left pixel to other pixels */ - struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size); - struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size); + if (devinfo->gen >= 8) { + struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size); + struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size); - brw_ADD(p, dst, negate(src0), src1); + brw_ADD(p, dst, negate(src0), src1); + } else { + /* On Haswell and earlier, the region used above appears to not work + * correctly for compressed 
instructions. At least on Haswell and + * Iron Lake, compressed ALIGN16 instructions do work. Since we + * would have to split to SIMD8 no matter which method we choose, we + * may as well use ALIGN16 on all platforms gen7 and earlier. + */ + struct brw_reg src0 = stride(src, 4, 4, 1); + struct brw_reg src1 = stride(src, 4, 4, 1); + src0.swizzle = BRW_SWIZZLE_XXXX; + src1.swizzle = BRW_SWIZZLE_ZZZZ; + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_ADD(p, dst, negate(src0), src1); + brw_pop_insn_state(p); + } } } |