summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFrancisco Jerez <[email protected]>2015-08-06 14:04:00 +0300
committerFrancisco Jerez <[email protected]>2015-08-06 14:12:12 +0300
commit3b48a0eeda20f5cf2dbc8de5e36f8fe3461f41bf (patch)
tree5b28c4047b1572e5782303664c28c9b88b7beaaf
parent2e731264382954beb1192cd7cc62e16e0b8e7978 (diff)
i965/fs: Lower the MULH virtual instruction.
Translate MULH into the MUL/MACH sequence. This does roughly the same thing that nir_emit_alu() used to do but we can now handle 16-wide by taking advantage of the SIMD lowering pass. The force_sechalf workaround near the bottom is required because the SIMD lowering pass will emit instructions with non-zero quarter control and we need to make sure we avoid that on integer arithmetic instructions with implicit accumulator access due to a known hardware bug on IVB. Reviewed-by: Matt Turner <[email protected]>
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.cpp55
1 files changed, 55 insertions, 0 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 17f0eaba311..284528d1f84 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3280,6 +3280,55 @@ fs_visitor::lower_integer_multiplication()
ibld.MOV(null, inst->dst));
}
}
+
+ } else if (inst->opcode == SHADER_OPCODE_MULH) {
+ /* Should have been lowered to 8-wide. */
+ assert(inst->exec_size <= 8);
+ const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
+ inst->dst.type);
+ fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
+ fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
+
+ if (devinfo->gen >= 8) {
+ /* Until Gen8, integer multiplies read 32-bits from one source,
+ * and 16-bits from the other, and relying on the MACH instruction
+ * to generate the high bits of the result.
+ *
+ * On Gen8, the multiply instruction does a full 32x32-bit
+ * multiply, but in order to do a 64-bit multiply we can simulate
+ * the previous behavior and then use a MACH instruction.
+ *
+ * FINISHME: Don't use source modifiers on src1.
+ */
+ assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
+ mul->src[1].type == BRW_REGISTER_TYPE_UD);
+ mul->src[1].type = (type_is_signed(mul->src[1].type) ?
+ BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
+ mul->src[1].stride *= 2;
+
+ } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
+ inst->force_sechalf) {
+ /* Among other things the quarter control bits influence which
+ * accumulator register is used by the hardware for instructions
+ * that access the accumulator implicitly (e.g. MACH). A
+ * second-half instruction would normally map to acc1, which
+ * doesn't exist on Gen7 and up (the hardware does emulate it for
+ * floating-point instructions *only* by taking advantage of the
+ * extra precision of acc0 not normally used for floating point
+ * arithmetic).
+ *
+ * HSW and up are careful enough not to try to access an
+ * accumulator register that doesn't exist, but on earlier Gen7
+ * hardware we need to make sure that the quarter control bits are
+ * zero to avoid non-deterministic behaviour and emit an extra MOV
+ * to get the result masked correctly according to the current
+ * channel enables.
+ */
+ mach->force_sechalf = false;
+ mach->force_writemask_all = true;
+ mach->dst = ibld.vgrf(inst->dst.type);
+ ibld.MOV(inst->dst, mach->dst);
+ }
} else {
continue;
}
@@ -4083,6 +4132,12 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
const fs_inst *inst)
{
switch (inst->opcode) {
+ case SHADER_OPCODE_MULH:
+ /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
+ * is 8-wide on Gen7+.
+ */
+ return (devinfo->gen >= 7 ? 8 : inst->exec_size);
+
case FS_OPCODE_FB_WRITE_LOGICAL:
/* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
* here.