summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4.h5
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4_emit.cpp8
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp133
3 files changed, 146 insertions, 0 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index e65b92caa7b..86921a033d1 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -358,6 +358,8 @@ public:
vec4_instruction *RNDE(dst_reg dst, src_reg src0);
vec4_instruction *RNDZ(dst_reg dst, src_reg src0);
vec4_instruction *FRC(dst_reg dst, src_reg src0);
+ vec4_instruction *F32TO16(dst_reg dst, src_reg src0);
+ vec4_instruction *F16TO32(dst_reg dst, src_reg src0);
vec4_instruction *ADD(dst_reg dst, src_reg src0, src_reg src1);
vec4_instruction *MUL(dst_reg dst, src_reg src0, src_reg src1);
vec4_instruction *MACH(dst_reg dst, src_reg src0, src_reg src1);
@@ -431,6 +433,9 @@ public:
void emit_math(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1);
src_reg fix_math_operand(src_reg src);
+ void emit_pack_half_2x16(dst_reg dst, src_reg src0);
+ void emit_unpack_half_2x16(dst_reg dst, src_reg src0);
+
void swizzle_result(ir_texture *ir, src_reg orig_val, int sampler);
void emit_ndc_computation();
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
index 747edc29a13..e395ada5d8f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
@@ -808,6 +808,14 @@ vec4_generator::generate_code(exec_list *instructions)
brw_DP2(p, dst, src[0], src[1]);
break;
+ case BRW_OPCODE_F32TO16:
+ brw_F32TO16(p, dst, src[0]);
+ break;
+
+ case BRW_OPCODE_F16TO32:
+ brw_F16TO32(p, dst, src[0]);
+ break;
+
case BRW_OPCODE_IF:
if (inst->src[0].file != BAD_FILE) {
/* The instruction has an embedded compare (only allowed on gen6) */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index cff04ba887c..4fb365d9acd 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -113,6 +113,8 @@ ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
+ALU1(F32TO16)
+ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
@@ -348,6 +350,119 @@ vec4_visitor::emit_math(enum opcode opcode,
}
void
+vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
+{
+ if (intel->gen < 7)
+ assert(!"ir_unop_pack_half_2x16 should be lowered");
+
+ assert(dst.type == BRW_REGISTER_TYPE_UD);
+ assert(src0.type == BRW_REGISTER_TYPE_F);
+
+ /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
+ *
+ * Because this instruction does not have a 16-bit floating-point type,
+ * the destination data type must be Word (W).
+ *
+ * The destination must be DWord-aligned and specify a horizontal stride
+ * (HorzStride) of 2. The 16-bit result is stored in the lower word of
+ * each destination channel and the upper word is not modified.
+ *
+ * The above restriction implies that the f32to16 instruction must use
+ * align1 mode, because only in align1 mode is it possible to specify
+ * horizontal stride. We choose here to defy the hardware docs and emit
+ * align16 instructions.
+ *
+ * (I [chadv] did attempt to emit align1 instructions for VS f32to16
+ * instructions. I was partially successful in that the code passed all
+ * tests. However, the code was dubiously correct and fragile, and the
+ * tests were not harsh enough to probe that frailty. Not trusting the
+ * code, I chose instead to remain in align16 mode in defiance of the hw
+ * docs).
+ *
+ * I've [chadv] experimentally confirmed that, on gen7 hardware and the
+ * simulator, emitting a f32to16 in align16 mode with UD as destination
+ * data type is safe. The behavior differs from that specified in the PRM
+ * in that the upper word of each destination channel is cleared to 0.
+ */
+
+ dst_reg tmp_dst(this, glsl_type::uvec2_type);
+ src_reg tmp_src(tmp_dst);
+
+#if 0
+ /* Verify the undocumented behavior on which the following instructions
+ * rely. If f32to16 fails to clear the upper word of the X and Y channels,
+ * then the result of the bit-or instruction below will be incorrect.
+ *
+ * You should inspect the disasm output in order to verify that the MOV is
+ * not optimized away.
+ */
+ emit(MOV(tmp_dst, src_reg(0x12345678u)));
+#endif
+
+ /* Give tmp the form below, where "." means untouched.
+ *
+ * w z y x w z y x
+ * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
+ *
+ * That the upper word of each write-channel be 0 is required for the
+ * following bit-shift and bit-or instructions to work. Note that this
+ * relies on the undocumented hardware behavior mentioned above.
+ */
+ tmp_dst.writemask = WRITEMASK_XY;
+ emit(F32TO16(tmp_dst, src0));
+
+ /* Give the write-channels of dst the form:
+ * 0xhhhh0000
+ */
+ tmp_src.swizzle = SWIZZLE_Y;
+ emit(SHL(dst, tmp_src, src_reg(16u)));
+
+ /* Finally, give the write-channels of dst the form of packHalf2x16's
+ * output:
+ * 0xhhhhllll
+ */
+ tmp_src.swizzle = SWIZZLE_X;
+ emit(OR(dst, src_reg(dst), tmp_src));
+}
+
+void
+vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
+{
+ if (intel->gen < 7)
+ assert(!"ir_unop_unpack_half_2x16 should be lowered");
+
+ assert(dst.type == BRW_REGISTER_TYPE_F);
+ assert(src0.type == BRW_REGISTER_TYPE_UD);
+
+ /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
+ *
+ * Because this instruction does not have a 16-bit floating-point type,
+ * the source data type must be Word (W). The destination type must be
+ * F (Float).
+ *
+ * To use W as the source data type, we must adjust horizontal strides,
+ * which is only possible in align1 mode. All my [chadv] attempts at
+ * emitting align1 instructions for unpackHalf2x16 failed to pass the
+ * Piglit tests, so I gave up.
+ *
+ * I've verified that, on gen7 hardware and the simulator, it is safe to
+ * emit f16to32 in align16 mode with UD as source data type.
+ */
+
+ dst_reg tmp_dst(this, glsl_type::uvec2_type);
+ src_reg tmp_src(tmp_dst);
+
+ tmp_dst.writemask = WRITEMASK_X;
+ emit(AND(tmp_dst, src0, src_reg(0xffffu)));
+
+ tmp_dst.writemask = WRITEMASK_Y;
+ emit(SHR(tmp_dst, src0, src_reg(16u)));
+
+ dst.writemask = WRITEMASK_XY;
+ emit(F16TO32(dst, tmp_src));
+}
+
+void
vec4_visitor::visit_instructions(const exec_list *list)
{
foreach_list(node, list) {
@@ -1469,6 +1584,24 @@ vec4_visitor::visit(ir_expression *ir)
case ir_quadop_vector:
assert(!"not reached: should be handled by lower_quadop_vector");
break;
+
+ case ir_unop_pack_half_2x16:
+ emit_pack_half_2x16(result_dst, op[0]);
+ break;
+ case ir_unop_unpack_half_2x16:
+ emit_unpack_half_2x16(result_dst, op[0]);
+ break;
+ case ir_unop_pack_snorm_2x16:
+ case ir_unop_pack_unorm_2x16:
+ case ir_unop_unpack_snorm_2x16:
+ case ir_unop_unpack_unorm_2x16:
+ assert(!"not reached: should be handled by lower_packing_builtins");
+ break;
+ case ir_unop_unpack_half_2x16_split_x:
+ case ir_unop_unpack_half_2x16_split_y:
+ case ir_binop_pack_half_2x16_split:
+ assert(!"not reached: should not occur in vertex shader");
+ break;
}
}