i915: Optimize SEQ and SNE when two operands are uniforms

SEQ and SNE are not native i915 instructions, so they each generate at
least 3 instructions. If both operands are uniforms or constants, we get 5
instructions like:

    U[1] = MOV CONST[1]
    U[0].xyz = SGE CONST[0].xxxx, U[1]
    U[1] = MOV CONST[1].-x-y-z-w
    R[0].xyz = SGE CONST[0].-x-x-x-x, U[1]
    R[0].xyz = MUL R[0], U[0]

This code is stupid. Instead of having the individual calls to
i915_emit_arith generate the moves to utemps, do it in the caller. This
results in code like:

    U[1] = MOV CONST[1]
    U[0].xyz = SGE CONST[0].xxxx, U[1]
    R[0].xyz = SGE CONST[0].-x-x-x-x, U[1].-x-y-z-w
    R[0].xyz = MUL R[0], U[0]

This allows fs-temp-array-mat2-index-col-wr and
fs-temp-array-mat2-index-row-wr to fit in hardware limits (instead of
falling back to software rasterization).

NOTE: Without pending patches to the piglit tests, these tests will now
fail. This is an unrelated, pre-existing issue.

v2: Copy most of the body of the commit message into comments in the
code. Suggested by Eric.

Signed-off-by: Ian Romanick <[email protected]>
Reviewed-by: Eric Anholt <[email protected]>
author: Ian Romanick <[email protected]> 2013-08-21 19:37:30 -0700
committer: Ian Romanick <[email protected]> 2013-08-26 22:11:26 -0700
commit: d127a0343d7a1ab44bbdf6b4f664daebe6b297d8 (patch)
tree: bb81b8f2e0c79d6a805eef1916baa986a7df7b8c
parent: f3e86d4a68c27f0e86d64a98469a48756c445498 (diff)
1 files changed, 70 insertions, 12 deletions
diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c
index 930c2b876bc..67eff7665d1 100644
--- a/src/mesa/drivers/dri/i915/i915_fragprog.c
+++ b/src/mesa/drivers/dri/i915/i915_fragprog.c
@@ -817,23 +817,52 @@ upload_program(struct i915_fragment_program *p)
 	 flags = get_result_flags(inst);
 	 dst = get_result_vector(p, inst);
 
+         /* If both operands are uniforms or constants, we get 5 instructions
+          * like:
+          *
+          *     U[1] = MOV CONST[1]
+          *     U[0].xyz = SGE CONST[0].xxxx, U[1]
+          *     U[1] = MOV CONST[1].-x-y-z-w
+          *     R[0].xyz = SGE CONST[0].-x-x-x-x, U[1]
+          *     R[0].xyz = MUL R[0], U[0]
+          *
+          * This code is stupid.  Instead of having the individual calls to
+          * i915_emit_arith generate the moves to utemps, do it in the caller.
+          * This results in code like:
+          *
+          *     U[1] = MOV CONST[1]
+          *     U[0].xyz = SGE CONST[0].xxxx, U[1]
+          *     R[0].xyz = SGE CONST[0].-x-x-x-x, U[1].-x-y-z-w
+          *     R[0].xyz = MUL R[0], U[0]
+          */
+         src0 = src_vector(p, &inst->SrcReg[0], program);
+         src1 = src_vector(p, &inst->SrcReg[1], program);
+
+         if (GET_UREG_TYPE(src0) == REG_TYPE_CONST
+             && GET_UREG_TYPE(src1) == REG_TYPE_CONST) {
+            unsigned tmp = i915_get_utemp(p);
+
+            i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_ALL, 0,
+                            src1, 0, 0);
+
+            src1 = tmp;
+         }
+
 	 /* tmp = src1 >= src2 */
 	 i915_emit_arith(p,
 			 A0_SGE,
 			 tmp,
 			 flags, 0,
-			 src_vector(p, &inst->SrcReg[0], program),
-			 src_vector(p, &inst->SrcReg[1], program),
+			 src0,
+			 src1,
 			 0);
 	 /* dst = src1 <= src2 */
 	 i915_emit_arith(p,
 			 A0_SGE,
 			 dst,
 			 flags, 0,
-			 negate(src_vector(p, &inst->SrcReg[0], program),
-				1, 1, 1, 1),
-			 negate(src_vector(p, &inst->SrcReg[1], program),
-				1, 1, 1, 1),
+			 negate(src0, 1, 1, 1, 1),
+			 negate(src1, 1, 1, 1, 1),
 			 0);
 	 /* dst = tmp && dst */
 	 i915_emit_arith(p,
@@ -966,23 +995,52 @@ upload_program(struct i915_fragment_program *p)
 	 flags = get_result_flags(inst);
 	 dst = get_result_vector(p, inst);
 
+         /* If both operands are uniforms or constants, we get 5 instructions
+          * like:
+          *
+          *     U[1] = MOV CONST[1]
+          *     U[0].xyz = SLT CONST[0].xxxx, U[1]
+          *     U[1] = MOV CONST[1].-x-y-z-w
+          *     R[0].xyz = SLT CONST[0].-x-x-x-x, U[1]
+          *     R[0].xyz = MUL R[0], U[0]
+          *
+          * This code is stupid.  Instead of having the individual calls to
+          * i915_emit_arith generate the moves to utemps, do it in the caller.
+          * This results in code like:
+          *
+          *     U[1] = MOV CONST[1]
+          *     U[0].xyz = SLT CONST[0].xxxx, U[1]
+          *     R[0].xyz = SLT CONST[0].-x-x-x-x, U[1].-x-y-z-w
+          *     R[0].xyz = MUL R[0], U[0]
+          */
+         src0 = src_vector(p, &inst->SrcReg[0], program);
+         src1 = src_vector(p, &inst->SrcReg[1], program);
+
+         if (GET_UREG_TYPE(src0) == REG_TYPE_CONST
+             && GET_UREG_TYPE(src1) == REG_TYPE_CONST) {
+            unsigned tmp = i915_get_utemp(p);
+
+            i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_ALL, 0,
+                            src1, 0, 0);
+
+            src1 = tmp;
+         }
+
 	 /* tmp = src1 < src2 */
 	 i915_emit_arith(p,
 			 A0_SLT,
 			 tmp,
 			 flags, 0,
-			 src_vector(p, &inst->SrcReg[0], program),
-			 src_vector(p, &inst->SrcReg[1], program),
+			 src0,
+			 src1,
 			 0);
 	 /* dst = src1 > src2 */
 	 i915_emit_arith(p,
 			 A0_SLT,
 			 dst,
 			 flags, 0,
-			 negate(src_vector(p, &inst->SrcReg[0], program),
-				1, 1, 1, 1),
-			 negate(src_vector(p, &inst->SrcReg[1], program),
-				1, 1, 1, 1),
+			 negate(src0, 1, 1, 1, 1),
+			 negate(src1, 1, 1, 1, 1),
 			 0);
 	 /* dst = tmp || dst */
 	 i915_emit_arith(p,
author	Ian Romanick <[email protected]>	2013-08-21 19:37:30 -0700
committer	Ian Romanick <[email protected]>	2013-08-26 22:11:26 -0700
commit	d127a0343d7a1ab44bbdf6b4f664daebe6b297d8 (patch)
tree	bb81b8f2e0c79d6a805eef1916baa986a7df7b8c
parent	f3e86d4a68c27f0e86d64a98469a48756c445498 (diff)