i965/fs: Rewrite fsign64 to skip the float -> double conversion

... without the float -> double conversion. Low power parts have additional restrictions when it comes to operating on 64-bit types, and the instruction used to do the conversion violates one of them: specifically, the restriction that "Source and Destination horizontal stride must be aligned to the same qword". Previously we generated a float and then converted, but we can avoid the conversion by using the same extract-the-sign-bit + or-in-1.0 algorithm by directly operating on the high four bytes of each double-precision component in the result. In SIMD8 and SIMD16 this cuts one instruction from the implementation, and more importantly that instruction is the one which violated the regioning restriction. Along the way I removed some comments that I did not think helped, and some code about double comparisons which does not seem to be necessary today. This prevents validation failures caught by the new EU validation code added in later patches. Reviewed-by: Iago Toral Quiroga <[email protected]>
author: Matt Turner <[email protected]> 2017-09-28 20:59:49 -0700
committer: Matt Turner <[email protected]> 2017-10-04 14:08:54 -0700
commit: 7e88f934697fd4ce05093c077b5f11454e377ed0 (patch)
tree: 42ebb0420d905cf3df1501645708e62da44eaa10 /src/intel/compiler
parent: b541945c2027990ac571184bbf8e01285be0e33a (diff)
1 files changed, 9 insertions, 41 deletions
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 7eae0520277..5b8ccd50bff 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -693,53 +693,21 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
           *
           * - 2-src instructions can't operate with 64-bit immediates
           * - The sign is encoded in the high 32-bit of each DF
-          * - CMP with DF requires special handling in SIMD16
           * - We need to produce a DF result.
           */
 
-         /* 2-src instructions can't have 64-bit immediates, so put 0.0 in
-          * a register and compare with that.
-          */
-         fs_reg tmp = vgrf(glsl_type::double_type);
-         bld.MOV(tmp, setup_imm_df(bld, 0.0));
-
-         /* A direct DF CMP using the flag register (null dst) won't work in
-          * SIMD16 because the CMP will be split in two by lower_simd_width,
-          * resulting in two CMP instructions with the same dst (NULL),
-          * leading to dead code elimination of the first one. In SIMD8,
-          * however, there is no need to split the CMP and we can save some
-          * work.
-          */
-         fs_reg dst_tmp = vgrf(glsl_type::double_type);
-         bld.CMP(dst_tmp, op[0], tmp, BRW_CONDITIONAL_NZ);
-
-         /* In SIMD16 we want to avoid using a NULL dst register with DF CMP,
-          * so we store the result of the comparison in a vgrf instead and
-          * then we generate a UD comparison from that that won't have to
-          * be split by lower_simd_width. This is what NIR does to handle
-          * double comparisons in the general case.
-          */
-         if (bld.dispatch_width() == 16 ) {
-            fs_reg dst_tmp_ud = retype(dst_tmp, BRW_REGISTER_TYPE_UD);
-            bld.MOV(dst_tmp_ud, subscript(dst_tmp, BRW_REGISTER_TYPE_UD, 0));
-            bld.CMP(bld.null_reg_ud(),
-                    dst_tmp_ud, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
-         }
-
-         /* Get the high 32-bit of each double component where the sign is */
-         fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
-         bld.MOV(result_int, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
+         fs_reg zero = vgrf(glsl_type::double_type);
+         bld.MOV(zero, setup_imm_df(bld, 0.0));
+         bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
 
-         /* Get the sign bit */
-         bld.AND(result_int, result_int, brw_imm_ud(0x80000000u));
+         bld.MOV(result, zero);
 
-         /* Add 1.0 to the sign, predicated to skip the case of op[0] == 0.0 */
-         inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
-         inst->predicate = BRW_PREDICATE_NORMAL;
+         fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
+         bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
+                 brw_imm_ud(0x80000000u));
 
-         /* Convert from 32-bit float to 64-bit double */
-         result.type = BRW_REGISTER_TYPE_DF;
-         inst = bld.MOV(result, retype(result_int, BRW_REGISTER_TYPE_F));
+         set_predicate(BRW_PREDICATE_NORMAL,
+                       bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
 
          if (instr->dest.saturate) {
             inst = bld.MOV(result, result);
author	Matt Turner <[email protected]>	2017-09-28 20:59:49 -0700
committer	Matt Turner <[email protected]>	2017-10-04 14:08:54 -0700
commit	7e88f934697fd4ce05093c077b5f11454e377ed0 (patch)
tree	42ebb0420d905cf3df1501645708e62da44eaa10 /src/intel/compiler
parent	b541945c2027990ac571184bbf8e01285be0e33a (diff)