gallivm: generalize 4x4f->1x16ub special case conversion

This special packing path can be easily extended to handle not just float->unorm8 but also float->snorm8 and uint32->uint8 and int32->int8 (i.e. all interesting cases for llvmpipe fs backend code). The packing parts all stay the same (only the last step packing will be signed->signed instead of signed->unsigned but luckily even sse2 can do both). While here also note some bugs with that (we keep the bugs identical to what we did before on x86, albeit other archs may differ). In particular float->unorm8 too large values will still get clamped to 0, not 255, and for float->snorm8 NaNs will end up as -1, not 0 (but we do the clamp against 1.0 there to prevent too large values ending up as -1.0 - this is inconsistent to unorm8 handling but is what we ended up before, I'm not sure we can get away without it). This is quite fishy in any case as we depend on arch-dependent behavior of the iround (my understanding is in fact with altivec the conversion would actually saturate although I've no idea about NaNs, so probably wouldn't need to do anything for snorm). (There are only minimal piglit tests for unorm clamping behavior AFAICT, in particular nothing seems to test values which are too large to be handled by the float->int conversion.) For uint32->uint8 we also do a min against MAX_INT, since the source for the packs is always signed (again, on x86 - should probably be able to express these arch-dependent bits better some day). Reviewed-by: Jose Fonseca <[email protected]>
author: Roland Scheidegger <[email protected]> 2016-12-22 03:51:40 +0100
committer: Roland Scheidegger <[email protected]> 2017-01-06 23:13:34 +0100
commit: 6e7ce1ef55b138ed2cdedb40cbc010b523de8743 (patch)
tree: 75376ed731562e711fc19e98c0336e9739d638db
parent: 04480a04b10524e156cb01d5f7bd8ec02fcc4a67 (diff)
1 files changed, 118 insertions, 56 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index c8f9c28ddc6..c688965a73e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -456,21 +456,21 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
        src_type.sign == dst_type->sign)
       return num_dsts;
 
-   /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub
+   /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8
     */
-   if (src_type.floating == 1 &&
-       src_type.fixed    == 0 &&
-       src_type.sign     == 1 &&
-       src_type.norm     == 0 &&
+   if (src_type.norm     == 0 &&
        src_type.width    == 32 &&
+       src_type.fixed    == 0 &&
 
        dst_type->floating == 0 &&
        dst_type->fixed    == 0 &&
-       dst_type->sign     == 0 &&
-       dst_type->norm     == 1 &&
-       dst_type->width    == 8)
-   {
-      /* Special case 4x4f --> 1x16ub */
+       dst_type->width    == 8 &&
+
+       ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm == 1) ||
+        (src_type.floating == 0 && dst_type->floating == 0 &&
+         src_type.sign == dst_type->sign && dst_type->norm == 0))) {
+
+      /* Special case 4x4x32 --> 1x16x8 */
       if (src_type.length == 4 &&
             (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
       {
@@ -481,7 +481,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
          return num_dsts;
       }
 
-      /* Special case 2x8f --> 1x16ub */
+      /* Special case 2x8x32 --> 1x16x8 */
       if (src_type.length == 8 &&
           util_cpu_caps.has_avx)
       {
@@ -558,21 +558,25 @@ lp_build_conv(struct gallivm_state *gallivm,
    num_tmps = num_srcs;
 
 
-   /* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub
+   /*
+    * Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8
+    * Only float -> s/unorm8 and (u)int32->(u)int8.
+    * XXX: This should cover all interesting backend cases for 8 bit,
+    * but should use same strategy if dst is 16 bit.
     */
-   if (src_type.floating == 1 &&
-       src_type.fixed    == 0 &&
-       src_type.sign     == 1 &&
-       src_type.norm     == 0 &&
+   if (src_type.norm     == 0 &&
        src_type.width    == 32 &&
        src_type.length   == 4 &&
+       src_type.fixed    == 0 &&
 
        dst_type.floating == 0 &&
        dst_type.fixed    == 0 &&
-       dst_type.sign     == 0 &&
-       dst_type.norm     == 1 &&
        dst_type.width    == 8 &&
 
+       ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
+        (src_type.floating == 0 && dst_type.floating == 0 &&
+         src_type.sign == dst_type.sign && dst_type.norm == 0)) &&
+
        ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
         (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
 
@@ -581,7 +585,7 @@ lp_build_conv(struct gallivm_state *gallivm,
       struct lp_build_context bld;
       struct lp_type int16_type, int32_type;
       struct lp_type dst_type_ext = dst_type;
-      LLVMValueRef const_255f;
+      LLVMValueRef const_scale;
       unsigned i, j;
 
       lp_build_context_init(&bld, gallivm, src_type);
@@ -597,14 +601,54 @@ lp_build_conv(struct gallivm_state *gallivm,
       int32_type.length /= 4;
       int32_type.sign = 1;
 
-      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
+      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));
 
       for (i = 0; i < num_dsts; ++i, src += 4) {
          LLVMValueRef lo, hi;
 
-         for (j = 0; j < dst_type.length / 4; ++j) {
-            tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
-            tmp[j] = lp_build_iround(&bld, tmp[j]);
+         if (src_type.floating) {
+            for (j = 0; j < dst_type.length / 4; ++j) {
+               /*
+                * XXX This is not actually fully correct. The float to int
+                * conversion will produce 0x80000000 value for everything
+                * out of range and NaNs (on x86, llvm.x86.sse2.cvtps2dq).
+                * Hence, NaNs and negatives will get clamped just fine to zero
+                * (relying on clamping pack behavior) when converting to unorm,
+                * however too large values (both finite and infinite) will also
+                * end up as zero, not 255.
+                * For snorm, for now we'll keep bug compatibility with generic
+                * conversion path (meaning too large values are fine, but
+                * NaNs get converted to -128 (purely by luck, as we don't
+                * specify nan behavior for the max there) instead of 0).
+                */
+               if (dst_type.sign) {
+                  tmp[j] = lp_build_min(&bld, bld.one, src[j]);
+
+               }
+               else {
+                  if (0) {
+                     tmp[j] = lp_build_min_ext(&bld, bld.one, src[j],
+                                               GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
+                  }
+                  tmp[j] = src[j];
+               }
+               tmp[j] = LLVMBuildFMul(builder, tmp[j], const_scale, "");
+               tmp[j] = lp_build_iround(&bld, tmp[j]);
+            }
+         } else {
+            for (j = 0; j < dst_type.length / 4; ++j) {
+               if (!dst_type.sign) {
+                  /*
+                   * Pack clamp is always signed->unsigned (or signed->signed).
+                   * Hence need min.
+                   */
+                  LLVMValueRef const_max;
+                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
+                  tmp[j] = lp_build_min(&bld, src[j], const_max);
+               } else {
+                  tmp[j] = src[j];
+               }
+            }
          }
 
          if (num_srcs == 1) {
@@ -629,20 +673,20 @@ lp_build_conv(struct gallivm_state *gallivm,
       return; 
    }
 
-   /* Special case 2x8f --> 1x16ub, 1x8f ->1x8ub
+   /* Special case 2x8x32 --> 1x16x8, 1x8x32 ->1x8x8
     */
-   else if (src_type.floating == 1 &&
-      src_type.fixed    == 0 &&
-      src_type.sign     == 1 &&
-      src_type.norm     == 0 &&
-      src_type.width    == 32 &&
-      src_type.length   == 8 &&
-
-      dst_type.floating == 0 &&
-      dst_type.fixed    == 0 &&
-      dst_type.sign     == 0 &&
-      dst_type.norm     == 1 &&
-      dst_type.width    == 8 &&
+   else if (src_type.norm     == 0 &&
+       src_type.width    == 32 &&
+       src_type.length   == 8 &&
+       src_type.fixed    == 0 &&
+
+       dst_type.floating == 0 &&
+       dst_type.fixed    == 0 &&
+       dst_type.width    == 8 &&
+
+       ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
+        (src_type.floating == 0 && dst_type.floating == 0 &&
+         src_type.sign == dst_type.sign && dst_type.norm == 0)) &&
 
       ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
        (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&
@@ -652,7 +696,7 @@ lp_build_conv(struct gallivm_state *gallivm,
       struct lp_build_context bld;
       struct lp_type int16_type, int32_type;
       struct lp_type dst_type_ext = dst_type;
-      LLVMValueRef const_255f;
+      LLVMValueRef const_scale;
       unsigned i;
 
       lp_build_context_init(&bld, gallivm, src_type);
@@ -668,30 +712,44 @@ lp_build_conv(struct gallivm_state *gallivm,
       int32_type.length /= 4;
       int32_type.sign = 1;
 
-      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
+      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));
 
       for (i = 0; i < num_dsts; ++i, src += 2) {
-         LLVMValueRef lo, hi, a, b;
-
-         a = LLVMBuildFMul(builder, src[0], const_255f, "");
-         a = lp_build_iround(&bld, a);
-         tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
-         tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
-         /* relying on clamping behavior of sse2 intrinsics here */
-         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
-
-         if (num_srcs == 1) {
-            hi = lo;
+         unsigned j;
+         for (j = 0; j < (num_srcs == 1 ? 1 : 2); j++) {
+            LLVMValueRef lo, hi, a;
+
+            a = src[j];
+            if (src_type.floating) {
+               if (dst_type.sign) {
+                  a = lp_build_min(&bld, bld.one, a);
+
+               }
+               else {
+                  if (0) {
+                     a = lp_build_min_ext(&bld, bld.one, a,
+                                          GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
+                  }
+               }
+               a = LLVMBuildFMul(builder, a, const_scale, "");
+               a = lp_build_iround(&bld, a);
+            } else {
+               if (!dst_type.sign) {
+                  LLVMValueRef const_max;
+                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
+                  a = lp_build_min(&bld, a, const_max);
+               }
+            }
+            lo = lp_build_extract_range(gallivm, a, 0, 4);
+            hi = lp_build_extract_range(gallivm, a, 4, 4);
+            /* relying on clamping behavior of sse2 intrinsics here */
+            tmp[j] = lp_build_pack2(gallivm, int32_type, int16_type, lo, hi);
          }
-         else {
-            b = LLVMBuildFMul(builder, src[1], const_255f, "");
-            b = lp_build_iround(&bld, b);
-            tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
-            tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
-            hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
 
+         if (num_srcs == 1) {
+            tmp[1] = tmp[0];
          }
-         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
+         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, tmp[0], tmp[1]);
       }
 
       if (num_srcs == 1) {
@@ -858,6 +916,10 @@ lp_build_conv(struct gallivm_state *gallivm,
       new_type.width  = dst_type.width;
       new_type.length = dst_type.length;
 
+      /*
+       * Note that resize when using packs can sometimes get min/max
+       * clamping for free. Should be able to exploit this...
+       */
       lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
 
       tmp_type = new_type;
author	Roland Scheidegger <[email protected]>	2016-12-22 03:51:40 +0100
committer	Roland Scheidegger <[email protected]>	2017-01-06 23:13:34 +0100
commit	6e7ce1ef55b138ed2cdedb40cbc010b523de8743 (patch)
tree	75376ed731562e711fc19e98c0336e9739d638db
parent	04480a04b10524e156cb01d5f7bd8ec02fcc4a67 (diff)