4 files changed, 191 insertions, 53 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index a1edd349f1f..321c6e4edf0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -541,38 +541,38 @@ lp_build_add(struct lp_build_context *bld,
    assert(lp_check_value(type, a));
    assert(lp_check_value(type, b));
 
-   if(a == bld->zero)
+   if (a == bld->zero)
       return b;
-   if(b == bld->zero)
+   if (b == bld->zero)
       return a;
-   if(a == bld->undef || b == bld->undef)
+   if (a == bld->undef || b == bld->undef)
       return bld->undef;
 
-   if(bld->type.norm) {
+   if (type.norm) {
       const char *intrinsic = NULL;
 
-      if(a == bld->one || b == bld->one)
+      if (!type.sign && (a == bld->one || b == bld->one))
         return bld->one;
 
       if (!type.floating && !type.fixed) {
          if (type.width * type.length == 128) {
-            if(util_cpu_caps.has_sse2) {
-              if(type.width == 8)
+            if (util_cpu_caps.has_sse2) {
+              if (type.width == 8)
                 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
-              if(type.width == 16)
+              if (type.width == 16)
                 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
             } else if (util_cpu_caps.has_altivec) {
-              if(type.width == 8)
+              if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
-              if(type.width == 16)
+              if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
             }
          }
          if (type.width * type.length == 256) {
-            if(util_cpu_caps.has_avx2) {
-              if(type.width == 8)
+            if (util_cpu_caps.has_avx2) {
+              if (type.width == 8)
                 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
-              if(type.width == 16)
+              if (type.width == 16)
                 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
             }
          }
@@ -842,38 +842,38 @@ lp_build_sub(struct lp_build_context *bld,
    assert(lp_check_value(type, a));
    assert(lp_check_value(type, b));
 
-   if(b == bld->zero)
+   if (b == bld->zero)
       return a;
-   if(a == bld->undef || b == bld->undef)
+   if (a == bld->undef || b == bld->undef)
       return bld->undef;
-   if(a == b)
+   if (a == b)
       return bld->zero;
 
-   if(bld->type.norm) {
+   if (type.norm) {
       const char *intrinsic = NULL;
 
-      if(b == bld->one)
+      if (!type.sign && b == bld->one)
         return bld->zero;
 
       if (!type.floating && !type.fixed) {
          if (type.width * type.length == 128) {
             if (util_cpu_caps.has_sse2) {
-              if(type.width == 8)
+              if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
-              if(type.width == 16)
+              if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
             } else if (util_cpu_caps.has_altivec) {
-              if(type.width == 8)
+              if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
-              if(type.width == 16)
+              if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
             }
          }
          if (type.width * type.length == 256) {
             if (util_cpu_caps.has_avx2) {
-              if(type.width == 8)
+              if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
-              if(type.width == 16)
+              if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
             }
          }
@@ -963,7 +963,7 @@ lp_build_sub(struct lp_build_context *bld,
  * @sa Michael Herf, The "double blend trick", May 2000, 
  *     http://www.stereopsis.com/doubleblend.html
  */
-static LLVMValueRef
+LLVMValueRef
 lp_build_mul_norm(struct gallivm_state *gallivm,
                   struct lp_type wide_type,
                   LLVMValueRef a, LLVMValueRef b)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index 2a4137a6780..f5b2800a2cf 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -71,6 +71,13 @@ lp_build_sub(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b);
 
+
+LLVMValueRef
+lp_build_mul_norm(struct gallivm_state *gallivm,
+                  struct lp_type wide_type,
+                  LLVMValueRef a,
+                  LLVMValueRef b);
+
 LLVMValueRef
 lp_build_mul(struct lp_build_context *bld,
              LLVMValueRef a,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.c b/src/gallium/drivers/llvmpipe/lp_bld_blend.c
index 1feb415c9e5..02ec55eddbd 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.c
@@ -35,6 +35,7 @@
 #include "gallivm/lp_bld_swizzle.h"
 #include "gallivm/lp_bld_flow.h"
 #include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_pack.h"
 
 #include "lp_bld_blend.h"
 
@@ -65,11 +66,11 @@ lp_build_blend_func_commutative(unsigned func)
 boolean
 lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func)
 {
-   if(rgb_func == alpha_func)
+   if (rgb_func == alpha_func)
       return FALSE;
-   if(rgb_func == PIPE_BLEND_SUBTRACT && alpha_func == PIPE_BLEND_REVERSE_SUBTRACT)
+   if (rgb_func == PIPE_BLEND_SUBTRACT && alpha_func == PIPE_BLEND_REVERSE_SUBTRACT)
       return TRUE;
-   if(rgb_func == PIPE_BLEND_REVERSE_SUBTRACT && alpha_func == PIPE_BLEND_SUBTRACT)
+   if (rgb_func == PIPE_BLEND_REVERSE_SUBTRACT && alpha_func == PIPE_BLEND_SUBTRACT)
       return TRUE;
    return FALSE;
 }
@@ -81,11 +82,65 @@ lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func)
 static inline boolean
 lp_build_blend_factor_complementary(unsigned src_factor, unsigned dst_factor)
 {
+   STATIC_ASSERT((PIPE_BLENDFACTOR_ZERO ^ 0x10) == PIPE_BLENDFACTOR_ONE);
+   STATIC_ASSERT((PIPE_BLENDFACTOR_CONST_COLOR ^ 0x10) ==
+                 PIPE_BLENDFACTOR_INV_CONST_COLOR);
    return dst_factor == (src_factor ^ 0x10);
 }
 
 
 /**
+ * Whether this is a inverse blend factor
+ */
+static inline boolean
+is_inverse_factor(unsigned factor)
+{
+   STATIC_ASSERT(PIPE_BLENDFACTOR_ZERO == 0x11);
+   return factor > 0x11;
+}
+
+
+/**
+ * Calculates the (expanded to wider type) multiplication
+ * of 2 normalized numbers.
+ */
+static void
+lp_build_mul_norm_expand(struct lp_build_context *bld,
+                         LLVMValueRef a, LLVMValueRef b,
+                         LLVMValueRef *resl, LLVMValueRef *resh,
+                         boolean signedness_differs)
+{
+   const struct lp_type type = bld->type;
+   struct lp_type wide_type = lp_wider_type(type);
+   struct lp_type wide_type2 = wide_type;
+   struct lp_type type2 = type;
+   LLVMValueRef al, ah, bl, bh;
+
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+   assert(!type.floating && !type.fixed && type.norm);
+
+   if (a == bld->zero || b == bld->zero) {
+      LLVMValueRef zero = LLVMConstNull(lp_build_vec_type(bld->gallivm, wide_type));
+      *resl = zero;
+      *resh = zero;
+      return;
+   }
+
+   if (signedness_differs) {
+      type2.sign = !type.sign;
+      wide_type2.sign = !wide_type2.sign;
+   }
+
+   lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
+   lp_build_unpack2_native(bld->gallivm, type2, wide_type2, b, &bl, &bh);
+
+   *resl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
+   *resh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
+}
+
+
+/**
  * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendEquationSeparate.xml
  */
 LLVMValueRef
@@ -155,7 +210,7 @@ lp_build_blend(struct lp_build_context *bld,
             } else {
                return lp_build_lerp(bld, dst_factor, src, dst, 0);
             }
-         } else if(bld->type.floating && func == PIPE_BLEND_SUBTRACT) {
+         } else if (bld->type.floating && func == PIPE_BLEND_SUBTRACT) {
             result = lp_build_add(bld, src, dst);
 
             if (factor_src < factor_dst) {
@@ -165,7 +220,7 @@ lp_build_blend(struct lp_build_context *bld,
                result = lp_build_mul(bld, result, dst_factor);
                return lp_build_sub(bld, src, result);
             }
-         } else if(bld->type.floating && func == PIPE_BLEND_REVERSE_SUBTRACT) {
+         } else if (bld->type.floating && func == PIPE_BLEND_REVERSE_SUBTRACT) {
             result = lp_build_add(bld, src, dst);
 
             if (factor_src < factor_dst) {
@@ -192,9 +247,72 @@ lp_build_blend(struct lp_build_context *bld,
    if (optimise_only)
       return NULL;
 
-   src_term = lp_build_mul(bld, src, src_factor);
-   dst_term = lp_build_mul(bld, dst, dst_factor);
-   return lp_build_blend_func(bld, func, src_term, dst_term);
+   if ((bld->type.norm && bld->type.sign) &&
+       (is_inverse_factor(factor_src) || is_inverse_factor(factor_dst))) {
+      /*
+       * With snorm blending, the inverse blend factors range from [0,2]
+       * instead of [-1,1], so the ordinary signed normalized arithmetic
+       * doesn't quite work. Unpack must be unsigned, and the add/sub
+       * must be done with wider type.
+       * (Note that it's not quite obvious what the blend equation wrt to
+       * clamping should actually be based on GL spec in this case, but
+       * really the incoming src values are clamped to [-1,1] (the dst is
+       * always clamped already), and then NO further clamping occurs until
+       * the end.)
+       */
+      struct lp_build_context bldw;
+      struct lp_type wide_type = lp_wider_type(bld->type);
+      LLVMValueRef src_terml, src_termh, dst_terml, dst_termh;
+      LLVMValueRef resl, resh;
+
+      /*
+       * We don't need saturate math for the sub/add, since we have
+       * x+1 bit numbers in x*2 wide type (result is x+2 bits).
+       * (Doesn't really matter on x86 sse2 though as we use saturated
+       * intrinsics.)
+       */
+      wide_type.norm = 0;
+      lp_build_context_init(&bldw, bld->gallivm, wide_type);
+
+      /*
+       * XXX This is a bit hackish. Note that -128 really should
+       * be -1.0, the same as -127. However, we did not actually clamp
+       * things anywhere (relying on pack intrinsics instead) therefore
+       * we will get -128, and the inverted factor then 255. But the mul
+       * can overflow in this case (rather the rounding fixups for the mul,
+       * -128*255 will be positive).
+       * So we clamp the src and dst up here but only when necessary (we
+       * should do this before calculating blend factors but it's enough
+       * for avoiding overflow).
+       */
+      if (is_inverse_factor(factor_src)) {
+         src = lp_build_max(bld, src,
+                            lp_build_const_vec(bld->gallivm, bld->type, -1.0));
+      }
+      if (is_inverse_factor(factor_dst)) {
+         dst = lp_build_max(bld, dst,
+                            lp_build_const_vec(bld->gallivm, bld->type, -1.0));
+      }
+
+      lp_build_mul_norm_expand(bld, src, src_factor, &src_terml, &src_termh,
+                               is_inverse_factor(factor_src) ? TRUE : FALSE);
+      lp_build_mul_norm_expand(bld, dst, dst_factor, &dst_terml, &dst_termh,
+                               is_inverse_factor(factor_dst) ? TRUE : FALSE);
+      resl = lp_build_blend_func(&bldw, func, src_terml, dst_terml);
+      resh = lp_build_blend_func(&bldw, func, src_termh, dst_termh);
+
+      /*
+       * XXX pack2_native is not ok because the values have to be in dst
+       * range. We need native pack though for the correct order on avx2.
+       * Will break on everything not implementing clamping pack intrinsics
+       * (i.e. everything but sse2 and altivec).
+       */
+      return lp_build_pack2_native(bld->gallivm, wide_type, bld->type, resl, resh);
+   } else {
+      src_term = lp_build_mul(bld, src, src_factor);
+      dst_term = lp_build_mul(bld, dst, dst_factor);
+      return lp_build_blend_func(bld, func, src_term, dst_term);
+   }
 }
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index 45c5c2bb65e..c16ef1a2e91 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -112,22 +112,34 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
    case PIPE_BLENDFACTOR_DST_ALPHA:
       return bld->dst;
    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-      if(alpha)
+      if (alpha)
          return bld->base.one;
       else {
          /*
           * If there's no dst alpha the complement is zero but for unclamped
-          * float inputs min can be non-zero (negative).
+          * float inputs (or snorm inputs) min can be non-zero (negative).
           */
-         if (!bld->has_dst_alpha) {
-            if (!bld->saturate)
+         if (!bld->saturate) {
+            if (!bld->has_dst_alpha) {
                bld->saturate = lp_build_min(&bld->base, src_alpha, bld->base.zero);
-         }
-         else {
-            if(!bld->inv_dst)
-               bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
-            if(!bld->saturate)
+            }
+            else if (bld->base.type.norm && bld->base.type.sign) {
+               /*
+                * The complement/min totally doesn't work, since
+                * the complement is in range [0,2] but the other
+                * min input is [-1,1]. However, we can just clamp to 0
+                * before doing the complement...
+                */
+               LLVMValueRef inv_dst;
+               inv_dst = lp_build_max(&bld->base, bld->base.zero, bld->dst);
+               inv_dst = lp_build_comp(&bld->base, inv_dst);
+               bld->saturate = lp_build_min(&bld->base, src_alpha, inv_dst);
+            } else {
+               if (!bld->inv_dst) {
+                  bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
+               }
                bld->saturate = lp_build_min(&bld->base, src_alpha, bld->inv_dst);
+            }
          }
          return bld->saturate;
       }
@@ -140,24 +152,24 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
    case PIPE_BLENDFACTOR_SRC1_ALPHA:
       return src1_alpha;
    case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-      if(!bld->inv_src)
+      if (!bld->inv_src)
          bld->inv_src = lp_build_comp(&bld->base, bld->src);
       return bld->inv_src;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      if(!bld->inv_src_alpha)
+      if (!bld->inv_src_alpha)
          bld->inv_src_alpha = lp_build_comp(&bld->base, src_alpha);
       return bld->inv_src_alpha;
    case PIPE_BLENDFACTOR_INV_DST_COLOR:
    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-      if(!bld->inv_dst)
+      if (!bld->inv_dst)
          bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
       return bld->inv_dst;
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-      if(!bld->inv_const)
+      if (!bld->inv_const)
          bld->inv_const = lp_build_comp(&bld->base, bld->const_);
       return bld->inv_const;
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      if(!bld->inv_const_alpha)
+      if (!bld->inv_const_alpha)
          bld->inv_const_alpha = lp_build_comp(&bld->base, const_alpha);
       return bld->inv_const_alpha;
    case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
@@ -331,7 +343,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
    bld.const_alpha = const_alpha;
    bld.has_dst_alpha = FALSE;
 
-   /* Find the alpha channel if not provided seperately */
+   /* Find the alpha channel if not provided separately */
    if (!src_alpha) {
       for (i = 0; i < 4; ++i) {
          if (swizzle[i] == 3) {
@@ -349,7 +361,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
    }
 
    if (blend->logicop_enable) {
-      if(!type.floating) {
+      if (!type.floating) {
          result = lp_build_logicop(gallivm->builder, blend->logicop_func, src, dst);
       }
       else {
@@ -361,6 +373,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
       boolean rgb_alpha_same = (state->rgb_src_factor == state->rgb_dst_factor &&
                                 state->alpha_src_factor == state->alpha_dst_factor) ||
                                nr_channels == 1;
+      boolean alpha_only = nr_channels == 1 && alpha_swizzle == PIPE_SWIZZLE_X;
 
       src_factor = lp_build_blend_factor(&bld, state->rgb_src_factor,
                                          state->alpha_src_factor,
@@ -374,8 +387,8 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
 
       result = lp_build_blend(&bld.base,
                               state->rgb_func,
-                              state->rgb_src_factor,
-                              state->rgb_dst_factor,
+                              alpha_only ? state->alpha_src_factor : state->rgb_src_factor,
+                              alpha_only ? state->alpha_dst_factor : state->rgb_dst_factor,
                               src,
                               dst,
                               src_factor,
@@ -383,8 +396,8 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
                               rgb_alpha_same,
                               false);
 
-      if(state->rgb_func != state->alpha_func && nr_channels > 1 &&
-                            alpha_swizzle != PIPE_SWIZZLE_NONE) {
+      if (state->rgb_func != state->alpha_func && nr_channels > 1 &&
+          alpha_swizzle != PIPE_SWIZZLE_NONE) {
          LLVMValueRef alpha;
 
          alpha = lp_build_blend(&bld.base,