27 files changed, 1782 insertions, 157 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index af942ada2c2..cc442369630 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -415,6 +415,81 @@ lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
 
 
 /**
+ * Convert src to dst_type, picking num_dsts so lp_build_conv hits its
+ * optimized paths.  dst_type->length is rewritten when vectors get packed.
+ *
+ * Returns the number of dst vectors produced from the num_srcs src vectors.
+ */
+int lp_build_conv_auto(struct gallivm_state *gallivm,
+                       struct lp_type src_type,
+                       struct lp_type* dst_type,
+                       const LLVMValueRef *src,
+                       unsigned num_srcs,
+                       LLVMValueRef *dst)
+{
+   unsigned i;
+   int num_dsts = num_srcs;
+
+   /* Identical types: no conversion, pass the values through (dst may alias src) */
+   if (src_type.floating == dst_type->floating &&
+       src_type.width == dst_type->width &&
+       src_type.length == dst_type->length &&
+       src_type.fixed == dst_type->fixed &&
+       src_type.norm == dst_type->norm &&
+       src_type.sign == dst_type->sign) {
+      for (i = 0; i < num_srcs; ++i) {
+         dst[i] = src[i];
+      }
+      return num_dsts;
+   }
+
+   /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub */
+   if (src_type.floating == 1 &&
+       src_type.fixed    == 0 &&
+       src_type.sign     == 1 &&
+       src_type.norm     == 0 &&
+       src_type.width    == 32 &&
+
+       dst_type->floating == 0 &&
+       dst_type->fixed    == 0 &&
+       dst_type->sign     == 0 &&
+       dst_type->norm     == 1 &&
+       dst_type->width    == 8)
+   {
+      /* How many src vectors fold into one 16-wide dst vector (0 = no fast path) */
+      unsigned pack = 0;
+
+      if (src_type.length == 4 && util_cpu_caps.has_sse2) {
+         pack = 4;   /* 4x4f --> 1x16ub */
+      } else if (src_type.length == 8 && util_cpu_caps.has_avx) {
+         pack = 2;   /* 2x8f --> 1x16ub */
+      }
+
+      if (pack != 0) {
+         assert((num_srcs % pack) == 0);
+         num_dsts = num_srcs / pack;
+         dst_type->length = 16;
+
+         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
+         return num_dsts;
+      }
+   }
+
+   /* lp_build_resize does not support M:N */
+   if (src_type.width == dst_type->width) {
+      lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
+   } else {
+      /* Widths differ: convert the vectors one at a time (1:1) */
+      for (i = 0; i < num_srcs; ++i) {
+         lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1);
+      }
+   }
+
+   return num_dsts;
+}
+
+
+/**
  * Generic type conversion.
  *
  * TODO: Take a precision argument, or even better, add a new precision member
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
index ef221051bcd..42a11137473 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -70,6 +70,16 @@ lp_build_conv(struct gallivm_state *gallivm,
               const LLVMValueRef *srcs, unsigned num_srcs,
               LLVMValueRef *dsts, unsigned num_dsts);
 
+/** Convert with an automatically chosen num_dsts (returned); may change dst_type->length */
+int
+lp_build_conv_auto(struct gallivm_state *gallivm,
+                   struct lp_type src_type,
+                   struct lp_type* dst_type,
+                   const LLVMValueRef *src,
+                   unsigned num_srcs,
+                   LLVMValueRef *dst);
+
+
 void
 lp_build_conv_mask(struct gallivm_state *gallivm,
                    struct lp_type src_type,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 8a77a43dae8..f56b61bf248 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -560,7 +560,8 @@ LLVMValueRef
 lp_build_select_aos(struct lp_build_context *bld,
                     unsigned mask,
                     LLVMValueRef a,
-                    LLVMValueRef b)
+                    LLVMValueRef b,
+                    unsigned num_channels)
 {
    LLVMBuilderRef builder = bld->gallivm->builder;
    const struct lp_type type = bld->type;
@@ -594,8 +595,8 @@ lp_build_select_aos(struct lp_build_context *bld,
       LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
       LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
 
-      for(j = 0; j < n; j += 4)
-         for(i = 0; i < 4; ++i)
+      for(j = 0; j < n; j += num_channels)
+         for(i = 0; i < num_channels; ++i)
             shuffles[j + i] = LLVMConstInt(elem_type,
                                            (mask & (1 << i) ? 0 : n) + j + i,
                                            0);
@@ -603,7 +604,7 @@ lp_build_select_aos(struct lp_build_context *bld,
       return LLVMBuildShuffleVector(builder, a, b, LLVMConstVector(shuffles, n), "");
    }
    else {
-      LLVMValueRef mask_vec = lp_build_const_mask_aos(bld->gallivm, type, mask, 4);
+      LLVMValueRef mask_vec = lp_build_const_mask_aos(bld->gallivm, type, mask, num_channels);
       return lp_build_select(bld, mask_vec, a, b);
    }
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
index 64c0a1f5946..f5304240a59 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
@@ -79,7 +79,8 @@ LLVMValueRef
 lp_build_select_aos(struct lp_build_context *bld,
                     unsigned mask,
                     LLVMValueRef a,
-                    LLVMValueRef b);
+                    LLVMValueRef b,
+                    unsigned num_channels);
 
 
 LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index e57d4148870..b467d561e36 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -211,6 +211,42 @@ lp_build_concat(struct gallivm_state *gallivm,
    return tmp[0];
 }
 
+
+/**
+ * Combines vectors to reduce from num_srcs to num_dsts.
+ * Returns the number of src vectors concatenated in a single dst.
+ *
+ * num_srcs must be exactly divisible by num_dsts (and num_dsts non-zero).
+ *
+ * e.g. For num_srcs = 4 and src = [x, y, z, w]
+ *          num_dsts = 1  dst = [xyzw]    return = 4
+ *          num_dsts = 2  dst = [xy, zw]  return = 2
+ */
+int
+lp_build_concat_n(struct gallivm_state *gallivm,
+                  struct lp_type src_type,
+                  LLVMValueRef *src,
+                  unsigned num_srcs,
+                  LLVMValueRef *dst,
+                  unsigned num_dsts)
+{
+   unsigned size, i;
+
+   /* Validate before dividing: num_dsts == 0 would divide by zero */
+   assert(num_dsts > 0 && num_srcs >= num_dsts);
+   assert((num_srcs % num_dsts) == 0);
+   size = num_srcs / num_dsts;
+
+   /* size == 1 is a plain copy, otherwise join each run of size vectors */
+   for (i = 0; i < num_dsts; ++i) {
+      dst[i] = size == 1 ? src[i]
+                         : lp_build_concat(gallivm, &src[i * size], src_type, size);
+   }
+
+   return size;
+}
+
+
 /**
  * Interleave vector elements.
  *
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index f734c60b1d8..7cede35bbde 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -87,6 +87,15 @@ lp_build_concat(struct gallivm_state *gallivm,
                 struct lp_type src_type,
                 unsigned num_vectors);
 
+/** Concatenate num_srcs vectors down to num_dsts; returns the number of srcs joined per dst */
+int
+lp_build_concat_n(struct gallivm_state *gallivm,
+                  struct lp_type src_type,
+                  LLVMValueRef *src,
+                  unsigned num_srcs,
+                  LLVMValueRef *dst,
+                  unsigned num_dsts);
+
 LLVMValueRef
 lp_build_packs2(struct gallivm_state *gallivm,
                 struct lp_type src_type,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
index c7c58edd5a7..8a0efed655f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -31,6 +31,7 @@
 #include "lp_bld_const.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_quad.h"
+#include "lp_bld_pack.h"
 
 
 static const unsigned char
@@ -156,3 +157,52 @@ lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld,
       return LLVMBuildSub(builder, vec2, vec1, "ddxddyddxddy");
 }
 
+
+/**
+ * Twiddle pairs of vectors from quad format to row format (src_count is even)
+ *
+ *   src0      src1
+ * ######### #########      #################
+ * # 0 | 1 # # 4 | 5 #      # 0 | 1 | 4 | 5 # src0
+ * #---+---# #---+---#  ->  #################
+ * # 2 | 3 # # 6 | 7 #      # 2 | 3 | 6 | 7 # src1
+ * ######### #########      #################
+ *
+ */
+void
+lp_bld_quad_twiddle(struct gallivm_state *gallivm,
+                    struct lp_type lp_dst_type,
+                    const LLVMValueRef* src,
+                    unsigned src_count,
+                    LLVMValueRef* dst)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_type half_type = lp_dst_type;
+   LLVMTypeRef half_vec_type;
+   LLVMTypeRef dst_vec_type;
+   unsigned pair;
+
+   assert((src_count % 2) == 0);
+
+   /* Integer type splitting a vector in two halves: one element per quad row */
+   half_type.width = (lp_dst_type.width * lp_dst_type.length) / 2;
+   half_type.length = 2;
+   half_type.floating = 0;
+
+   half_vec_type = lp_build_vec_type(gallivm, half_type);
+   dst_vec_type = lp_build_vec_type(gallivm, lp_dst_type);
+
+   for (pair = 0; pair < src_count; pair += 2) {
+      LLVMValueRef quad0, quad1, row0, row1;
+
+      quad0 = LLVMBuildBitCast(builder, src[pair], half_vec_type, "");
+      quad1 = LLVMBuildBitCast(builder, src[pair + 1], half_vec_type, "");
+
+      /* Selector 0 builds the first output row, 1 the second (see diagram) */
+      row0 = lp_build_interleave2(gallivm, half_type, quad0, quad1, 0);
+      row1 = lp_build_interleave2(gallivm, half_type, quad0, quad1, 1);
+
+      dst[pair] = LLVMBuildBitCast(builder, row0, dst_vec_type, "");
+      dst[pair + 1] = LLVMBuildBitCast(builder, row1, dst_vec_type, "");
+   }
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.h b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
index be6a1efc396..e41f80efe2d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
@@ -88,5 +88,14 @@ LLVMValueRef
 lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld,
                                  LLVMValueRef a);
 
+/*
+ * Twiddle src_count (even) vectors from quad format to row format, into dst
+ */
+void
+lp_bld_quad_twiddle(struct gallivm_state *gallivm,
+                    struct lp_type lp_dst_type,
+                    const LLVMValueRef* src,
+                    unsigned src_count,
+                    LLVMValueRef* dst);
 
 #endif /* LP_BLD_QUAD_H_ */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 37490e47c85..8ea5f5e01dc 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -772,7 +772,7 @@ lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
          offset1 = LLVMBuildLoad(builder, offset1, "");
          offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
       }
-      offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0);
+      offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4);
    }
    else {
       unsigned i;
@@ -849,7 +849,7 @@ lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
          stride1 = LLVMBuildLoad(builder, stride1, "");
          stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
       }
-      stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0);
+      stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
    }
    else {
       LLVMValueRef stride1;
@@ -1045,11 +1045,11 @@ lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
          *out_width = size;
       }
       else if (bld->num_lods == num_quads) {
-         *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0);
+         *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
          if (dims >= 2) {
-            *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1);
+            *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
             if (dims == 3) {
-               *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2);
+               *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
             }
          }
       }
@@ -1246,9 +1246,9 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
       signrxyz = LLVMBuildBitCast(builder, rxyz, lp_build_vec_type(gallivm, intctype), "");
       signrxyz = LLVMBuildAnd(builder, signrxyz, signmask, "");
 
-      arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0);
-      arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1);
-      arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2);
+      arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0, 4);
+      arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1, 4);
+      arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2, 4);
 
       /*
        * select x if x >= y else select y
@@ -1267,15 +1267,15 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
        * snewz = signrz * rx;
        * tnewz = -ry;
        */
-      signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0);
+      signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0, 4);
       snewx = LLVMBuildXor(builder, signrxs, rzneg, "");
       tnewx = ryneg;
 
-      signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1);
+      signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1, 4);
       snewy = rx;
       tnewy = LLVMBuildXor(builder, signrys, rz, "");
 
-      signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2);
+      signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2, 4);
       snewz = LLVMBuildXor(builder, signrzs, rx, "");
       tnewz = ryneg;
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index 4ae4f3752a8..377884a78cf 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -159,21 +159,24 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm,
 
 
 /**
- * Swizzle one channel into all other three channels.
+ * Swizzle one channel into other channels.
  */
 LLVMValueRef
 lp_build_swizzle_scalar_aos(struct lp_build_context *bld,
                             LLVMValueRef a,
-                            unsigned channel)
+                            unsigned channel,
+                            unsigned num_channels)
 {
    LLVMBuilderRef builder = bld->gallivm->builder;
    const struct lp_type type = bld->type;
    const unsigned n = type.length;
    unsigned i, j;
 
-   if(a == bld->undef || a == bld->zero || a == bld->one)
+   if(a == bld->undef || a == bld->zero || a == bld->one || num_channels == 1)
       return a;
 
+   assert(num_channels == 2 || num_channels == 4);
+
    /* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing
     * using shuffles here actually causes worst results. More investigation is
     * needed. */
@@ -184,12 +187,55 @@ lp_build_swizzle_scalar_aos(struct lp_build_context *bld,
       LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
       LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
 
-      for(j = 0; j < n; j += 4)
-         for(i = 0; i < 4; ++i)
+      for(j = 0; j < n; j += num_channels)
+         for(i = 0; i < num_channels; ++i)
             shuffles[j + i] = LLVMConstInt(elem_type, j + channel, 0);
 
       return LLVMBuildShuffleVector(builder, a, bld->undef, LLVMConstVector(shuffles, n), "");
    }
+   else if (num_channels == 2) {
+      /*
+       * Bit mask and shifts
+       *
+       *   XY XY .... XY  <= input
+       *   0Y 0Y .... 0Y
+       *   YY YY .... YY
+       *   YY YY .... YY  <= output
+       */
+      struct lp_type type2;
+      LLVMValueRef tmp = NULL;
+      int shift;
+
+      a = LLVMBuildAnd(builder, a,
+                       lp_build_const_mask_aos(bld->gallivm,
+                                               type, 1 << channel, num_channels), "");
+
+      type2 = type;
+      type2.floating = FALSE;
+      type2.width *= 2;
+      type2.length /= 2;
+
+      a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type2), "");
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      shift = channel == 0 ? 1 : -1;
+#else
+      shift = channel == 0 ? -1 : 1;
+#endif
+
+      if (shift > 0) {
+         tmp = LLVMBuildShl(builder, a, lp_build_const_int_vec(bld->gallivm, type2, shift * type.width), "");
+      } else if (shift < 0) {
+         tmp = LLVMBuildLShr(builder, a, lp_build_const_int_vec(bld->gallivm, type2, -shift * type.width), "");
+      }
+
+      assert(tmp);
+      if (tmp) {
+         a = LLVMBuildOr(builder, a, tmp, "");
+      }
+
+      return LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type), "");
+   }
    else {
       /*
        * Bit mask and recursive shifts
@@ -247,6 +293,45 @@ lp_build_swizzle_scalar_aos(struct lp_build_context *bld,
 }
 
 
+/**
+ * Swizzle a vector consisting of an array of XYZW structs.
+ *
+ * This fills a vector of dst_len length with the swizzled channels from src.
+ *
+ * e.g. swizzles = { 2, 1, 0 }, num_swizzles = 3 and dst_len = 8 results in
+ *      RGBA RGBA = BGR BGR BG
+ *
+ * @param swizzles        the swizzle array (LP_BLD_SWIZZLE_DONTCARE gives undef)
+ * @param num_swizzles    the number of elements in swizzles, must be non-zero
+ * @param dst_len         the length of the result
+ */
+LLVMValueRef
+lp_build_swizzle_aos_n(struct gallivm_state* gallivm,
+                       LLVMValueRef src,
+                       const unsigned char* swizzles,
+                       unsigned num_swizzles,
+                       unsigned dst_len)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH];
+   unsigned i;
+
+   assert(num_swizzles > 0);   /* used as a modulus below */
+   assert(dst_len < LP_MAX_VECTOR_WIDTH);
+
+   for (i = 0; i < dst_len; ++i) {
+      int swizzle = swizzles[i % num_swizzles];   /* pattern repeats */
+      if (swizzle == LP_BLD_SWIZZLE_DONTCARE) {
+         shuffles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+      } else {
+         shuffles[i] = lp_build_const_int32(gallivm, swizzle);
+      }
+   }
+
+   return LLVMBuildShuffleVector(builder, src, LLVMGetUndef(LLVMTypeOf(src)), LLVMConstVector(shuffles, dst_len), "");
+}
+
+
 LLVMValueRef
 lp_build_swizzle_aos(struct lp_build_context *bld,
                      LLVMValueRef a,
@@ -272,7 +357,7 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
       case PIPE_SWIZZLE_GREEN:
       case PIPE_SWIZZLE_BLUE:
       case PIPE_SWIZZLE_ALPHA:
-         return lp_build_swizzle_scalar_aos(bld, a, swizzles[0]);
+         return lp_build_swizzle_scalar_aos(bld, a, swizzles[0], 4);
       case PIPE_SWIZZLE_ZERO:
          return bld->zero;
       case PIPE_SWIZZLE_ONE:
@@ -367,7 +452,7 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
             cond |= 1 << chan;
          }
       }
-      res = lp_build_select_aos(bld, cond, bld->one, bld->zero);
+      res = lp_build_select_aos(bld, cond, bld->one, bld->zero, 4);
 
       /*
        * Build a type where each element is an integer that cover the four
@@ -554,6 +639,44 @@ lp_build_transpose_aos(struct gallivm_state *gallivm,
 
 
 /**
+ * Transpose from AOS <-> SOA for num_srcs vectors (1, 2 or 4 supported)
+ */
+void
+lp_build_transpose_aos_n(struct gallivm_state *gallivm,
+                         struct lp_type type,
+                         const LLVMValueRef* src,
+                         unsigned num_srcs,
+                         LLVMValueRef* dst)
+{
+   if (num_srcs == 1) {
+      /* Single vector: handed through as is */
+      dst[0] = src[0];
+      return;
+   }
+
+   if (num_srcs == 2) {
+      /* Build both results before storing, in case src and dst alias */
+      LLVMValueRef first, second;
+
+      first = lp_build_interleave2_half(gallivm, type, src[0], src[1], 0);
+      second = lp_build_interleave2_half(gallivm, type, src[0], src[1], 1);
+
+      dst[0] = first;
+      dst[1] = second;
+      return;
+   }
+
+   if (num_srcs == 4) {
+      lp_build_transpose_aos(gallivm, type, src, dst);
+      return;
+   }
+
+   /* No other vector count is implemented */
+   assert(0);
+}
+
+
+/**
  * Pack n-th element of aos values,
  * pad out to destination size.
  * i.e. x1 y1 _ _ x2 y2 _ _ will become x1 x2 _ _
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
index c49d9167231..91ecd341476 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -67,13 +67,14 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm,
 
 
 /**
- * Broadcast one channel of a vector composed of arrays of XYZW structures into
- * all four channel.
+ * Broadcast one channel of a vector composed of arrays of XYZ.. structures into
+ * all channels XXX...
  */
 LLVMValueRef
 lp_build_swizzle_scalar_aos(struct lp_build_context *bld,
-                       LLVMValueRef a,
-                       unsigned channel);
+                            LLVMValueRef a,
+                            unsigned channel,
+                            unsigned num_channels);
 
 
 /**
@@ -88,6 +89,14 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
 
 
+/** Fill dst_len elements from src by cycling through the swizzles array */
 LLVMValueRef
+lp_build_swizzle_aos_n(struct gallivm_state* gallivm,
+                       LLVMValueRef src,
+                       const unsigned char* swizzles,
+                       unsigned num_swizzles,
+                       unsigned dst_len);
+
+LLVMValueRef
 lp_build_swizzle_soa_channel(struct lp_build_context *bld,
                              const LLVMValueRef *unswizzled,
                              unsigned swizzle);
@@ -113,6 +122,14 @@ lp_build_transpose_aos(struct gallivm_state *gallivm,
                        LLVMValueRef dst[4]);
 
 
+/** Transpose num_srcs (1, 2 or 4) vectors between AOS and SOA layout */
+void
+lp_build_transpose_aos_n(struct gallivm_state *gallivm,
+                         struct lp_type type,
+                         const LLVMValueRef* src,
+                         unsigned num_srcs,
+                         LLVMValueRef* dst);
+
 LLVMValueRef
 lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
                           struct lp_type src_type,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
index 44f684a1d01..dbd9ccb3b62 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -94,7 +94,7 @@ swizzle_scalar_aos(struct lp_build_tgsi_aos_context *bld,
                    unsigned chan)
 {
    chan = bld->swizzles[chan];
-   return lp_build_swizzle_scalar_aos(&bld->bld_base.base, a, chan);
+   return lp_build_swizzle_scalar_aos(&bld->bld_base.base, a, chan, 4);
 }
 
 
@@ -623,7 +623,7 @@ lp_emit_instruction_aos(
 
    case TGSI_OPCODE_EX2:
       src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
-      tmp0 = lp_build_swizzle_scalar_aos(&bld->bld_base.base, src0, TGSI_SWIZZLE_X);
+      tmp0 = lp_build_swizzle_scalar_aos(&bld->bld_base.base, src0, TGSI_SWIZZLE_X, TGSI_NUM_CHANNELS);
       dst0 = lp_build_exp2(&bld->bld_base.base, tmp0);
       break;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.h b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
index 68e55ac05b3..75e06d16ed8 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
@@ -60,10 +60,13 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
                    struct lp_type type,
                    unsigned rt,
                    LLVMValueRef src,
+                   LLVMValueRef src_alpha,
                    LLVMValueRef dst,
                    LLVMValueRef mask,
                    LLVMValueRef const_,
-                   const unsigned char swizzle[4]);
+                   LLVMValueRef const_alpha,
+                   const unsigned char swizzle[4],
+                   int nr_channels);
 
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index c615c2d6f5b..8be0b974287 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -66,14 +66,18 @@
 struct lp_build_blend_aos_context
 {
    struct lp_build_context base;
-   
+
    LLVMValueRef src;
+   LLVMValueRef src_alpha;
    LLVMValueRef dst;
    LLVMValueRef const_;
+   LLVMValueRef const_alpha;
 
    LLVMValueRef inv_src;
+   LLVMValueRef inv_src_alpha;
    LLVMValueRef inv_dst;
    LLVMValueRef inv_const;
+   LLVMValueRef inv_const_alpha;
    LLVMValueRef saturate;
 
    LLVMValueRef rgb_src_factor;
@@ -88,14 +92,18 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
                                  unsigned factor,
                                  boolean alpha)
 {
+   LLVMValueRef src_alpha = bld->src_alpha ? bld->src_alpha : bld->src;
+   LLVMValueRef const_alpha = bld->const_alpha ? bld->const_alpha : bld->const_;
+
    switch (factor) {
    case PIPE_BLENDFACTOR_ZERO:
       return bld->base.zero;
    case PIPE_BLENDFACTOR_ONE:
       return bld->base.one;
    case PIPE_BLENDFACTOR_SRC_COLOR:
-   case PIPE_BLENDFACTOR_SRC_ALPHA:
       return bld->src;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      return src_alpha;
    case PIPE_BLENDFACTOR_DST_COLOR:
    case PIPE_BLENDFACTOR_DST_ALPHA:
       return bld->dst;
@@ -106,32 +114,39 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
          if(!bld->inv_dst)
             bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
          if(!bld->saturate)
-            bld->saturate = lp_build_min(&bld->base, bld->src, bld->inv_dst);
+            bld->saturate = lp_build_min(&bld->base, src_alpha, bld->inv_dst);
          return bld->saturate;
       }
    case PIPE_BLENDFACTOR_CONST_COLOR:
-   case PIPE_BLENDFACTOR_CONST_ALPHA:
       return bld->const_;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      return const_alpha;
    case PIPE_BLENDFACTOR_SRC1_COLOR:
    case PIPE_BLENDFACTOR_SRC1_ALPHA:
       /* TODO */
       assert(0);
       return bld->base.zero;
    case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
       if(!bld->inv_src)
          bld->inv_src = lp_build_comp(&bld->base, bld->src);
       return bld->inv_src;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      if(!bld->inv_src_alpha)
+         bld->inv_src_alpha = lp_build_comp(&bld->base, src_alpha);
+      return bld->inv_src_alpha;
    case PIPE_BLENDFACTOR_INV_DST_COLOR:
    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
       if(!bld->inv_dst)
          bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
       return bld->inv_dst;
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
       if(!bld->inv_const)
          bld->inv_const = lp_build_comp(&bld->base, bld->const_);
       return bld->inv_const;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      if(!bld->inv_const_alpha)
+         bld->inv_const_alpha = lp_build_comp(&bld->base, const_alpha);
+      return bld->inv_const_alpha;
    case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
    case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
       /* TODO */
@@ -190,7 +205,8 @@ lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld,
                        LLVMValueRef rgb, 
                        LLVMValueRef alpha, 
                        enum lp_build_blend_swizzle rgb_swizzle,
-                       unsigned alpha_swizzle)
+                       unsigned alpha_swizzle,
+                       unsigned num_channels)
 {
    LLVMValueRef swizzled_rgb;
 
@@ -199,7 +215,7 @@ lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld,
       swizzled_rgb = rgb;
       break;
    case LP_BUILD_BLEND_SWIZZLE_AAAA:
-      swizzled_rgb = lp_build_swizzle_scalar_aos(&bld->base, rgb, alpha_swizzle);
+      swizzled_rgb = lp_build_swizzle_scalar_aos(&bld->base, rgb, alpha_swizzle, num_channels);
       break;
    default:
       assert(0);
@@ -208,13 +224,13 @@ lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld,
 
    if (rgb != alpha) {
       swizzled_rgb = lp_build_select_aos(&bld->base, 1 << alpha_swizzle,
-                                         alpha, swizzled_rgb);
+                                         alpha, swizzled_rgb,
+                                         num_channels);
    }
 
    return swizzled_rgb;
 }
 
-
 /**
  * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendFuncSeparate.xml
  */
@@ -222,17 +238,22 @@ static LLVMValueRef
 lp_build_blend_factor(struct lp_build_blend_aos_context *bld,
                       unsigned rgb_factor,
                       unsigned alpha_factor,
-                      unsigned alpha_swizzle)
+                      unsigned alpha_swizzle,
+                      unsigned num_channels)
 {
    LLVMValueRef rgb_factor_, alpha_factor_;
    enum lp_build_blend_swizzle rgb_swizzle;
 
+   if (alpha_swizzle == 0) {
+      return lp_build_blend_factor_unswizzled(bld, alpha_factor, TRUE);
+   }
+
    rgb_factor_ = lp_build_blend_factor_unswizzled(bld, rgb_factor, FALSE);
 
    if (alpha_swizzle != UTIL_FORMAT_SWIZZLE_NONE) {
       rgb_swizzle   = lp_build_blend_factor_swizzle(rgb_factor);
       alpha_factor_ = lp_build_blend_factor_unswizzled(bld, alpha_factor, TRUE);
-      return lp_build_blend_swizzle(bld, rgb_factor_, alpha_factor_, rgb_swizzle, alpha_swizzle);
+      return lp_build_blend_swizzle(bld, rgb_factor_, alpha_factor_, rgb_swizzle, alpha_swizzle, num_channels);
    } else {
       return rgb_factor_;
    }
@@ -261,18 +282,21 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
                    struct lp_type type,
                    unsigned rt,
                    LLVMValueRef src,
+                   LLVMValueRef src_alpha,
                    LLVMValueRef dst,
                    LLVMValueRef mask,
                    LLVMValueRef const_,
-                   const unsigned char swizzle[4])
+                   LLVMValueRef const_alpha,
+                   const unsigned char swizzle[4],
+                   int nr_channels)
 {
    const struct pipe_rt_blend_state * state = &blend->rt[rt];
    const struct util_format_description * desc;
    struct lp_build_blend_aos_context bld;
    LLVMValueRef src_factor, dst_factor;
    LLVMValueRef result;
-   unsigned alpha_swizzle = swizzle[3];
-   boolean fullcolormask;
+   unsigned alpha_swizzle = UTIL_FORMAT_SWIZZLE_NONE;
+   unsigned i;
 
    desc = util_format_description(cbuf_format[rt]);
 
@@ -282,20 +306,32 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
    bld.src = src;
    bld.dst = dst;
    bld.const_ = const_;
-
-   if (swizzle[3] > UTIL_FORMAT_SWIZZLE_W || swizzle[3] == swizzle[0])
-      alpha_swizzle = UTIL_FORMAT_SWIZZLE_NONE;
+   bld.src_alpha = src_alpha;
+   bld.const_alpha = const_alpha;
+
+   /* Find the alpha channel if not provided separately */
+   if (!src_alpha) {
+      for (i = 0; i < 4; ++i) {
+         if (swizzle[i] == 3) {
+            alpha_swizzle = i;
+         }
+      }
+   }
 
    if (!state->blend_enable) {
       result = src;
    } else {
-      boolean rgb_alpha_same = state->rgb_src_factor == state->rgb_dst_factor && state->alpha_src_factor == state->alpha_dst_factor;
-      assert(rgb_alpha_same || alpha_swizzle != UTIL_FORMAT_SWIZZLE_NONE);
+      boolean rgb_alpha_same = (state->rgb_src_factor == state->rgb_dst_factor && state->alpha_src_factor == state->alpha_dst_factor) || nr_channels == 1;
 
       src_factor = lp_build_blend_factor(&bld, state->rgb_src_factor,
-                                         state->alpha_src_factor, alpha_swizzle);
+                                         state->alpha_src_factor,
+                                         alpha_swizzle,
+                                         nr_channels);
+
       dst_factor = lp_build_blend_factor(&bld, state->rgb_dst_factor,
-                                         state->alpha_dst_factor, alpha_swizzle);
+                                         state->alpha_dst_factor,
+                                         alpha_swizzle,
+                                         nr_channels);
 
       result = lp_build_blend(&bld.base,
                               state->rgb_func,
@@ -308,7 +344,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
                               rgb_alpha_same,
                               false);
 
-      if(state->rgb_func != state->alpha_func && alpha_swizzle != UTIL_FORMAT_SWIZZLE_NONE) {
+      if(state->rgb_func != state->alpha_func && nr_channels > 1 && alpha_swizzle != UTIL_FORMAT_SWIZZLE_NONE) {
          LLVMValueRef alpha;
 
          alpha = lp_build_blend(&bld.base,
@@ -326,22 +362,27 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
                                          result,
                                          alpha,
                                          LP_BUILD_BLEND_SWIZZLE_RGBA,
-                                         alpha_swizzle);
+                                         alpha_swizzle,
+                                         nr_channels);
       }
    }
 
    /* Check if color mask is necessary */
-   fullcolormask = util_format_colormask_full(util_format_description(cbuf_format[rt]), state->colormask);
-
-   if (!fullcolormask) {
+   if (!util_format_colormask_full(desc, state->colormask)) {
       LLVMValueRef color_mask;
 
-      color_mask = lp_build_const_mask_aos_swizzled(gallivm, bld.base.type, state.colormask, desc->nr_channels, swizzle);
+      color_mask = lp_build_const_mask_aos_swizzled(gallivm, bld.base.type, state->colormask, nr_channels, swizzle);
       lp_build_name(color_mask, "color_mask");
 
       /* Combine with input mask if necessary */
       if (mask) {
+         /* We can be blending floating values but masks are always integer... */
+         unsigned floating = bld.base.type.floating;
+         bld.base.type.floating = 0;
+
          mask = lp_build_and(&bld.base, color_mask, mask);
+
+         bld.base.type.floating = floating;
       } else {
          mask = color_mask;
       }
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index 20c53cbcf7f..2667aeb24e6 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -128,7 +128,8 @@ lp_jit_create_types(struct lp_fragment_shader_variant *lp)
       elem_types[LP_JIT_CTX_ALPHA_REF] = LLVMFloatTypeInContext(lc);
       elem_types[LP_JIT_CTX_STENCIL_REF_FRONT] =
       elem_types[LP_JIT_CTX_STENCIL_REF_BACK] = LLVMInt32TypeInContext(lc);
-      elem_types[LP_JIT_CTX_BLEND_COLOR] = LLVMPointerType(LLVMInt8TypeInContext(lc), 0);
+      elem_types[LP_JIT_CTX_U8_BLEND_COLOR] = LLVMPointerType(LLVMInt8TypeInContext(lc), 0);
+      elem_types[LP_JIT_CTX_F_BLEND_COLOR] = LLVMPointerType(LLVMFloatTypeInContext(lc), 0);
       elem_types[LP_JIT_CTX_TEXTURES] = LLVMArrayType(texture_type,
                                                       PIPE_MAX_SAMPLERS);
 
@@ -153,9 +154,12 @@ lp_jit_create_types(struct lp_fragment_shader_variant *lp)
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, stencil_ref_back,
                              gallivm->target, context_type,
                              LP_JIT_CTX_STENCIL_REF_BACK);
-      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color,
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, u8_blend_color,
                              gallivm->target, context_type,
-                             LP_JIT_CTX_BLEND_COLOR);
+                             LP_JIT_CTX_U8_BLEND_COLOR);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, f_blend_color,
+                             gallivm->target, context_type,
+                             LP_JIT_CTX_F_BLEND_COLOR);
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures,
                              gallivm->target, context_type,
                              LP_JIT_CTX_TEXTURES);
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 94a2bb5ff36..b4699074656 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -103,8 +103,8 @@ struct lp_jit_context
 
    uint32_t stencil_ref_front, stencil_ref_back;
 
-   /* FIXME: store (also?) in floats */
-   uint8_t *blend_color;
+   uint8_t *u8_blend_color;
+   float *f_blend_color;
 
    struct lp_jit_texture textures[PIPE_MAX_SAMPLERS];
 };
@@ -119,7 +119,8 @@ enum {
    LP_JIT_CTX_ALPHA_REF,
    LP_JIT_CTX_STENCIL_REF_FRONT,
    LP_JIT_CTX_STENCIL_REF_BACK,
-   LP_JIT_CTX_BLEND_COLOR,
+   LP_JIT_CTX_U8_BLEND_COLOR,
+   LP_JIT_CTX_F_BLEND_COLOR,
    LP_JIT_CTX_TEXTURES,
    LP_JIT_CTX_COUNT
 };
@@ -137,14 +138,33 @@ enum {
 #define lp_jit_context_stencil_ref_back_value(_gallivm, _ptr) \
    lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_STENCIL_REF_BACK, "stencil_ref_back")
 
-#define lp_jit_context_blend_color(_gallivm, _ptr) \
-   lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_BLEND_COLOR, "blend_color")
+#define lp_jit_context_u8_blend_color(_gallivm, _ptr) \
+   lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_U8_BLEND_COLOR, "u8_blend_color")
+
+#define lp_jit_context_f_blend_color(_gallivm, _ptr) \
+   lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_F_BLEND_COLOR, "f_blend_color")
 
 #define lp_jit_context_textures(_gallivm, _ptr) \
    lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CTX_TEXTURES, "textures")
 
 
 
+/**
+ * typedef for fragment shader function
+ *
+ * @param context       jit context
+ * @param x             block start x
+ * @param y             block start y
+ * @param facing        is front facing
+ * @param a0            shader input a0
+ * @param dadx          shader input dadx
+ * @param dady          shader input dady
+ * @param color         color buffer
+ * @param depth         depth buffer
+ * @param mask          mask of visible pixels in block
+ * @param counter       pointer to the task's visibility counter (vis_counter)
+ * @param stride        color buffer row strides in bytes (one entry per color buffer)
+ */
 typedef void
 (*lp_jit_frag_func)(const struct lp_jit_context *context,
                     uint32_t x,
@@ -156,7 +176,8 @@ typedef void
                     uint8_t **color,
                     void *depth,
                     uint32_t mask,
-                    uint32_t *counter);
+                    uint32_t *counter,
+                    unsigned *stride);
 
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index d743d7689ae..3d83077fcbd 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -151,47 +151,70 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
 {
    const struct lp_scene *scene = task->scene;
-   const uint8_t *clear_color = arg.clear_color;
+   uint8_t clear_color[4];
 
    unsigned i;
+   boolean gray;
 
-   LP_DBG(DEBUG_RAST, "%s 0x%x,0x%x,0x%x,0x%x\n", __FUNCTION__, 
+   for (i = 0; i < 4; ++i) {
+      clear_color[i] = float_to_ubyte(arg.clear_color[i]);
+   }
+
+   LP_DBG(DEBUG_RAST, "%s 0x%x,0x%x,0x%x,0x%x\n", __FUNCTION__,
               clear_color[0],
               clear_color[1],
               clear_color[2],
               clear_color[3]);
 
-   if (clear_color[0] == clear_color[1] &&
-       clear_color[1] == clear_color[2] &&
-       clear_color[2] == clear_color[3]) {
-      /* clear to grayscale value {x, x, x, x} */
-      for (i = 0; i < scene->fb.nr_cbufs; i++) {
-         uint8_t *ptr =
-            lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL);
-	 memset(ptr, clear_color[0], TILE_SIZE * TILE_SIZE * 4);
-      }
-   }
-   else {
-      /* Non-gray color.
-       * Note: if the swizzled tile layout changes (see TILE_PIXEL) this code
-       * will need to change.  It'll be pretty obvious when clearing no longer
-       * works.
-       */
-      const unsigned chunk = TILE_SIZE / 4;
-      for (i = 0; i < scene->fb.nr_cbufs; i++) {
-         uint8_t *c =
-            lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL);
+   gray =
+         clear_color[0] == clear_color[1] &&
+         clear_color[1] == clear_color[2] &&
+         clear_color[2] == clear_color[3];
+
+   for (i = 0; i < scene->fb.nr_cbufs; i++) {
+      if (scene->cbufs[i].unswizzled) {
+         const struct lp_scene *scene = task->scene;
+         union util_color uc;
+
+         util_pack_color(arg.clear_color,
+                         scene->fb.cbufs[i]->format, &uc);
+
+         util_fill_rect(scene->cbufs[i].map,
+                        scene->fb.cbufs[i]->format,
+                        scene->cbufs[i].stride,
+                        task->x,
+                        task->y,
+                        TILE_SIZE,
+                        TILE_SIZE,
+                        &uc);
+      } else {
+         const unsigned chunk = TILE_SIZE / 4;
+         uint8_t *ptr;
          unsigned j;
 
-         for (j = 0; j < 4 * TILE_SIZE; j++) {
-            memset(c, clear_color[0], chunk);
-            c += chunk;
-            memset(c, clear_color[1], chunk);
-            c += chunk;
-            memset(c, clear_color[2], chunk);
-            c += chunk;
-            memset(c, clear_color[3], chunk);
-            c += chunk;
+         ptr = lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL);
+
+         if (gray) {
+            /* clear to grayscale value {x, x, x, x} */
+
+            memset(ptr, clear_color[0], TILE_SIZE * TILE_SIZE * 4);
+         } else {
+            /* Non-gray color.
+            * Note: if the swizzled tile layout changes (see TILE_PIXEL) this code
+            * will need to change.  It'll be pretty obvious when clearing no longer
+            * works.
+            */
+
+            for (j = 0; j < 4 * TILE_SIZE; j++) {
+               memset(ptr, clear_color[0], chunk);
+               ptr += chunk;
+               memset(ptr, clear_color[1], chunk);
+               ptr += chunk;
+               memset(ptr, clear_color[2], chunk);
+               ptr += chunk;
+               memset(ptr, clear_color[3], chunk);
+               ptr += chunk;
+            }
          }
       }
    }
@@ -311,7 +334,7 @@ lp_rast_store_linear_color( struct lp_rasterizer_task *task )
       const unsigned level = cbuf->u.tex.level;
       struct llvmpipe_resource *lpt = llvmpipe_resource(cbuf->texture);
 
-      if (!task->color_tiles[buf])
+      if (scene->cbufs[buf].unswizzled || !task->color_tiles[buf])
          continue;
 
       llvmpipe_unswizzle_cbuf_tile(lpt,
@@ -358,13 +381,20 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
    for (y = 0; y < TILE_SIZE; y += 4){
       for (x = 0; x < TILE_SIZE; x += 4) {
          uint8_t *color[PIPE_MAX_COLOR_BUFS];
+         unsigned stride[PIPE_MAX_COLOR_BUFS];
          uint32_t *depth;
          unsigned i;
 
          /* color buffer */
-         for (i = 0; i < scene->fb.nr_cbufs; i++)
-            color[i] = lp_rast_get_color_block_pointer(task, i,
-                                                       tile_x + x, tile_y + y);
+         for (i = 0; i < scene->fb.nr_cbufs; i++){
+            stride[i] = scene->cbufs[i].stride;
+
+            if (scene->cbufs[i].unswizzled) {
+               color[i] = lp_rast_get_unswizzled_color_block_pointer(task, i, tile_x + x, tile_y + y);
+            } else {
+               color[i] = lp_rast_get_color_block_pointer(task, i, tile_x + x, tile_y + y);
+            }
+         }
 
          /* depth buffer */
          depth = lp_rast_get_depth_block_pointer(task, tile_x + x, tile_y + y);
@@ -380,7 +410,8 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
                                             color,
                                             depth,
                                             0xffff,
-                                            &task->vis_counter);
+                                            &task->vis_counter,
+                                            stride);
          END_JIT_CALL();
       }
    }
@@ -408,7 +439,9 @@ lp_rast_shade_tile_opaque(struct lp_rasterizer_task *task,
 
    /* this will prevent converting the layout from tiled to linear */
    for (i = 0; i < scene->fb.nr_cbufs; i++) {
-      (void)lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL);
+      if (!scene->cbufs[i].unswizzled) {
+         (void)lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL);
+      }
    }
 
    lp_rast_shade_tile(task, arg);
@@ -431,6 +464,7 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
    struct lp_fragment_shader_variant *variant = state->variant;
    const struct lp_scene *scene = task->scene;
    uint8_t *color[PIPE_MAX_COLOR_BUFS];
+   unsigned stride[PIPE_MAX_COLOR_BUFS];
    void *depth;
    unsigned i;
 
@@ -447,15 +481,20 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
 
    /* color buffer */
    for (i = 0; i < scene->fb.nr_cbufs; i++) {
-      color[i] = lp_rast_get_color_block_pointer(task, i, x, y);
-      assert(lp_check_alignment(color[i], 16));
+      stride[i] = scene->cbufs[i].stride;
+
+      if (scene->cbufs[i].unswizzled) {
+         color[i] = lp_rast_get_unswizzled_color_block_pointer(task, i, x, y);
+      } else {
+         color[i] = lp_rast_get_color_block_pointer(task, i, x, y);
+      }
    }
 
    /* depth buffer */
    depth = lp_rast_get_depth_block_pointer(task, x, y);
 
 
-   assert(lp_check_alignment(state->jit_context.blend_color, 16));
+   assert(lp_check_alignment(state->jit_context.u8_blend_color, 16));
 
    /* run shader on 4x4 block */
    BEGIN_JIT_CALL(state, task);
@@ -468,7 +507,8 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
                                          color,
                                          depth,
                                          mask,
-                                         &task->vis_counter);
+                                         &task->vis_counter,
+                                         stride);
    END_JIT_CALL();
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index 49da41f0e5e..315601e1b6a 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -152,7 +152,7 @@ union lp_rast_cmd_arg {
       unsigned plane_mask;
    } triangle;
    const struct lp_rast_state *set_state;
-   uint8_t clear_color[4];
+   float clear_color[4];
    struct {
       uint32_t value;
       uint32_t mask;
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index b5d0074e894..eeb1a94138e 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -189,6 +189,7 @@ lp_rast_get_color_tile_pointer(struct lp_rasterizer_task *task,
    assert(task->x % TILE_SIZE == 0);
    assert(task->y % TILE_SIZE == 0);
    assert(buf < scene->fb.nr_cbufs);
+   assert(scene->cbufs[buf].unswizzled == 0);
 
    if (!task->color_tiles[buf]) {
       struct pipe_surface *cbuf = scene->fb.cbufs[buf];
@@ -211,6 +212,35 @@ lp_rast_get_color_tile_pointer(struct lp_rasterizer_task *task,
 
 
 /**
+ * Get pointer to the unswizzled color tile
+ */
+static INLINE uint8_t *
+lp_rast_get_unswizzled_color_tile_pointer(struct lp_rasterizer_task *task,
+                                          unsigned buf, enum lp_texture_usage usage)
+{
+   const struct lp_scene *scene = task->scene;
+   unsigned format_bytes;
+
+   assert(task->x < scene->tiles_x * TILE_SIZE);
+   assert(task->y < scene->tiles_y * TILE_SIZE);
+   assert(task->x % TILE_SIZE == 0);
+   assert(task->y % TILE_SIZE == 0);
+   assert(buf < scene->fb.nr_cbufs);
+   assert(scene->cbufs[buf].unswizzled);
+
+   if (!task->color_tiles[buf]) {
+      struct pipe_surface *cbuf = scene->fb.cbufs[buf];
+      assert(cbuf);
+
+      format_bytes = util_format_description(cbuf->format)->block.bits / 8;
+      task->color_tiles[buf] = scene->cbufs[buf].map + scene->cbufs[buf].stride * task->y + format_bytes * task->x;
+   }
+
+   return task->color_tiles[buf];
+}
+
+
+/**
  * Get the pointer to a 4x4 color block (within a 64x64 tile).
  * We'll map the color buffer on demand here.
  * Note that this may be called even when there's no color buffers - return
@@ -228,6 +258,8 @@ lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task,
    assert(y < task->scene->tiles_y * TILE_SIZE);
    assert((x % TILE_VECTOR_WIDTH) == 0);
    assert((y % TILE_VECTOR_HEIGHT) == 0);
+   assert(buf < task->scene->fb.nr_cbufs);
+   assert(task->scene->cbufs[buf].unswizzled == 0);
 
    color = lp_rast_get_color_tile_pointer(task, buf, LP_TEX_USAGE_READ_WRITE);
    assert(color);
@@ -243,6 +275,40 @@ lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task,
 }
 
 
+/**
+ * Get the pointer to an unswizzled 4x4 color block (within an unswizzled 64x64 tile).
+ * \param x, y location of 4x4 block in window coords
+ */
+static INLINE uint8_t *
+lp_rast_get_unswizzled_color_block_pointer(struct lp_rasterizer_task *task,
+                                           unsigned buf, unsigned x, unsigned y)
+{
+   unsigned px, py, pixel_offset, format_bytes;
+   uint8_t *color;
+
+   assert(x < task->scene->tiles_x * TILE_SIZE);
+   assert(y < task->scene->tiles_y * TILE_SIZE);
+   assert((x % TILE_VECTOR_WIDTH) == 0);
+   assert((y % TILE_VECTOR_HEIGHT) == 0);
+   assert(buf < task->scene->fb.nr_cbufs);
+   assert(task->scene->cbufs[buf].unswizzled);
+
+   format_bytes = util_format_description(task->scene->fb.cbufs[buf]->format)->block.bits / 8;
+
+   color = lp_rast_get_unswizzled_color_tile_pointer(task, buf, LP_TEX_USAGE_READ_WRITE);
+   assert(color);
+
+   px = x % TILE_SIZE;
+   py = y % TILE_SIZE;
+   pixel_offset = px * format_bytes + py * task->scene->cbufs[buf].stride;
+
+   color = color + pixel_offset;
+
+   assert(lp_check_alignment(color, llvmpipe_get_format_alignment(task->scene->fb.cbufs[buf]->format)));
+   return color;
+}
+
+
 
 /**
  * Shade all pixels in a 4x4 block.  The fragment code omits the
@@ -258,12 +324,20 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
    const struct lp_rast_state *state = task->state;
    struct lp_fragment_shader_variant *variant = state->variant;
    uint8_t *color[PIPE_MAX_COLOR_BUFS];
+   unsigned stride[PIPE_MAX_COLOR_BUFS];
    void *depth;
    unsigned i;
 
    /* color buffer */
-   for (i = 0; i < scene->fb.nr_cbufs; i++)
-      color[i] = lp_rast_get_color_block_pointer(task, i, x, y);
+   for (i = 0; i < scene->fb.nr_cbufs; i++) {
+      stride[i] = scene->cbufs[i].stride;
+
+      if (scene->cbufs[i].unswizzled) {
+         color[i] = lp_rast_get_unswizzled_color_block_pointer(task, i, x, y);
+      } else {
+         color[i] = lp_rast_get_color_block_pointer(task, i, x, y);
+      }
+   }
 
    depth = lp_rast_get_depth_block_pointer(task, x, y);
 
@@ -278,7 +352,8 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
                                       color,
                                       depth,
                                       0xffff,
-                                      &task->vis_counter );
+                                      &task->vis_counter,
+                                      stride );
    END_JIT_CALL();
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c
index ed998246fb9..515717bf021 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.c
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -150,6 +150,8 @@ lp_scene_begin_rasterization(struct lp_scene *scene)
                                                   cbuf->u.tex.first_layer,
                                                   LP_TEX_USAGE_READ_WRITE,
                                                   LP_TEX_LAYOUT_LINEAR);
+
+      scene->cbufs[i].unswizzled = llvmpipe_is_format_unswizzled(cbuf->format);
    }
 
    if (fb->zsbuf) {
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
index 622c522f11a..012fa672cf8 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.h
+++ b/src/gallium/drivers/llvmpipe/lp_scene.h
@@ -137,6 +137,7 @@ struct lp_scene {
       uint8_t *map;
       unsigned stride;
       unsigned blocksize;
+      unsigned unswizzled;
    } zsbuf, cbufs[PIPE_MAX_COLOR_BUFS];
    
    /** the framebuffer to render the scene into */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 60144c34bac..a06acb27b83 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -390,7 +390,7 @@ lp_setup_try_clear( struct lp_setup_context *setup,
 
    if (flags & PIPE_CLEAR_COLOR) {
       for (i = 0; i < 4; i++)
-         color_arg.clear_color[i] = float_to_ubyte(color[i]);
+         color_arg.clear_color[i] = color[i];
    }
 
    if (flags & PIPE_CLEAR_DEPTHSTENCIL) {
@@ -805,14 +805,26 @@ try_update_scene_state( struct lp_setup_context *setup )
 
    if(setup->dirty & LP_SETUP_NEW_BLEND_COLOR) {
       uint8_t *stored;
+      float* fstored;
       unsigned i, j;
+      unsigned size;
+
+      /* Alloc u8_blend_color (4 x 16 x i8) and f_blend_color (4 or 8 x f32) */
+      size  = 4 * 16 * sizeof(uint8_t);
+      size += (LP_MAX_VECTOR_LENGTH / 4) * sizeof(float);
+      stored = lp_scene_alloc_aligned(scene, size, LP_MAX_VECTOR_LENGTH);
 
-      stored = lp_scene_alloc_aligned(scene, 4 * 16, 16);
       if (!stored) {
          assert(!new_scene);
          return FALSE;
       }
 
+      /* Store floating point colour */
+      fstored = (float*)(stored + 4*16);
+      for (i = 0; i < (LP_MAX_VECTOR_LENGTH / 4); ++i) {
+         fstored[i] = setup->blend_color.current.color[i % 4];
+      }
+
       /* smear each blend color component across 16 ubyte elements */
       for (i = 0; i < 4; ++i) {
          uint8_t c = float_to_ubyte(setup->blend_color.current.color[i]);
@@ -821,7 +833,8 @@ try_update_scene_state( struct lp_setup_context *setup )
       }
 
       setup->blend_color.stored = stored;
-      setup->fs.current.jit_context.blend_color = setup->blend_color.stored;
+      setup->fs.current.jit_context.u8_blend_color = stored;
+      setup->fs.current.jit_context.f_blend_color = fstored;
       setup->dirty |= LP_SETUP_NEW_FS;
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index da20dedde3e..bf0217d6847 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -82,6 +82,10 @@
 #include "gallivm/lp_bld_swizzle.h"
 #include "gallivm/lp_bld_flow.h"
 #include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_pack.h"
+#include "gallivm/lp_bld_format.h"
+#include "gallivm/lp_bld_quad.h"
 
 #include "lp_bld_alpha.h"
 #include "lp_bld_blend.h"
@@ -719,7 +723,7 @@ generate_blend(struct gallivm_state *gallivm,
 
    vec_type = lp_build_vec_type(gallivm, type);
 
-   const_ptr = lp_jit_context_blend_color(gallivm, context_ptr);
+   const_ptr = lp_jit_context_u8_blend_color(gallivm, context_ptr);
    const_ptr = LLVMBuildBitCast(builder, const_ptr,
                                 LLVMPointerType(vec_type, 0), "");
 
@@ -752,6 +756,1010 @@ generate_blend(struct gallivm_state *gallivm,
 
 
 /**
+ * This function will reorder pixels from the fragment shader SoA to memory layout AoS
+ *
+ * Fragment Shader outputs pixels in small 2x2 blocks
+ *  e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ...
+ *
+ * However in memory pixels are stored in rows
+ *  e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ...
+ *
+ * @param type            fragment shader type (4x or 8x float)
+ * @param num_fs          number of fs_src
+ * @param dst_channels    number of output channels
+ * @param fs_src          output from fragment shader
+ * @param dst             pointer to store result
+ * @param pad_inline      is channel padding inline or at end of row
+ * @return                the number of dsts
+ */
+static int
+generate_fs_twiddle(struct gallivm_state *gallivm,
+                    struct lp_type type,
+                    unsigned num_fs,
+                    unsigned dst_channels,
+                    LLVMValueRef fs_src[][4],
+                    LLVMValueRef* dst,
+                    bool pad_inline)
+{
+   LLVMValueRef src[16];
+
+   bool swizzle_pad;
+   bool twiddle;
+   bool split;
+
+   unsigned pixels = num_fs == 4 ? 1 : 2;
+   unsigned reorder_group;
+   unsigned src_channels;
+   unsigned src_count;
+   unsigned i;
+
+   src_channels = dst_channels < 3 ? dst_channels : 4;
+   src_count = num_fs * src_channels;
+
+   assert(pixels == 2 || num_fs == 4);
+   assert(num_fs * src_channels <= Elements(src));
+
+   /*
+    * Transpose from SoA -> AoS
+    */
+   for (i = 0; i < num_fs; ++i) {
+      lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels, &src[i * src_channels]);
+   }
+
+   /*
+    * Pick transformation options
+    */
+   swizzle_pad = false;
+   twiddle = false;
+   split = false;
+   reorder_group = 0;
+
+   if (dst_channels == 1) {
+      twiddle = true;
+
+      if (pixels == 2) {
+         split = true;
+      }
+   } else if (dst_channels == 2) {
+      if (pixels == 1) {
+         reorder_group = 1;
+      }
+   } else if (dst_channels > 2) {
+      if (pixels == 1) {
+         reorder_group = 2;
+      } else {
+         twiddle = true;
+      }
+
+      if (!pad_inline && dst_channels == 3 && pixels > 1) {
+         swizzle_pad = true;
+      }
+   }
+
+   /*
+    * Split the src in half
+    */
+   if (split) {
+      for (i = num_fs; i > 0; --i) {
+         src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4);
+         src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4);
+      }
+
+      src_count *= 2;
+      type.length = 4;
+   }
+
+   /*
+    * Ensure pixels are in memory order
+    */
+   if (reorder_group) {
+      /* Twiddle pixels by reordering the array, e.g.:
+       *
+       * src_count =  8 -> 0 2 1 3 4 6 5 7
+       * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
+       */
+      const unsigned reorder_sw[] = { 0, 2, 1, 3 };
+
+      for (i = 0; i < src_count; ++i) {
+         unsigned group = i / reorder_group;
+         unsigned block = (group / 4) * 4 * reorder_group;
+         unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group);
+         dst[i] = src[j];
+      }
+   } else if (twiddle) {
+      /* Twiddle pixels across elements of array */
+      lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
+   } else {
+      /* Do nothing */
+      memcpy(dst, src, sizeof(LLVMValueRef) * src_count);
+   }
+
+   /*
+    * Moves any padding between pixels to the end
+    * e.g. RGBXRGBX -> RGBRGBXX
+    */
+   if (swizzle_pad) {
+      unsigned char swizzles[16];
+      unsigned elems = pixels * dst_channels;
+
+      for (i = 0; i < type.length; ++i) {
+         if (i < elems)
+            swizzles[i] = i % dst_channels + (i / dst_channels) * 4;
+         else
+            swizzles[i] = LP_BLD_SWIZZLE_DONTCARE;
+      }
+
+      for (i = 0; i < src_count; ++i) {
+         dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles, type.length, type.length);
+      }
+   }
+
+   return src_count;
+}
+
+
+/**
+ * Load an unswizzled block of pixels from memory
+ */
+static void
+load_unswizzled_block(struct gallivm_state *gallivm,
+                      LLVMValueRef base_ptr,
+                      LLVMValueRef stride,
+                      unsigned block_width,
+                      unsigned block_height,
+                      LLVMValueRef* dst,
+                      struct lp_type dst_type,
+                      unsigned dst_count)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   unsigned row_size = dst_count / block_height;
+   unsigned i;
+
+   /* Ensure block exactly fits into dst */
+   assert((block_width * block_height) % dst_count == 0);
+
+   for (i = 0; i < dst_count; ++i) {
+      unsigned x = i % row_size;
+      unsigned y = i / row_size;
+
+      LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
+      LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
+
+      LLVMValueRef gep[2];
+      LLVMValueRef dst_ptr;
+
+      gep[0] = lp_build_const_int32(gallivm, 0);
+      gep[1] = LLVMBuildAdd(builder, bx, by, "");
+
+      dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
+      dst_ptr = LLVMBuildBitCast(builder, dst_ptr, LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
+
+      dst[i] = LLVMBuildLoad(builder, dst_ptr, "");
+
+      if ((dst_type.length % 3) == 0) {
+         lp_set_load_alignment(dst[i], dst_type.width / 8);
+      }
+   }
+}
+
+
+/**
+ * Store an unswizzled block of pixels to memory
+ */
+static void
+store_unswizzled_block(struct gallivm_state *gallivm,
+                       LLVMValueRef base_ptr,
+                       LLVMValueRef stride,
+                       unsigned block_width,
+                       unsigned block_height,
+                       LLVMValueRef* src,
+                       struct lp_type src_type,
+                       unsigned src_count)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   unsigned row_size = src_count / block_height;
+   unsigned i;
+
+   /* Ensure src exactly fits into block */
+   assert((block_width * block_height) % src_count == 0);
+
+   for (i = 0; i < src_count; ++i) {
+      unsigned x = i % row_size;
+      unsigned y = i / row_size;
+
+      LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length);
+      LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
+
+      LLVMValueRef gep[2];
+      LLVMValueRef src_ptr;
+
+      gep[0] = lp_build_const_int32(gallivm, 0);
+      gep[1] = LLVMBuildAdd(builder, bx, by, "");
+
+      src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
+      src_ptr = LLVMBuildBitCast(builder, src_ptr, LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
+
+      src_ptr = LLVMBuildStore(builder, src[i], src_ptr);
+
+      if ((src_type.length % 3) == 0) {
+         lp_set_store_alignment(src_ptr, src_type.width / 8);
+      }
+   }
+}
+
+
+/**
+ * Checks if a format description is an arithmetic format
+ *
+ * A format with irregular (unequal or non-byte-multiple) channel sizes such as R3_G3_B2 or R5_G6_B5.
+ */
+static INLINE boolean
+is_arithmetic_format(const struct util_format_description *format_desc)
+{
+   boolean arith = false;
+   unsigned i;
+
+   for (i = 0; i < format_desc->nr_channels; ++i) {
+      arith |= format_desc->channel[i].size != format_desc->channel[0].size;
+      arith |= (format_desc->channel[i].size % 8) != 0;
+   }
+
+   return arith;
+}
+
+
+/**
+ * Retrieves the type representing the memory layout for a format
+ *
+ * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
+ */
+static INLINE void
+lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
+                             struct lp_type* type)
+{
+   int i;
+
+   memset(type, 0, sizeof(struct lp_type));
+   type->floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT;
+   type->fixed    = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED;
+   type->sign     = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED;
+   type->norm     = format_desc->channel[0].normalized;
+
+   if (is_arithmetic_format(format_desc)) {
+      type->width = 0;
+      type->length = 1;
+
+      for (i = 0; i < format_desc->nr_channels; ++i) {
+         type->width += format_desc->channel[i].size;
+      }
+   } else {
+      type->width = format_desc->channel[0].size;
+      type->length = format_desc->nr_channels;
+   }
+}
+
+
+/**
+ * Retrieves the type for a format which is usable in the blending code.
+ *
+ * e.g. RGBA16F = 4x float, R3G3B2 = 4x byte (3 channels padded to 4)
+ */
+static INLINE void
+lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
+                               struct lp_type* type)
+{
+   int i;
+
+   memset(type, 0, sizeof(struct lp_type));
+   type->floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT;
+   type->fixed    = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED;
+   type->sign     = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED;
+   type->norm     = format_desc->channel[0].normalized;
+   type->width    = format_desc->channel[0].size;
+   type->length   = format_desc->nr_channels;
+
+   for (i = 1; i < format_desc->nr_channels; ++i) {
+      if (format_desc->channel[i].size > type->width)
+         type->width = format_desc->channel[i].size;
+   }
+
+   if (type->floating) {
+      type->width = 32;
+   } else {
+      if (type->width <= 8) {
+         type->width = 8;
+      } else if (type->width <= 16) {
+         type->width = 16;
+      } else {
+         type->width = 32;
+      }
+   }
+
+   if (is_arithmetic_format(format_desc) && type->length == 3) {
+      type->length = 4;
+   }
+}
+
+
+/**
+ * Scale a normalised value from src_bits to dst_bits
+ *
+ * Narrowing drops the low bits. Widening moves the value to the top of the
+ * dst_bits range and replicates its leading bits into the freed low bits,
+ * so that an all-ones input yields an all-ones output.
+ * \param src_bits  number of significant bits in each element of src
+ * \param dst_bits  number of significant bits wanted in the result
+ * \param src       integer vector to scale
+ * \param src_type  type of src, used to build the shift constants
+ * \return the scaled vector, or src itself when src_bits == dst_bits
+ */
+static INLINE LLVMValueRef
+scale_bits(struct gallivm_state *gallivm,
+           int src_bits,
+           int dst_bits,
+           LLVMValueRef src,
+           struct lp_type src_type)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef result = src;
+
+   if (dst_bits < src_bits) {
+      /* Scale down by LShr */
+      result = LLVMBuildLShr(builder, src,
+                             lp_build_const_int_vec(gallivm, src_type, src_bits - dst_bits), "");
+   } else if (dst_bits > src_bits) {
+      /* Scale up bits */
+      int db = dst_bits - src_bits;
+
+      /* Shift left by difference in bits */
+      result = LLVMBuildShl(builder, src,
+                            lp_build_const_int_vec(gallivm, src_type, db), "");
+
+      if (db <= src_bits) {
+         /* Enough bits in src to fill the remainder; this includes
+          * db == src_bits, where the low half is an exact copy of src */
+         LLVMValueRef lower = LLVMBuildLShr(builder, src,
+                                            lp_build_const_int_vec(gallivm, src_type, src_bits - db), "");
+
+         result = LLVMBuildOr(builder, result, lower, "");
+      } else {
+         /* Need to repeatedly copy src bits to fill remainder in dst */
+         int n;
+
+         for (n = src_bits; n < dst_bits; n *= 2) {
+            LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
+            result = LLVMBuildOr(builder, result, LLVMBuildLShr(builder, result, shuv, ""), "");
+         }
+      }
+   }
+
+   return result;
+}
+
+
+/**
+ * Convert from memory format to blending format
+ *
+ * e.g. GL_R3G3B2 is 1 byte in memory but is expanded to 1 byte per channel
+ * for blending.  Non-arithmetic formats are only padded (and half-floats
+ * converted); arithmetic formats have each channel extracted and rescaled.
+ *
+ * \param src_fmt   description of the format as stored in memory
+ * \param src_type  type of the vectors in src
+ * \param dst_type  type the blending code expects
+ * \param src, dst  num_srcs input / output vectors, dst may alias src
+ */
+static void
+convert_to_blend_type(struct gallivm_state *gallivm,
+                      const struct util_format_description *src_fmt,
+                      struct lp_type src_type,
+                      struct lp_type dst_type,
+                      LLVMValueRef* src,
+                      unsigned num_srcs,
+                      LLVMValueRef* dst)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_type blend_type;
+   struct lp_type mem_type;
+   unsigned i, j, k;
+   unsigned pixels = 16 / num_srcs;
+   bool is_arith;
+
+   memmove(dst, src, sizeof(*dst) * num_srcs); /* dst may alias src, so not memcpy */
+
+   lp_mem_type_from_format_desc(src_fmt, &mem_type);
+   lp_blend_type_from_format_desc(src_fmt, &blend_type);
+
+   /* Is the format arithmetic */
+   is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length;
+   is_arith &= !(mem_type.width == 16 && mem_type.floating);
+
+   /* Pad if necessary */
+   if (!is_arith && src_type.length < dst_type.length) {
+      for (i = 0; i < num_srcs; ++i) {
+         dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length);
+      }
+
+      src_type.length = dst_type.length;
+   }
+
+   /* Special case for half-floats */
+   if (mem_type.width == 16 && mem_type.floating) {
+      assert(blend_type.width == 32 && blend_type.floating);
+      lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
+      is_arith = false;
+   }
+
+   if (!is_arith) {
+      return;
+   }
+
+   src_type.width = blend_type.width * blend_type.length;
+   blend_type.length *= pixels;
+   src_type.length *= pixels / (src_type.length / mem_type.length);
+
+   for (i = 0; i < num_srcs; ++i) {
+      LLVMValueRef chans[4];
+      LLVMValueRef res = NULL;
+      unsigned sa = 0;
+
+      dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
+
+      for (j = 0; j < src_fmt->nr_channels; ++j) {
+         unsigned mask = 0;
+
+         for (k = 0; k < src_fmt->channel[j].size; ++k) {
+            mask |= 1u << k; /* unsigned: 1 << 31 would overflow a signed int */
+         }
+
+         /* Extract bits from source */
+         chans[j] = LLVMBuildLShr(builder, dst[i],
+                                  lp_build_const_int_vec(gallivm, src_type, sa), "");
+         chans[j] = LLVMBuildAnd(builder, chans[j],
+                                 lp_build_const_int_vec(gallivm, src_type, mask), "");
+
+         /* Scale bits */
+         chans[j] = scale_bits(gallivm, src_fmt->channel[j].size, blend_type.width, chans[j], src_type);
+
+         /* Insert bits into correct position */
+         chans[j] = LLVMBuildShl(builder, chans[j],
+                                 lp_build_const_int_vec(gallivm, src_type, j * blend_type.width), "");
+
+         sa += src_fmt->channel[j].size;
+
+         if (j == 0) {
+            res = chans[j];
+         } else {
+            res = LLVMBuildOr(builder, res, chans[j], "");
+         }
+      }
+
+      dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), "");
+   }
+}
+
+
+/**
+ * Convert from blending format to memory format
+ *
+ * e.g. GL_R3G3B2 has 1 byte per channel for blending but is 1 byte in memory.
+ * Half-floats are converted back, padding is stripped from non-arithmetic
+ * formats and arithmetic formats have each channel rescaled and repacked.
+ *
+ * \param src_fmt   description of the format as stored in memory
+ * \param src_type  type of the vectors in src (the blend type)
+ * \param dst_type  type of the vectors written to memory
+ * \param src, dst  num_srcs input / output vectors, dst may alias src
+ */
+static void
+convert_from_blend_type(struct gallivm_state *gallivm,
+                        const struct util_format_description *src_fmt,
+                        struct lp_type src_type,
+                        struct lp_type dst_type,
+                        LLVMValueRef* src,
+                        unsigned num_srcs,
+                        LLVMValueRef* dst)
+{
+   unsigned i, j, k;
+   struct lp_type mem_type;
+   struct lp_type blend_type;
+   LLVMBuilderRef builder = gallivm->builder;
+   unsigned pixels = 16 / num_srcs;
+   bool is_arith;
+
+   memmove(dst, src, sizeof(*dst) * num_srcs); /* dst may alias src, so not memcpy */
+
+   lp_mem_type_from_format_desc(src_fmt, &mem_type);
+   lp_blend_type_from_format_desc(src_fmt, &blend_type);
+
+   is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length);
+
+   /* Special case for half-floats */
+   if (mem_type.width == 16 && mem_type.floating) {
+      int length = dst_type.length;
+      assert(blend_type.width == 32 && blend_type.floating);
+
+      dst_type.length = src_type.length;
+
+      lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
+
+      dst_type.length = length;
+      is_arith = false;
+   }
+
+   /* Remove any padding */
+   if (!is_arith && (src_type.length % mem_type.length)) {
+      src_type.length -= (src_type.length % mem_type.length);
+
+      for (i = 0; i < num_srcs; ++i) {
+         dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length);
+      }
+   }
+
+   /* No bit arithmetic to do */
+   if (!is_arith) {
+      return;
+   }
+
+   src_type.length = pixels;
+   src_type.width = blend_type.length * blend_type.width;
+   dst_type.length = pixels;
+
+   for (i = 0; i < num_srcs; ++i) {
+      LLVMValueRef chans[4];
+      LLVMValueRef res = NULL;
+      unsigned sa = 0;
+
+      dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
+
+      for (j = 0; j < src_fmt->nr_channels; ++j) {
+         unsigned mask = 0;
+
+         assert(blend_type.width > src_fmt->channel[j].size);
+
+         for (k = 0; k < blend_type.width; ++k) {
+            mask |= 1u << k; /* unsigned: 1 << 31 would overflow a signed int */
+         }
+
+         /* Extract bits */
+         chans[j] = LLVMBuildLShr(builder, dst[i],
+                                  lp_build_const_int_vec(gallivm, src_type, j * blend_type.width), "");
+         chans[j] = LLVMBuildAnd(builder, chans[j],
+                                 lp_build_const_int_vec(gallivm, src_type, mask), "");
+
+         /* Scale down bits */
+         chans[j] = scale_bits(gallivm, blend_type.width, src_fmt->channel[j].size, chans[j], src_type);
+
+         /* Insert bits */
+         chans[j] = LLVMBuildShl(builder, chans[j],
+                                 lp_build_const_int_vec(gallivm, src_type, sa), "");
+
+         sa += src_fmt->channel[j].size;
+
+         if (j == 0) {
+            res = chans[j];
+         } else {
+            res = LLVMBuildOr(builder, res, chans[j], "");
+         }
+      }
+
+      assert (dst_type.width != 24);
+
+      dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), "");
+   }
+}
+
+
+/**
+ * Generates the blend code for one unswizzled colour buffer (render target rt):
+ * loads a 4x4 pixel block from color_ptr, blends fs_out_color into it, stores it back.
+ */
+static void
+generate_unswizzled_blend(struct gallivm_state *gallivm,
+                          unsigned rt,
+                          struct lp_fragment_shader_variant *variant,
+                          enum pipe_format out_format,
+                          unsigned int num_fs,
+                          struct lp_type fs_type,
+                          LLVMValueRef* fs_mask,
+                          LLVMValueRef fs_out_color[TGSI_NUM_CHANNELS][4],
+                          LLVMValueRef context_ptr,
+                          LLVMValueRef color_ptr,
+                          LLVMValueRef stride,
+                          unsigned partial_mask,
+                          boolean do_branch)
+{
+   const unsigned alpha_channel = 3;
+   const unsigned block_width = 4;
+   const unsigned block_height = 4;
+   const unsigned block_size = block_width * block_height;
+   const unsigned lp_integer_vector_width = 128;
+
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
+   LLVMValueRef src_alpha[block_size];
+   LLVMValueRef src_mask[block_size];
+   LLVMValueRef src[block_size];
+   LLVMValueRef dst[block_size];
+   LLVMValueRef blend_color;
+   LLVMValueRef blend_alpha;
+   LLVMValueRef i32_zero;
+   LLVMValueRef check_mask;
+
+   struct lp_build_mask_context mask_ctx;
+   struct lp_type mask_type;
+   struct lp_type blend_type;
+   struct lp_type alpha_type;
+   struct lp_type row_type;
+   struct lp_type dst_type;
+
+   unsigned char swizzle[TGSI_NUM_CHANNELS];
+   unsigned vector_width;
+   unsigned dst_channels;
+   unsigned src_channels;
+   unsigned dst_count;
+   unsigned src_count;
+   unsigned i, j;
+
+   const struct util_format_description* out_format_desc = util_format_description(out_format);
+
+   bool pad_inline = is_arithmetic_format(out_format_desc);
+   bool has_alpha = false;
+
+   src_channels = TGSI_NUM_CHANNELS;
+   mask_type = lp_int32_vec4_type();
+   mask_type.length = fs_type.length;
+
+   /* Do not bother executing code when the combined mask of all fs outputs is empty */
+   if (do_branch) {
+      check_mask = LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type));
+
+      for (i = 0; i < num_fs; ++i) {
+         check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], "");
+      }
+
+      lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask);
+      lp_build_mask_check(&mask_ctx);
+   }
+
+   partial_mask |= !variant->opaque;
+   i32_zero = lp_build_const_int32(gallivm, 0);
+
+   /* Get the blending type (row_type) and memory type (dst_type) from output format */
+   lp_blend_type_from_format_desc(out_format_desc, &row_type);
+   lp_mem_type_from_format_desc(out_format_desc, &dst_type);
+
+   row_type.length = fs_type.length;
+   vector_width    = dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
+
+   /* Compute correct swizzle and count channels */
+   memset(swizzle, 0xFF, TGSI_NUM_CHANNELS); /* 0xFF = channel not assigned yet */
+   dst_channels = 0;
+
+   for (i = 0; i < TGSI_NUM_CHANNELS; ++i) {
+      /* Ensure channel is used */
+      if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) {
+         continue;
+      }
+
+      /* Ensure not already written to (happens in case with GL_ALPHA) */
+      if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) {
+         continue;
+      }
+
+      /* Ensure we haven't already found all channels */
+      if (dst_channels >= out_format_desc->nr_channels) {
+         continue;
+      }
+
+      swizzle[out_format_desc->swizzle[i]] = i;
+      ++dst_channels;
+
+      if (i == alpha_channel) {
+         has_alpha = true;
+      }
+   }
+
+   /* If 3 channels then pad to include alpha for 4 element transpose */
+   if (dst_channels == 3 && !has_alpha) {
+      swizzle[3] = 3;
+
+      if (out_format_desc->nr_channels == 4) {
+         dst_channels = 4;
+      }
+   }
+
+   /*
+    * Load shader output
+    */
+   for (i = 0; i < num_fs; ++i) {
+      /* Always load alpha for use in blending */
+      LLVMValueRef alpha = LLVMBuildLoad(builder, fs_out_color[alpha_channel][i], "");
+
+      /* Load each channel */
+      for (j = 0; j < dst_channels; ++j) {
+         fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[swizzle[j]][i], "");
+      }
+
+      /* If 3 channels then pad to include alpha for 4 element transpose */
+      if (dst_channels == 3 && !has_alpha) {
+         fs_src[i][3] = alpha;
+         swizzle[3] = 3;
+      }
+
+      /* We split the row_mask and row_alpha as we want 128bit interleave */
+      if (fs_type.length == 8) {
+         src_mask[i*2 + 0]  = lp_build_extract_range(gallivm, fs_mask[i], 0, src_channels);
+         src_mask[i*2 + 1]  = lp_build_extract_range(gallivm, fs_mask[i], src_channels, src_channels);
+
+         src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
+         src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels);
+      } else {
+         src_mask[i] = fs_mask[i];
+         src_alpha[i] = alpha;
+      }
+   }
+
+
+   /*
+    * Pixel twiddle from fragment shader order to memory order
+    */
+   src_count = generate_fs_twiddle(gallivm, fs_type, num_fs, dst_channels, fs_src, src, pad_inline);
+   src_channels = dst_channels < 3 ? dst_channels : 4;
+   if (src_count != num_fs * src_channels) {
+      unsigned ds = src_count / (num_fs * src_channels);
+      row_type.length /= ds;
+      fs_type.length = row_type.length;
+   }
+
+   blend_type = row_type;
+   alpha_type = fs_type;
+   alpha_type.length = 4;
+   mask_type.length = 4;
+
+   /* Convert src to row_type */
+   src_count = lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
+
+   /* If the rows are not a multiple of 128 bits (SSE vector size), combine them */
+   if ((row_type.width * row_type.length) % 128) {
+      unsigned bits = row_type.width * row_type.length;
+      unsigned combined;
+
+      dst_count = src_count / (vector_width / bits);
+      combined = lp_build_concat_n(gallivm, row_type, src, src_count, src, dst_count);
+
+      row_type.length *= combined;
+      src_count /= combined;
+
+      bits = row_type.width * row_type.length;
+      assert(bits == 128 || bits == 256);
+   }
+
+
+   /*
+    * Blend Colour conversion
+    */
+   blend_color = lp_jit_context_f_blend_color(gallivm, context_ptr);
+   blend_color = LLVMBuildPointerCast(builder, blend_color, LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), "");
+   blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color, &i32_zero, 1, ""), "");
+
+   /* Convert */
+   lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1, &blend_color, 1);
+
+   /* Extract alpha */
+   blend_alpha = lp_build_extract_broadcast(gallivm, blend_type, row_type, blend_color, lp_build_const_int32(gallivm, 3));
+
+   /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */
+   pad_inline &= (dst_channels * (block_size / src_count) * row_type.width) != vector_width;
+   if (pad_inline) {
+      /* Use all 4 channels e.g. from RGBA RGBA to RGxx RGxx */
+      blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, TGSI_NUM_CHANNELS, row_type.length);
+   } else {
+      /* Only use dst_channels e.g. RGBA RGBA to RG RG xxxx */
+      blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, dst_channels, row_type.length);
+   }
+
+   /*
+    * Mask conversion
+    */
+   lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0], 4, &src_mask[0]);
+
+   if (src_count < block_height) {
+      lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count);
+   } else if (src_count > block_height) {
+      for (i = src_count; i > 0; --i) {
+         unsigned pixels = block_size / src_count;
+         unsigned idx = i - 1;
+
+         src_mask[idx] = lp_build_extract_range(gallivm, src_mask[(idx * pixels) / 4], (idx * pixels) % 4, pixels);
+      }
+   }
+
+   assert(mask_type.width == 32);
+
+   for (i = 0; i < src_count; ++i) {
+      unsigned pixels = block_size / src_count;
+      unsigned pixel_width = row_type.width * dst_channels;
+
+      if (pixel_width == 24) {
+         mask_type.width = 8;
+         mask_type.length = vector_width / mask_type.width;
+      } else {
+         mask_type.length = pixels;
+         mask_type.width = row_type.width * dst_channels;
+
+         src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
+
+         mask_type.length *= dst_channels;
+         mask_type.width /= dst_channels;
+      }
+
+      src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
+      src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
+   }
+
+   /*
+    * Alpha conversion
+    */
+   if (!has_alpha) {
+      unsigned length = row_type.length;
+      row_type.length = alpha_type.length;
+
+      /* Twiddle the alpha to match pixels */
+      lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, 4, src_alpha);
+
+      for (i = 0; i < 4; ++i) {
+         lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1, &src_alpha[i], 1);
+      }
+
+      alpha_type = row_type;
+      row_type.length = length;
+
+      /* If only one channel we only need a single alpha value per pixel */
+      if (src_count == 1) {
+         assert(dst_channels == 1);
+
+         lp_build_concat_n(gallivm, alpha_type, src_alpha, 4, src_alpha, src_count);
+      } else {
+         /* If there are more srcs than rows then we need to split alpha up */
+         if (src_count > block_height) {
+            for (i = src_count; i > 0; --i) {
+               unsigned pixels = block_size / src_count;
+               unsigned idx = i - 1;
+
+               src_alpha[idx] = lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4], (idx * pixels) % 4, pixels);
+            }
+         }
+
+         /* If there is a src for each pixel broadcast the alpha across whole row */
+         if (src_count == block_size) {
+            for (i = 0; i < src_count; ++i) {
+               src_alpha[i] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, row_type), src_alpha[i]);
+            }
+         } else {
+            unsigned pixels = block_size / src_count;
+            unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
+            unsigned alpha_span = 1;
+
+            /* Check if we need 2 src_alphas for our shuffles */
+            if (pixels > alpha_type.length) {
+               alpha_span = 2;
+            }
+
+            /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
+            for (i = 0; i < src_count; ++i) {
+               LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+               unsigned idx1 = i, idx2 = i;
+
+               if (alpha_span > 1){
+                  idx1 *= alpha_span;
+                  idx2 = idx1 + 1;
+               }
+
+               for (j = 0; j < row_type.length; ++j) {
+                  if (j < pixels * channels) {
+                     shuffles[j] = lp_build_const_int32(gallivm, j / channels);
+                  } else {
+                     shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+                  }
+               }
+
+               src_alpha[i] = LLVMBuildShuffleVector(builder,
+                                                     src_alpha[idx1],
+                                                     src_alpha[idx2],
+                                                     LLVMConstVector(shuffles, row_type.length),
+                                                     "");
+            }
+         }
+      }
+   }
+
+
+   /*
+    * Load dst from memory
+    */
+   if (src_count < block_height) {
+      dst_count = block_height;
+   } else {
+      dst_count = src_count;
+   }
+
+   dst_type.length *= 16 / dst_count;
+
+   load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, dst, dst_type, dst_count);
+
+
+   /*
+    * Convert from dst/output format to src/blending format.
+    *
+    * This is necessary as we can only read 1 row from memory at a time,
+    * so the minimum dst_count can ever be at this point is 4.
+    *
+    * With, for example, R8 format you can have all 16 pixels in a 128 bit vector,
+    * this will take the 4 dsts and combine them into 1 src so we can perform blending
+    * on all 16 pixels in that single vector at once.
+    */
+   if (dst_count > src_count) {
+      lp_build_concat_n(gallivm, dst_type, dst, 4, dst, src_count);
+   }
+
+   /*
+    * Blending
+    */
+   convert_to_blend_type(gallivm, out_format_desc, dst_type, row_type, dst, src_count, dst);
+
+   for (i = 0; i < src_count; ++i) {
+      dst[i] = lp_build_blend_aos(gallivm,
+                                  &variant->key.blend,
+                                  variant->key.cbuf_format,
+                                  row_type,
+                                  rt,
+                                  src[i],
+                                  has_alpha ? NULL : src_alpha[i],
+                                  dst[i],
+                                  partial_mask ? src_mask[i] : NULL,
+                                  blend_color,
+                                  has_alpha ? NULL : blend_alpha,
+                                  swizzle,
+                                  pad_inline ? 4 : dst_channels);
+   }
+
+   convert_from_blend_type(gallivm, out_format_desc, row_type, dst_type, dst, src_count, dst);
+
+   /* Split the blend rows back to memory rows */
+   if (dst_count > src_count) {
+      row_type.length = dst_type.length * (dst_count / src_count);
+
+      if (src_count == 1) {
+         dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2);
+         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
+
+         row_type.length /= 2;
+         src_count *= 2;
+      }
+
+      dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2, row_type.length / 2);
+      dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2);
+      dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2);
+      dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
+
+      row_type.length /= 2;
+      src_count *= 2;
+   }
+
+
+   /*
+    * Store blend result to memory
+    */
+   store_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, dst, dst_type, dst_count);
+
+   if (do_branch) {
+      lp_build_mask_end(&mask_ctx);
+   }
+}
+
+
+/**
  * Generate the runtime callable function for the whole fragment pipeline.
  * Note that the function which we generate operates on a block of 16
  * pixels at at time.  The block contains 2x2 quads.  Each quad contains
@@ -771,7 +1779,7 @@ generate_fragment(struct llvmpipe_context *lp,
    struct lp_type blend_type;
    LLVMTypeRef fs_elem_type;
    LLVMTypeRef blend_vec_type;
-   LLVMTypeRef arg_types[11];
+   LLVMTypeRef arg_types[12];
    LLVMTypeRef func_type;
    LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
    LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
@@ -782,6 +1790,7 @@ generate_fragment(struct llvmpipe_context *lp,
    LLVMValueRef dadx_ptr;
    LLVMValueRef dady_ptr;
    LLVMValueRef color_ptr_ptr;
+   LLVMValueRef stride_ptr;
    LLVMValueRef depth_ptr;
    LLVMValueRef mask_input;
    LLVMValueRef counter = NULL;
@@ -867,6 +1876,7 @@ generate_fragment(struct llvmpipe_context *lp,
    arg_types[8] = LLVMPointerType(int8_type, 0);       /* depth */
    arg_types[9] = int32_type;                          /* mask_input */
    arg_types[10] = LLVMPointerType(int32_type, 0);     /* counter */
+   arg_types[11] = LLVMPointerType(int32_type, 0);     /* stride */
 
    func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
                                 arg_types, Elements(arg_types), 0);
@@ -893,6 +1903,7 @@ generate_fragment(struct llvmpipe_context *lp,
    color_ptr_ptr = LLVMGetParam(function, 7);
    depth_ptr    = LLVMGetParam(function, 8);
    mask_input   = LLVMGetParam(function, 9);
+   stride_ptr   = LLVMGetParam(function, 11);
 
    lp_build_name(context_ptr, "context");
    lp_build_name(x, "x");
@@ -903,6 +1914,7 @@ generate_fragment(struct llvmpipe_context *lp,
    lp_build_name(color_ptr_ptr, "color_ptr_ptr");
    lp_build_name(depth_ptr, "depth");
    lp_build_name(mask_input, "mask_input");
+   lp_build_name(stride_ptr, "stride_ptr");
 
    if (key->occlusion_count) {
       counter = LLVMGetParam(function, 10);
@@ -1048,54 +2060,56 @@ generate_fragment(struct llvmpipe_context *lp,
       LLVMValueRef color_ptr;
       LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);
       LLVMValueRef blend_in_color[TGSI_NUM_CHANNELS];
-      unsigned rt;
-
-      /* 
-       * Convert the fs's output color and mask to fit to the blending type.
-       */
-      for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
-         LLVMValueRef fs_color_vals[LP_MAX_VECTOR_LENGTH];
-         
-         for (i = 0; i < num_fs; i++) {
-            fs_color_vals[i] =
-               LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals");
-         }
-
-         lp_build_conv(gallivm, fs_type, blend_type,
-                       fs_color_vals,
-                       num_fs,
-                       &blend_in_color[chan], 1);
+      unsigned rt = key->blend.independent_blend_enable ? cbuf : 0;
 
-         lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
-      }
-
-      if (partial_mask || !variant->opaque) {
-         lp_build_conv_mask(variant->gallivm, fs_type, blend_type,
-                            fs_mask, num_fs,
-                            &blend_mask, 1);
-      } else {
-         blend_mask = lp_build_const_int_vec(variant->gallivm, blend_type, ~0);
-      }
+      boolean do_branch = ((key->depth.enabled
+                            || key->stencil[0].enabled
+                            || key->alpha.enabled)
+                           && !shader->info.base.uses_kill);
 
-      color_ptr = LLVMBuildLoad(builder, 
+      color_ptr = LLVMBuildLoad(builder,
                                 LLVMBuildGEP(builder, color_ptr_ptr, &index, 1, ""),
                                 "");
+
       lp_build_name(color_ptr, "color_ptr%d", cbuf);
 
-      /* which blend/colormask state to use */
-      rt = key->blend.independent_blend_enable ? cbuf : 0;
+      if (variant->unswizzled_cbufs & (1 << cbuf)) {
+         LLVMValueRef stride = LLVMBuildLoad(builder,
+                                             LLVMBuildGEP(builder, stride_ptr, &index, 1, ""),
+                                             "");
 
-      /*
-       * Blending.
-       */
-      {
-         /* Could the 4x4 have been killed?
+         generate_unswizzled_blend(gallivm, rt, variant, key->cbuf_format[cbuf],
+                                   num_fs, fs_type, fs_mask, fs_out_color[cbuf],
+                                   context_ptr, color_ptr, stride, partial_mask, do_branch);
+      } else {
+         /*
+          * Convert the fs's output color and mask to fit to the blending type.
           */
-         boolean do_branch = ((key->depth.enabled || key->stencil[0].enabled) &&
-                              !key->alpha.enabled &&
-                              !shader->info.base.uses_kill);
+         for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+            LLVMValueRef fs_color_vals[LP_MAX_VECTOR_LENGTH];
+
+            for (i = 0; i < num_fs; i++) {
+               fs_color_vals[i] =
+                     LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals");
+            }
+
+            lp_build_conv(gallivm, fs_type, blend_type,
+                          fs_color_vals,
+                          num_fs,
+                          &blend_in_color[chan], 1);
+
+            lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
+         }
+
+         if (partial_mask || !variant->opaque) {
+            lp_build_conv_mask(gallivm, fs_type, blend_type,
+                               fs_mask, num_fs,
+                               &blend_mask, 1);
+         } else {
+            blend_mask = lp_build_const_int_vec(gallivm, blend_type, ~0);
+         }
 
-         generate_blend(variant->gallivm,
+         generate_blend(gallivm,
                         &key->blend,
                         rt,
                         builder,
@@ -1221,6 +2235,7 @@ generate_variant(struct llvmpipe_context *lp,
    struct lp_fragment_shader_variant *variant;
    const struct util_format_description *cbuf0_format_desc;
    boolean fullcolormask;
+   unsigned i;
 
    variant = CALLOC_STRUCT(lp_fragment_shader_variant);
    if(!variant)
@@ -1258,6 +2273,9 @@ generate_variant(struct llvmpipe_context *lp,
          !shader->info.base.uses_kill
          ? TRUE : FALSE;
 
+   for (i = 0; i < key->nr_cbufs; ++i) {
+      variant->unswizzled_cbufs |= llvmpipe_is_format_unswizzled(key->cbuf_format[i]) << i;
+   }
 
    if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
       lp_debug_fs_variant(variant);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h
index 306f5f9669a..173d2f452ca 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h
@@ -94,6 +94,9 @@ struct lp_fragment_shader_variant
 
    lp_jit_frag_func jit_function[2];
 
+   /* Bitmask to say what cbufs are unswizzled */
+   unsigned unswizzled_cbufs;
+
    /* Total number of LLVM instructions generated */
    unsigned nr_instrs;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 37b37fda40e..9ceb4a6fdcf 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -195,7 +195,7 @@ add_blend_test(struct gallivm_state *gallivm,
       dst = LLVMBuildLoad(builder, dst_ptr, "dst");
       con = LLVMBuildLoad(builder, const_ptr, "const");
 
-      res = lp_build_blend_aos(gallivm, blend, &format, type, rt, src, dst, NULL, con, swizzle);
+      res = lp_build_blend_aos(gallivm, blend, &format, type, rt, src, NULL, dst, NULL, con, NULL, swizzle, 4);
 
       lp_build_name(res, "res");
 
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
index f61e3781406..3d0acdfeb8f 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -756,6 +756,73 @@ llvmpipe_is_resource_referenced( struct pipe_context *pipe,
    return lp_setup_is_resource_referenced(llvmpipe->setup, presource);
 }
 
+/**
+ * Decide whether a format counts as unswizzled.
+ */
+boolean
+llvmpipe_is_format_unswizzled( enum pipe_format format )
+{
+   const struct util_format_description *desc = util_format_description(format);
+   unsigned c;
+
+   /* The two 8-bit BGR(A/X) unorm formats are explicitly excluded. */
+   if (format == PIPE_FORMAT_B8G8R8X8_UNORM || format == PIPE_FORMAT_B8G8R8A8_UNORM)
+      return FALSE;
+
+   /* Only plain, RGB-colorspace formats with 1x1 blocks qualify. */
+   if (!(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+         desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
+         desc->block.width == 1 &&
+         desc->block.height == 1))
+      return FALSE;
+
+   /* Every channel must agree with channel 0; a VOID final channel is skipped. */
+   for (c = 0; c < desc->nr_channels; c++) {
+      const boolean trailing_void =
+         desc->channel[c].type == UTIL_FORMAT_TYPE_VOID && c + 1 == desc->nr_channels;
+
+      if (!trailing_void &&
+          (desc->channel[c].type != desc->channel[0].type ||
+           desc->channel[c].normalized != desc->channel[0].normalized ||
+           desc->channel[c].pure_integer != desc->channel[0].pure_integer))
+         return FALSE;
+   }
+
+   /* All code assumes alpha is the last channel */
+   if (desc->nr_channels == 4 && desc->swizzle[3] < 3)
+      return FALSE;
+
+   return TRUE;
+}
+
+
+/**
+ * Returns the largest possible alignment for a format in llvmpipe.
+ *
+ * The channel sizes are summed and divided by 8 to get a byte count; if that
+ * count is not a power of two it is divided by the number of channels.
+ * A zero or odd result yields 1.
+ */
+unsigned
+llvmpipe_get_format_alignment( enum pipe_format format )
+{
+   const struct util_format_description *desc = util_format_description(format);
+   unsigned total_bits = 0;
+   unsigned align;
+   unsigned chan;
+
+   /* Total size of all channels. */
+   for (chan = 0; chan < desc->nr_channels; chan++)
+      total_bits += desc->channel[chan].size;
+
+   align = total_bits / 8;
+
+   if (!util_is_power_of_two(align))
+      align /= desc->nr_channels;
+
+   /* Zero or odd byte counts fall back to byte alignment. */
+   return (align == 0 || (align & 1)) ? 1 : align;
+}
 
 
 /**
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h b/src/gallium/drivers/llvmpipe/lp_texture.h
index 73eaddc27ed..67b254021d6 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.h
+++ b/src/gallium/drivers/llvmpipe/lp_texture.h
@@ -256,4 +256,10 @@ llvmpipe_is_resource_referenced( struct pipe_context *pipe,
                                  struct pipe_resource *presource,
                                  unsigned level, int layer);
 
+boolean
+llvmpipe_is_format_unswizzled(enum pipe_format format);
+
+unsigned
+llvmpipe_get_format_alignment(enum pipe_format format);
+
 #endif /* LP_TEXTURE_H */