/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


#include "pipe/p_defines.h"

#include "util/format/u_format.h"
#include "util/u_memory.h"
#include "util/u_string.h"
#include "util/u_math.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_gather.h"
#include "lp_bld_debug.h"
#include "lp_bld_format.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_flow.h"
#include "lp_bld_printf.h"
#include "lp_bld_intr.h"

static void
convert_to_soa(struct gallivm_state *gallivm,
               LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
               LLVMValueRef dst_soa[4],
               const struct lp_type soa_type)
{
   unsigned j, k;
   struct lp_type aos_channel_type = soa_type;

   LLVMValueRef aos_channels[4];
   unsigned pixels_per_channel = soa_type.length / 4;

   debug_assert((soa_type.length % 4) == 0);

   aos_channel_type.length >>= 1;

   for (j = 0; j < 4; ++j) {
      LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };

      assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);

      for (k = 0; k < pixels_per_channel; ++k) {
         channel[k] = src_aos[j + 4 * k];
      }

      aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
   }

   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
}


void
lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
                            struct lp_build_context *bld,
                            const LLVMValueRef *unswizzled,
                            LLVMValueRef swizzled_out[4])
{
   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
      enum pipe_swizzle swizzle;
      LLVMValueRef depth_or_stencil;

      if (util_format_has_stencil(format_desc) &&
          !util_format_has_depth(format_desc)) {
         assert(!bld->type.floating);
         swizzle = format_desc->swizzle[1];
      }
      else {
         assert(bld->type.floating);
         swizzle = format_desc->swizzle[0];
      }
      /*
       * Return zzz1 or sss1 for depth-stencil formats here.
       * Correct swizzling will be handled by apply_sampler_swizzle() later.
       */
      depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);

      swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
      swizzled_out[3] = bld->one;
   }
   else {
      unsigned chan;
      for (chan = 0; chan < 4; ++chan) {
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
         swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
      }
   }
}


static LLVMValueRef
lp_build_extract_soa_chan(struct lp_build_context *bld,
                          unsigned blockbits,
                          boolean srgb_chan,
                          struct util_format_channel_description chan_desc,
                          LLVMValueRef packed)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type = bld->type;
   LLVMValueRef input = packed;
   const unsigned width = chan_desc.size;
   const unsigned start = chan_desc.shift;
   const unsigned stop = start + width;

   /* Decode the input vector component */

   switch(chan_desc.type) {
   case UTIL_FORMAT_TYPE_VOID:
      input = bld->undef;
      break;

   case UTIL_FORMAT_TYPE_UNSIGNED:
      /*
       * Align the LSB
       */
      if (start) {
         input = LLVMBuildLShr(builder, input,
                               lp_build_const_int_vec(gallivm, type, start), "");
      }

      /*
       * Zero the MSBs
       */
      if (stop < blockbits) {
         unsigned mask = ((unsigned long long)1 << width) - 1;
         input = LLVMBuildAnd(builder, input,
                              lp_build_const_int_vec(gallivm, type, mask), "");
      }

      /*
       * Type conversion
       */
      if (type.floating) {
         if (srgb_chan) {
            struct lp_type conv_type = lp_uint_type(type);
            input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
         }
         else {
            if(chan_desc.normalized)
               input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
            else
               input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
         }
      }
      else if (chan_desc.pure_integer) {
         /* Nothing to do */
      } else {
          /* FIXME */
          assert(0);
      }
      break;

   case UTIL_FORMAT_TYPE_SIGNED:
      /*
       * Align the sign bit first.
       */
      if (stop < type.width) {
         unsigned bits = type.width - stop;
         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
         input = LLVMBuildShl(builder, input, bits_val, "");
      }

      /*
       * Align the LSB (with an arithmetic shift to preserve the sign)
       */
      if (chan_desc.size < type.width) {
         unsigned bits = type.width - chan_desc.size;
         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
         input = LLVMBuildAShr(builder, input, bits_val, "");
      }

      /*
       * Type conversion
       */
      if (type.floating) {
         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
         if (chan_desc.normalized) {
            double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
            input = LLVMBuildFMul(builder, input, scale_val, "");
            /*
             * The formula above will produce value below -1.0 for most negative
             * value but everything seems happy with that hence disable for now.
             */
            if (0)
               input = lp_build_max(bld, input,
                                    lp_build_const_vec(gallivm, type, -1.0f));
         }
      }
      else if (chan_desc.pure_integer) {
         /* Nothing to do */
      } else {
          /* FIXME */
          assert(0);
      }
      break;

   case UTIL_FORMAT_TYPE_FLOAT:
      if (type.floating) {
         if (chan_desc.size == 16) {
            struct lp_type f16i_type = type;
            f16i_type.width /= 2;
            f16i_type.floating = 0;
            if (start) {
               input = LLVMBuildLShr(builder, input,
                                     lp_build_const_int_vec(gallivm, type, start), "");
            }
            input = LLVMBuildTrunc(builder, input,
                                   lp_build_vec_type(gallivm, f16i_type), "");
            input = lp_build_half_to_float(gallivm, input);
         } else {
            assert(start == 0);
            assert(stop == 32);
            assert(type.width == 32);
         }
         input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
      }
      else {
         /* FIXME */
         assert(0);
         input = bld->undef;
      }
      break;

   case UTIL_FORMAT_TYPE_FIXED:
      if (type.floating) {
         double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
         LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
         input = LLVMBuildFMul(builder, input, scale_val, "");
      }
      else {
         /* FIXME */
         assert(0);
         input = bld->undef;
      }
      break;

   default:
      assert(0);
      input = bld->undef;
      break;
   }

   return input;
}


/**
 * Unpack several pixels in SoA.
 *
 * It takes a vector of packed pixels:
 *
 *   packed = {P0, P1, P2, P3, ..., Pn}
 *
 * And will produce four vectors:
 *
 *   red    = {R0, R1, R2, R3, ..., Rn}
 *   green  = {G0, G1, G2, G3, ..., Gn}
 *   blue   = {B0, B1, B2, B3, ..., Bn}
 *   alpha  = {A0, A1, A2, A3, ..., An}
 *
 * It requires that a packed pixel fits into an element of the output
 * channels. The common case is when converting pixel with a depth of 32 bit or
 * less into floats.
 *
 * \param format_desc  the format of the 'packed' incoming pixel vector
 * \param type  the desired type for rgba_out (type.length = n, above)
 * \param packed  the incoming vector of packed pixels
 * \param rgba_out  returns the SoA R,G,B,A vectors
 */
void
lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
                         const struct util_format_description *format_desc,
                         struct lp_type type,
                         LLVMValueRef packed,
                         LLVMValueRef rgba_out[4])
{
   struct lp_build_context bld;
   LLVMValueRef inputs[4];
   unsigned chan;

   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
   assert(format_desc->block.width == 1);
   assert(format_desc->block.height == 1);
   assert(format_desc->block.bits <= type.width);
   /* FIXME: Support more output types */
   assert(type.width == 32);

   lp_build_context_init(&bld, gallivm, type);

   /* Decode the input vector components */
   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
      struct util_format_channel_description chan_desc = format_desc->channel[chan];
      boolean srgb_chan = FALSE;

      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
          format_desc->swizzle[3] != chan) {
         srgb_chan = TRUE;
      }

      inputs[chan] = lp_build_extract_soa_chan(&bld,
                                               format_desc->block.bits,
                                               srgb_chan,
                                               chan_desc,
                                               packed);
   }

   lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
}


/**
 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
 *
 * \param dst_type  The desired return type. For pure integer formats
 *                  this should be a 32bit wide int or uint vector type,
 *                  otherwise a float vector type.
 *
 * \param packed    The rgba8 values to pack.
 *
 * \param rgba      The 4 SoA return vectors.
 */
void
lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
                           struct lp_type dst_type,
                           LLVMValueRef packed,
                           LLVMValueRef *rgba)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
   unsigned chan;

   /* XXX technically shouldn't use that for uint dst_type */
   packed = LLVMBuildBitCast(builder, packed,
                             lp_build_int_vec_type(gallivm, dst_type), "");

   /* Decode the input vector components */
   for (chan = 0; chan < 4; ++chan) {
#if UTIL_ARCH_LITTLE_ENDIAN
      unsigned start = chan*8;
#else
      unsigned start = (3-chan)*8;
#endif
      unsigned stop = start + 8;
      LLVMValueRef input;

      input = packed;

      if (start)
         input = LLVMBuildLShr(builder, input,
                               lp_build_const_int_vec(gallivm, dst_type, start), "");

      if (stop < 32)
         input = LLVMBuildAnd(builder, input, mask, "");

      if (dst_type.floating)
         input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);

      rgba[chan] = input;
   }
}


/**
 * Fetch a texels from a texture, returning them in SoA layout.
 *
 * \param type  the desired return type for 'rgba'.  The vector length
 *              is the number of texels to fetch
 * \param aligned if the offset is guaranteed to be aligned to element width
 *
 * \param base_ptr  points to the base of the texture mip tree.
 * \param offset    offset to start of the texture image block.  For non-
 *                  compressed formats, this simply is an offset to the texel.
 *                  For compressed formats, it is an offset to the start of the
 *                  compressed data block.
 *
 * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
 *              these will always be (0,0).  For compressed formats, i will
 *              be in [0, block_width-1] and j will be in [0, block_height-1].
 * \param cache  optional value pointing to a lp_build_format_cache structure
 */
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        boolean aligned,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef cache,
                        LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   enum pipe_format format = format_desc->format;
   struct lp_type fetch_type;

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32 ||
        format_desc->channel[0].size == 16))
   {
      /*
       * The packed pixel fits into an element of the destination format. Put
       * the packed pixels into a vector and extract each component for all
       * vector elements in parallel.
       */

      LLVMValueRef packed;

      /*
       * gather the texels from the texture
       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
       */
      assert(format_desc->block.bits <= type.width);
      fetch_type = lp_type_uint(type.width);
      packed = lp_build_gather(gallivm,
                               type.length,
                               format_desc->block.bits,
                               fetch_type,
                               aligned,
                               base_ptr, offset, FALSE);

      /*
       * convert texels to float rgba
       */
      lp_build_unpack_rgba_soa(gallivm,
                               format_desc,
                               type,
                               packed, rgba_out);
      return;
   }


   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits > type.width &&
       ((format_desc->block.bits <= type.width * type.length &&
         format_desc->channel[0].size <= type.width) ||
        (format_desc->channel[0].size == 64 &&
         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
         type.floating)))
   {
      /*
       * Similar to above, but the packed pixel is larger than what fits
       * into an element of the destination format. The packed pixels will be
       * shuffled into SoA vectors appropriately, and then the extraction will
       * be done in parallel as much as possible.
       * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
       * the gathered vectors can be shuffled easily (even with avx).
       * 64xn float -> 32xn float is handled too but it's a bit special as
       * it does the conversion pre-shuffle.
       */

      LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
      struct lp_type fetch_type, gather_type = type;
      unsigned num_gather, fetch_width, i, j;
      struct lp_build_context bld;
      boolean fp64 = format_desc->channel[0].size == 64;

      lp_build_context_init(&bld, gallivm, type);

      assert(type.width == 32);
      assert(format_desc->block.bits > type.width);

      /*
       * First, figure out fetch order.
       */
      fetch_width = util_next_power_of_two(format_desc->block.bits);
      /*
       * fp64 are treated like fp32 except we fetch twice wide values
       * (as we shuffle after trunc). The shuffles for that work out
       * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
       * albeit we miss the potential opportunity for hw gather (as it
       * only handles native size).
       */
      num_gather = fetch_width / type.width;
      gather_type.width *= num_gather;
      if (fp64) {
         num_gather /= 2;
      }
      gather_type.length /= num_gather;

      for (i = 0; i < num_gather; i++) {
         LLVMValueRef offsetr, shuf_vec;
         if(num_gather == 4) {
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i + 4*j;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");

         }
         else if (num_gather == 2) {
            assert(num_gather == 2);
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i*2 + (j%2) + (j/2)*4;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
         }
         else {
            assert(num_gather == 1);
            offsetr = offset;
         }
         if (gather_type.length == 1) {
            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
            offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
         }

         /*
          * Determine whether to use float or int loads. This is mostly
          * to outsmart the (stupid) llvm int/float shuffle logic, we
          * don't really care much if the data is floats or ints...
          * But llvm will refuse to use single float shuffle with int data
          * and instead use 3 int shuffles instead, the code looks atrocious.
          * (Note bitcasts often won't help, as llvm is too smart to be
          * fooled by that.)
          * Nobody cares about simd float<->int domain transition penalties,
          * which usually don't even exist for shuffles anyway.
          * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
          * going into transpose, which is unpacks, so doesn't really matter
          * much).
          * With 2x32bit or 4x16bit fetch, we use float vec, since those
          * go into the weird channel separation shuffle. With floats,
          * this is (with 128bit vectors):
          * - 2 movq, 2 movhpd, 2 shufps
          * With ints it would be:
          * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
          * I've seen texture functions increase in code size by 15% just due
          * to that (there's lots of such fetches in them...)
          * (We could chose a different gather order to improve this somewhat
          * for the int path, but it would basically just drop the blends,
          * so the float path with this order really is optimal.)
          * Albeit it is tricky sometimes llvm doesn't ignore the float->int
          * casts so must avoid them until we're done with the float shuffle...
          * 3x16bit formats (the same is also true for 3x8) are pretty bad but
          * there's nothing we can do about them (we could overallocate by
          * those couple bytes and use unaligned but pot sized load).
          * Note that this is very much x86 specific. I don't know if this
          * affect other archs at all.
          */
         if (num_gather > 1) {
            /*
             * We always want some float type here (with x86)
             * due to shuffles being float ones afterwards (albeit for
             * the num_gather == 4 case int should work fine too
             * (unless there's some problems with avx but not avx2).
             */
            if (format_desc->channel[0].size == 64) {
               fetch_type = lp_type_float_vec(64, gather_type.width);
            } else {
               fetch_type = lp_type_int_vec(32, gather_type.width);
            }
         }
         else {
            /* type doesn't matter much */
            if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
                (format_desc->channel[0].size == 32 ||
                 format_desc->channel[0].size == 64)) {
            fetch_type = lp_type_float(gather_type.width);
            } else {
               fetch_type = lp_type_uint(gather_type.width);
            }
         }

         /* Now finally gather the values */
         packed[i] = lp_build_gather(gallivm, gather_type.length,
                                     format_desc->block.bits,
                                     fetch_type, aligned,
                                     base_ptr, offsetr, FALSE);
         if (fp64) {
            struct lp_type conv_type = type;
            conv_type.width *= 2;
            packed[i] = LLVMBuildBitCast(builder, packed[i],
                                         lp_build_vec_type(gallivm, conv_type), "");
            packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
         }
      }

      /* shuffle the gathered values to SoA */
      if (num_gather == 2) {
         for (i = 0; i < num_gather; i++) {
            for (j = 0; j < type.length; j++) {
               unsigned idx = (j%2)*2 + (j/4)*4 + i;
               if ((j/2)%2)
                  idx += type.length;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
                                            LLVMConstVector(shuffles, type.length), "");
         }
      }
      else if (num_gather == 4) {
         lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
      }
      else {
         assert(num_gather == 1);
         dst[0] = packed[0];
      }

      /*
       * And finally unpack exactly as above, except that
       * chan shift is adjusted and the right vector selected.
       */
      if (!fp64) {
         for (i = 0; i < num_gather; i++) {
            dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
         }
         for (i = 0; i < format_desc->nr_channels; i++) {
            struct util_format_channel_description chan_desc = format_desc->channel[i];
            unsigned blockbits = type.width;
            unsigned vec_nr;

#if UTIL_ARCH_BIG_ENDIAN
            vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;
#else
            vec_nr = chan_desc.shift / type.width;
#endif
            chan_desc.shift %= type.width;

            output[i] = lp_build_extract_soa_chan(&bld,
                                                  blockbits,
                                                  FALSE,
                                                  chan_desc,
                                                  dst[vec_nr]);
         }
      }
      else {
         for (i = 0; i < format_desc->nr_channels; i++)  {
            output[i] = dst[i];
         }
      }

      lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
      return;
   }

   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
      /*
       * similar conceptually to above but requiring special
       * AoS packed -> SoA float conversion code.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      assert(type.floating);
      assert(type.width == 32);

      packed = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits,
                               fetch_type, aligned,
                               base_ptr, offset, FALSE);
      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
      }
      else {
         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
      }
      return;
   }

   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
       format_desc->block.bits == 64) {
      /*
       * special case the format is 64 bits but we only require
       * 32bit (or 8bit) from each block.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
         /*
          * for stencil simply fix up offsets - could in fact change
          * base_ptr instead even outside the shader.
          */
         unsigned mask = (1 << 8) - 1;
         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
         offset = LLVMBuildAdd(builder, offset, s_offset, "");
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, FALSE);
         packed = LLVMBuildAnd(builder, packed,
                               lp_build_const_int_vec(gallivm, type, mask), "");
      }
      else {
         assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, TRUE);
         packed = LLVMBuildBitCast(builder, packed,
                                   lp_build_vec_type(gallivm, type), "");
      }
      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
      return;
   }

   /*
    * Try calling lp_build_fetch_rgba_aos for all pixels.
    * Should only really hit subsampled, compressed
    * (for s3tc srgb too, for rgtc the unorm ones only) by now.
    * (This is invalid for plain 8unorm formats because we're lazy with
    * the swizzle since some results would arrive swizzled, some not.)
    */

   if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
       (util_format_fits_8unorm(format_desc) ||
        format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
       type.floating && type.width == 32 &&
       (type.length == 1 || (type.length % 4 == 0))) {
      struct lp_type tmp_type;
      struct lp_build_context bld;
      LLVMValueRef packed, rgba[4];
      const struct util_format_description *flinear_desc;
      const struct util_format_description *frgba8_desc;
      unsigned chan;

      lp_build_context_init(&bld, gallivm, type);

      /*
       * Make sure the conversion in aos really only does convert to rgba8
       * and not anything more (so use linear format, adjust type).
       */
      flinear_desc = util_format_description(util_format_linear(format));
      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = type.length * 4;
      tmp_type.norm = TRUE;

      packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
                                       aligned, base_ptr, offset, i, j, cache);
      packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");

      /*
       * The values are now packed so they match ordinary (srgb) RGBA8 format,
       * hence need to use matching format for unpack.
       */
      frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
         assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
         frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
      }
      lp_build_unpack_rgba_soa(gallivm,
                               frgba8_desc,
                               type,
                               packed, rgba);

      /*
       * We converted 4 channels. Make sure llvm can drop unneeded ones
       * (luckily the rgba order is fixed, only LA needs special case).
       */
      for (chan = 0; chan < 4; chan++) {
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
         if (chan == 3 && util_format_is_luminance_alpha(format)) {
            swizzle = PIPE_SWIZZLE_W;
         }
         rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
      }
      return;
   }


   /*
    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
    *
    * This is not the most efficient way of fetching pixels, as we
    * miss some opportunities to do vectorization, but this is
    * convenient for formats or scenarios for which there was no
    * opportunity or incentive to optimize.
    *
    * We do NOT want to end up here, this typically is quite terrible,
    * in particular if the formats have less than 4 channels.
    *
    * Right now, this should only be hit for:
    * - RGTC snorm formats
    *   (those miss fast fetch functions hence they are terrible anyway)
    */

   {
      unsigned k;
      struct lp_type tmp_type;
      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: AoS fetch fallback for %s\n",
                      __FUNCTION__, format_desc->short_name);
      }

      tmp_type = type;
      tmp_type.length = 4;

      /*
       * Note that vector transpose can be worse compared to insert/extract
       * for aos->soa conversion (for formats with 1 or 2 channels). However,
       * we should try to avoid getting here for just about all formats, so
       * don't bother.
       */

      /* loop over number of pixels */
      for(k = 0; k < type.length; ++k) {
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
         LLVMValueRef offset_elem;
         LLVMValueRef i_elem, j_elem;

         offset_elem = LLVMBuildExtractElement(builder, offset,
                                               index, "");

         i_elem = LLVMBuildExtractElement(builder, i, index, "");
         j_elem = LLVMBuildExtractElement(builder, j, index, "");

         /* Get a single float[4]={R,G,B,A} pixel */
         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                                aligned, base_ptr, offset_elem,
                                                i_elem, j_elem, cache);

      }
      convert_to_soa(gallivm, aos_fetch, rgba_out, type);
   }
}

static void
lp_build_insert_soa_chan(struct lp_build_context *bld,
                         unsigned blockbits,
                         struct util_format_channel_description chan_desc,
                         LLVMValueRef *output,
                         LLVMValueRef rgba)
{
    struct gallivm_state *gallivm = bld->gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_type type = bld->type;
    const unsigned width = chan_desc.size;
    const unsigned start = chan_desc.shift;
    ASSERTED const unsigned stop = start + width;
    LLVMValueRef chan = NULL;
    switch(chan_desc.type) {
    case UTIL_FORMAT_TYPE_UNSIGNED:

       if (chan_desc.pure_integer)
          chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
       else if (type.floating) {
          if (chan_desc.normalized)
             chan = lp_build_clamped_float_to_unsigned_norm(gallivm, type, width, rgba);
          else
             chan = LLVMBuildFPToSI(builder, rgba, bld->vec_type, "");
       }
       if (start)
          chan = LLVMBuildShl(builder, chan,
                              lp_build_const_int_vec(gallivm, type, start), "");
       if (!*output)
          *output = chan;
       else
          *output = LLVMBuildOr(builder, *output, chan, "");
       break;
    case UTIL_FORMAT_TYPE_SIGNED:
       if (chan_desc.pure_integer)
          chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
       else if (type.floating) {
          uint32_t mask_val = (1UL << chan_desc.size) - 1;
          if (chan_desc.normalized) {
             char intrin[32];
             double scale = ((1 << (chan_desc.size - 1)) - 1);
             LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
             rgba = lp_build_clamp(bld, rgba, lp_build_negate(bld, bld->one), bld->one);
             rgba = LLVMBuildFMul(builder, rgba, scale_val, "");
             lp_format_intrinsic(intrin, sizeof intrin, "llvm.rint", bld->vec_type);
             rgba = lp_build_intrinsic_unary(builder, intrin, bld->vec_type, rgba);
          }
          chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, "");
          chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, mask_val), "");
       }
       if (start)
          chan = LLVMBuildShl(builder, chan,
                              lp_build_const_int_vec(gallivm, type, start), "");
       if (!*output)
          *output = chan;
       else
          *output = LLVMBuildOr(builder, *output, chan, "");
       break;
    case UTIL_FORMAT_TYPE_FLOAT:
       if (type.floating) {
          if (chan_desc.size == 16) {
             chan = lp_build_float_to_half(gallivm, rgba);
             chan = LLVMBuildZExt(builder, chan, bld->int_vec_type, "");
             if (start)
                chan = LLVMBuildShl(builder, chan,
                                    lp_build_const_int_vec(gallivm, type, start), "");
             if (!*output)
                *output = chan;
             else
                *output = LLVMBuildOr(builder, *output, chan, "");
          } else {
             assert(start == 0);
             assert(stop == 32);
             assert(type.width == 32);
             *output = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
          }
       } else
          assert(0);
       break;
    default:
       assert(0);
       *output = bld->undef;
    }
}

static void
lp_build_pack_rgba_soa(struct gallivm_state *gallivm,
                       const struct util_format_description *format_desc,
                       struct lp_type type,
                       const LLVMValueRef rgba_in[4],
                       LLVMValueRef *packed)
{
   unsigned chan;
   struct lp_build_context bld;
   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
   assert(format_desc->block.width == 1);
   assert(format_desc->block.height == 1);
   assert(format_desc->block.bits <= type.width);
   /* FIXME: Support more output types */
   assert(type.width == 32);

   lp_build_context_init(&bld, gallivm, type);
   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
      struct util_format_channel_description chan_desc = format_desc->channel[chan];

      lp_build_insert_soa_chan(&bld, format_desc->block.bits,
                               chan_desc,
                               packed,
                               rgba_in[chan]);
   }
}

void
lp_build_store_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        LLVMValueRef exec_mask,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef out_of_bounds,
                        const LLVMValueRef rgba_in[4])
{
   enum pipe_format format = format_desc->format;
   LLVMValueRef packed[4];
   unsigned num_stores = 0;

   memset(packed, 0, sizeof(LLVMValueRef) * 4);
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32 ||
        format_desc->channel[0].size == 16))
   {
      lp_build_pack_rgba_soa(gallivm, format_desc, type, rgba_in, &packed[0]);

      num_stores = 1;
   } else if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits > type.width &&
       ((format_desc->block.bits <= type.width * type.length &&
         format_desc->channel[0].size <= type.width) ||
        (format_desc->channel[0].size == 64 &&
         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
         type.floating)))
   {
      /*
       * Similar to above, but the packed pixel is larger than what fits
       * into an element of the destination format. The packed pixels will be
       * shuffled into SoA vectors appropriately, and then the extraction will
       * be done in parallel as much as possible.
       * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
       * the gathered vectors can be shuffled easily (even with avx).
       * 64xn float -> 32xn float is handled too but it's a bit special as
       * it does the conversion pre-shuffle.
       */
      struct lp_build_context bld;

      lp_build_context_init(&bld, gallivm, type);
      assert(type.width == 32);
      assert(format_desc->block.bits > type.width);

      unsigned store_width = util_next_power_of_two(format_desc->block.bits);
      num_stores = store_width / type.width;
      for (unsigned i = 0; i < format_desc->nr_channels; i++) {
            struct util_format_channel_description chan_desc = format_desc->channel[i];
            unsigned blockbits = type.width;
            unsigned vec_nr;

            vec_nr = chan_desc.shift / type.width;
            chan_desc.shift %= type.width;

            lp_build_insert_soa_chan(&bld, blockbits,
                                     chan_desc,
                                     &packed[vec_nr],
                                     rgba_in[i]);
      }

      assert(num_stores == 4 || num_stores == 2);
      /* we can transpose and store at the same time */
   } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
      packed[0] = lp_build_float_to_r11g11b10(gallivm, rgba_in);
      num_stores = 1;
   } else
      assert(0);

   assert(exec_mask);

   LLVMTypeRef int32_ptr_type = LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0);
   LLVMTypeRef int16_ptr_type = LLVMPointerType(LLVMInt16TypeInContext(gallivm->context), 0);
   LLVMTypeRef int8_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);

   LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
   should_store_mask = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
   for (unsigned i = 0; i < num_stores; i++) {
      struct lp_build_loop_state loop_state;

      LLVMValueRef store_offset = LLVMBuildAdd(gallivm->builder, offset, lp_build_const_int_vec(gallivm, type, i * 4), "");
      store_offset = LLVMBuildGEP(gallivm->builder, base_ptr, &store_offset, 1, "");

      lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));

      struct lp_build_if_state ifthen;
      LLVMValueRef cond = LLVMBuildExtractElement(gallivm->builder, should_store_mask, loop_state.counter, "");
      lp_build_if(&ifthen, gallivm, cond);

      LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed[i], loop_state.counter, "");
      LLVMValueRef this_offset = LLVMBuildExtractElement(gallivm->builder, store_offset, loop_state.counter, "");

      if (format_desc->block.bits == 8) {
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int8_ptr_type, "");
         data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt8TypeInContext(gallivm->context), "");
      } else if (format_desc->block.bits == 16) {
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int16_ptr_type, "");
         data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt16TypeInContext(gallivm->context), "");
      } else
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int32_ptr_type, "");
      LLVMBuildStore(gallivm->builder, data, this_offset);
      lp_build_endif(&ifthen);
      lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
                             NULL, LLVMIntUGE);
   }
}