3 files changed, 368 insertions, 40 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 1c352006f3e..a032d9d6895 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -1402,6 +1402,144 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
    }
 }
 
+/**
+ * Generate new coords and faces for cubemap texels falling off the face.
+ * @param ivec_bld  integer vector build context used to emit all operations
+ * @param face   face (center) of the pixel
+ * @param x0     lower x coord
+ * @param x1     higher x coord (must be x0 + 1)
+ * @param y0     lower y coord
+ * @param y1     higher y coord (must be y0 + 1)
+ * @param max_coord     texture cube (level) size - 1
+ * @param next_faces    new face values when falling off
+ * @param next_xcoords  new x coord values when falling off
+ * @param next_ycoords  new y coord values when falling off
+ *
+ * The arrays hold the new values when under/overflow of
+ * lower x, higher x, lower y, higher y coord would occur (in this order).
+ * next_xcoords/next_ycoords have two entries each: [n][0] is derived from the
+ * lower and [n][1] from the higher coord of the axis that did not overflow.
+ */
+void
+lp_build_cube_new_coords(struct lp_build_context *ivec_bld,
+                        LLVMValueRef face,
+                        LLVMValueRef x0,
+                        LLVMValueRef x1,
+                        LLVMValueRef y0,
+                        LLVMValueRef y1,
+                        LLVMValueRef max_coord,
+                        LLVMValueRef next_faces[4],
+                        LLVMValueRef next_xcoords[4][2],
+                        LLVMValueRef next_ycoords[4][2])
+{
+   /*
+    * Lookup tables aren't nice for simd code hence try some logic here.
+    * (Note that while it would not be necessary to do per-sample (4) lookups
+    * when using a LUT as it's impossible that texels fall off of positive
+    * and negative edges simultaneously, it would however be necessary to
+    * do 2 lookups for corner handling as in this case texels both fall off
+    * of x and y axes.)
+    */
+   /*
+    * Next faces (for face 012345):
+    * x < 0.0  : 451110
+    * x >= 1.0 : 540001
+    * y < 0.0  : 225422
+    * y >= 1.0 : 334533
+    * Hence nfx+ (and nfy+) == nfx- (nfy-) xor 1
+    * nfx-: face > 1 ? (face == 5 ? 0 : 1) : (4 + (face & 1))
+    * nfy+: (face & ~4) > 1 ? face + 2 : 3;
+    * This could also use pshufb instead, but would need (manually coded)
+    * ssse3 intrinsic (llvm won't do non-constant shuffles).
+    */
+   struct gallivm_state *gallivm = ivec_bld->gallivm;
+   LLVMValueRef sel, sel_f2345, sel_f23, sel_f2, tmpsel, tmp;
+   LLVMValueRef faceand1, sel_fand1, maxmx0, maxmx1, maxmy0, maxmy1;
+   LLVMValueRef c2 = lp_build_const_int_vec(gallivm, ivec_bld->type, 2);
+   LLVMValueRef c3 = lp_build_const_int_vec(gallivm, ivec_bld->type, 3);
+   LLVMValueRef c4 = lp_build_const_int_vec(gallivm, ivec_bld->type, 4);
+   LLVMValueRef c5 = lp_build_const_int_vec(gallivm, ivec_bld->type, 5);
+
+   sel = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c5);
+   tmpsel = lp_build_select(ivec_bld, sel, ivec_bld->zero, ivec_bld->one);
+   sel_f2345 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, face, ivec_bld->one);
+   faceand1 = lp_build_and(ivec_bld, face, ivec_bld->one);
+   tmp = lp_build_add(ivec_bld, faceand1, c4);
+   next_faces[0] = lp_build_select(ivec_bld, sel_f2345, tmpsel, tmp); /* nfx- (x < 0) */
+   next_faces[1] = lp_build_xor(ivec_bld, next_faces[0], ivec_bld->one); /* nfx+ = nfx- ^ 1 */
+
+   tmp = lp_build_andnot(ivec_bld, face, c4); /* face & ~4, see formula above */
+   sel_f23 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, tmp, ivec_bld->one);
+   tmp = lp_build_add(ivec_bld, face, c2);
+   next_faces[3] = lp_build_select(ivec_bld, sel_f23, tmp, c3); /* nfy+ (y >= 1) */
+   next_faces[2] = lp_build_xor(ivec_bld, next_faces[3], ivec_bld->one); /* nfy- = nfy+ ^ 1 */
+
+   /*
+    * new xcoords (for face 012345):
+    * x < 0.0  : max   max   t     max-t max  max
+    * x >= 1.0 : 0     0     max-t t     0    0
+    * y < 0.0  : max   0     max-s s     s    max-s
+    * y >= 1.0 : max   0     s     max-s s    max-s
+    * (s is the x coord x0/x1, t the y coord y0/y1; ncx[n] is next_xcoords[n])
+    * ncx[1] = (face & ~4) > 1 ? (face == 2 ? max-t : t) : 0
+    * ncx[0] = max - ncx[1]
+    * ncx[3] = face > 1 ? (face & 1 ? max-s : s) : (face & 1) ? 0 : max
+    * ncx[2] = (face & ~4) > 1 ? max - ncx[3] : ncx[3]
+    */
+   sel_f2 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c2);
+   maxmy0 = lp_build_sub(ivec_bld, max_coord, y0);
+   tmp = lp_build_select(ivec_bld, sel_f2, maxmy0, y0);
+   next_xcoords[1][0] = lp_build_select(ivec_bld, sel_f23, tmp, ivec_bld->zero);
+   next_xcoords[0][0] = lp_build_sub(ivec_bld, max_coord, next_xcoords[1][0]);
+   maxmy1 = lp_build_sub(ivec_bld, max_coord, y1);
+   tmp = lp_build_select(ivec_bld, sel_f2, maxmy1, y1);
+   next_xcoords[1][1] = lp_build_select(ivec_bld, sel_f23, tmp, ivec_bld->zero);
+   next_xcoords[0][1] = lp_build_sub(ivec_bld, max_coord, next_xcoords[1][1]);
+
+   sel_fand1 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, faceand1, ivec_bld->one); /* (face & 1) == 1 */
+
+   tmpsel = lp_build_select(ivec_bld, sel_fand1, ivec_bld->zero, max_coord);
+   maxmx0 = lp_build_sub(ivec_bld, max_coord, x0);
+   tmp = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0);
+   next_xcoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
+   tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][0]);
+   next_xcoords[2][0] = lp_build_select(ivec_bld, sel_f23, tmp, next_xcoords[3][0]);
+   maxmx1 = lp_build_sub(ivec_bld, max_coord, x1);
+   tmp = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1);
+   next_xcoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
+   tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][1]);
+   next_xcoords[2][1] = lp_build_select(ivec_bld, sel_f23, tmp, next_xcoords[3][1]);
+
+   /*
+    * new ycoords (for face 012345):
+    * x < 0.0  : t     t     0     max   t    t
+    * x >= 1.0 : t     t     0     max   t    t
+    * y < 0.0  : max-s s     0     max   max  0
+    * y >= 1.0 : s     max-s 0     max   0    max
+    *
+    * ncy[0] = (face & ~4) > 1 ? (face == 2 ? 0 : max) : t
+    * ncy[1] = ncy[0]
+    * ncy[3] = face > 1 ? (face & 1 ? max : 0) : (face & 1) ? max-s : s
+    * ncy[2] = (face & ~4) > 1 ? ncy[3] : max - ncy[3]
+    */
+   tmp = lp_build_select(ivec_bld, sel_f2, ivec_bld->zero, max_coord);
+   next_ycoords[0][0] = lp_build_select(ivec_bld, sel_f23, tmp, y0);
+   next_ycoords[1][0] = next_ycoords[0][0];
+   next_ycoords[0][1] = lp_build_select(ivec_bld, sel_f23, tmp, y1);
+   next_ycoords[1][1] = next_ycoords[0][1];
+
+   tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0);
+   tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero);
+   next_ycoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
+   tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][0]);
+   next_ycoords[2][0] = lp_build_select(ivec_bld, sel_f23, next_ycoords[3][0], tmp);
+   tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1);
+   tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero);
+   next_ycoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
+   tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][1]);
+   next_ycoords[2][1] = lp_build_select(ivec_bld, sel_f23, next_ycoords[3][1], tmp);
+}
+
 
 /** Helper used by lp_build_cube_lookup() */
 static LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 70f03503f0f..5039128a203 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -464,6 +464,19 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
 
 
 void
+lp_build_cube_new_coords(struct lp_build_context *ivec_bld,
+                         LLVMValueRef face,
+                         LLVMValueRef x0,
+                         LLVMValueRef x1,
+                         LLVMValueRef y0,
+                         LLVMValueRef y1,
+                         LLVMValueRef max_coord,
+                         LLVMValueRef new_faces[4],       /* out; called next_faces in the definition */
+                         LLVMValueRef new_xcoords[4][2],  /* out; called next_xcoords in the definition */
+                         LLVMValueRef new_ycoords[4][2]); /* out; called next_ycoords in the definition */
+
+
+void
 lp_build_sample_partial_offset(struct lp_build_context *bld,
                                unsigned block_length,
                                LLVMValueRef coord,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 54dee25bfd9..8e2d0d9f33b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -848,10 +848,14 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    LLVMValueRef flt_width_vec;
    LLVMValueRef flt_height_vec;
    LLVMValueRef flt_depth_vec;
-   LLVMValueRef x0, y0 = NULL, z0 = NULL, x1, y1 = NULL, z1 = NULL;
+   LLVMValueRef z1 = NULL;
+   LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
+   LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
+   LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
    LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
+   LLVMValueRef xs[4], ys[4], zs[4];
    LLVMValueRef neighbors[2][2][4];
-   int chan;
+   int chan, texel_index;
 
    lp_build_extract_image_sizes(bld,
                                 &bld->int_size_bld,
@@ -870,39 +874,202 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    /*
     * Compute integer texcoords.
     */
-   lp_build_sample_wrap_linear(bld, coords[0], width_vec,
-                               flt_width_vec, offsets[0],
-                               bld->static_texture_state->pot_width,
-                               bld->static_sampler_state->wrap_s,
-                               &x0, &x1, &s_fpart);
-   lp_build_name(x0, "tex.x0.wrapped");
-   lp_build_name(x1, "tex.x1.wrapped");
 
-   if (dims >= 2) {
-      lp_build_sample_wrap_linear(bld, coords[1], height_vec,
-                                  flt_height_vec, offsets[1],
-                                  bld->static_texture_state->pot_height,
-                                  bld->static_sampler_state->wrap_t,
-                                  &y0, &y1, &t_fpart);
-      lp_build_name(y0, "tex.y0.wrapped");
-      lp_build_name(y1, "tex.y1.wrapped");
+   if (bld->static_texture_state->target != PIPE_TEXTURE_CUBE ||
+       !bld->static_sampler_state->seamless_cube_map) {
+      lp_build_sample_wrap_linear(bld, coords[0], width_vec,
+                                  flt_width_vec, offsets[0],
+                                  bld->static_texture_state->pot_width,
+                                  bld->static_sampler_state->wrap_s,
+                                  &x00, &x01, &s_fpart);
+      lp_build_name(x00, "tex.x0.wrapped");
+      lp_build_name(x01, "tex.x1.wrapped");
+      x10 = x00;
+      x11 = x01;
 
-      if (dims == 3) {
-         lp_build_sample_wrap_linear(bld, coords[2], depth_vec,
-                                     flt_depth_vec, offsets[2],
-                                     bld->static_texture_state->pot_depth,
-                                     bld->static_sampler_state->wrap_r,
-                                     &z0, &z1, &r_fpart);
-         lp_build_name(z0, "tex.z0.wrapped");
-         lp_build_name(z1, "tex.z1.wrapped");
+      if (dims >= 2) {
+         lp_build_sample_wrap_linear(bld, coords[1], height_vec,
+                                     flt_height_vec, offsets[1],
+                                     bld->static_texture_state->pot_height,
+                                     bld->static_sampler_state->wrap_t,
+                                     &y00, &y10, &t_fpart);
+         lp_build_name(y00, "tex.y0.wrapped");
+         lp_build_name(y10, "tex.y1.wrapped");
+         y01 = y00;
+         y11 = y10;
+
+         if (dims == 3) {
+            lp_build_sample_wrap_linear(bld, coords[2], depth_vec,
+                                        flt_depth_vec, offsets[2],
+                                        bld->static_texture_state->pot_depth,
+                                        bld->static_sampler_state->wrap_r,
+                                        &z00, &z1, &r_fpart);
+            z01 = z10 = z11 = z00;
+            lp_build_name(z00, "tex.z0.wrapped");
+            lp_build_name(z1, "tex.z1.wrapped");
+         }
+      }
+      if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
+          bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
+          bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
+         z00 = z01 = z10 = z11 = z1 = coords[2];  /* cube face or layer */
+         lp_build_name(z00, "tex.z0.layer");
+         lp_build_name(z1, "tex.z1.layer");
       }
    }
-   if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
-       bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
-       bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
-      z0 = z1 = coords[2];  /* cube face or layer */
-      lp_build_name(z0, "tex.z0.layer");
-      lp_build_name(z1, "tex.z1.layer");
+   else {
+      LLVMBuilderRef builder = bld->gallivm->builder;
+      struct lp_build_context *ivec_bld = &bld->int_coord_bld;
+      struct lp_build_context *coord_bld = &bld->coord_bld;
+      struct lp_build_if_state edge_if;
+      LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
+      LLVMValueRef fall_off[4], coord, have_edge;
+      LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp;
+      LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
+      LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
+      LLVMValueRef face = coords[2];
+      LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5f);
+      LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec, ivec_bld->one);
+      /* XXX drop height calcs. Could (should) do this without seamless filtering too */
+      height_vec = width_vec;
+      flt_height_vec = flt_width_vec;
+
+      /* XXX the overflow logic is actually sort of duplicated with trilinear,
+       * since an overflow in one mip should also have a corresponding overflow
+       * in another.
+       */
+      /* should always have normalized coords, and offsets are undefined */
+      assert(bld->static_sampler_state->normalized_coords);
+      coord = lp_build_mul(coord_bld, coords[0], flt_width_vec);
+      /* instead of clamp, build mask if overflowed */
+      coord = lp_build_sub(coord_bld, coord, half);
+      /* convert to int, compute lerp weight */
+      /* not ideal with AVX (and no AVX2) */
+      lp_build_ifloor_fract(coord_bld, coord, &x0, &s_fpart);
+      x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
+      coord = lp_build_mul(coord_bld, coords[1], flt_height_vec);
+      coord = lp_build_sub(coord_bld, coord, half);
+      lp_build_ifloor_fract(coord_bld, coord, &y0, &t_fpart);
+      y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
+
+      fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
+      fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1, length_minus_one);
+      fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
+      fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);
+
+      have_edge = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
+      have_edge = lp_build_or(ivec_bld, have_edge, fall_off[2]);
+      have_edge = lp_build_or(ivec_bld, have_edge, fall_off[3]);
+
+      have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
+
+      for (texel_index = 0; texel_index < 4; texel_index++) {
+         xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
+         ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
+         zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "zs");
+      }
+
+      lp_build_if(&edge_if, bld->gallivm, have_edge);
+
+      /*
+       * Need to feed clamped values here for cheap corner handling,
+       * but only for y coord (as when falling off both edges we only
+       * fall off the x one) - this should be sufficient.
+       */
+      y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
+      y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);
+
+      /*
+       * Get all possible new coords.
+       */
+      lp_build_cube_new_coords(ivec_bld, face,
+                               x0, x1, y0_clamped, y1_clamped,
+                               length_minus_one,
+                               new_faces, new_xcoords, new_ycoords);
+
+      /* handle fall off x-, x+ direction */
+      /* determine new coords, face (both fall_off vars cannot be true at the same time) */
+      x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
+      y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0], y0_clamped);
+      x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
+      y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1], y1_clamped);
+      x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
+      y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0], y0_clamped);
+      x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
+      y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1], y1_clamped);
+
+      z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0], face);
+      z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1], face);
+
+      /* handle fall off y-, y+ direction */
+      /*
+       * Cheap corner logic: just hack up things so a texel doesn't fall
+       * off both sides (which means filter weights will be wrong but we'll only
+       * use valid texels in the filter).
+       * This means however (y) coords must additionally be clamped (see above).
+       * This corner handling should be fully OpenGL (but not d3d10) compliant.
+       */
+      fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2], fall_off[0]);
+      fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2], fall_off[1]);
+      fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3], fall_off[0]);
+      fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3], fall_off[1]);
+
+      x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0], x00);
+      y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0], y00);
+      x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1], x01);
+      y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1], y01);
+      x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0], x10);
+      y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0], y10);
+      x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1], x11);
+      y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1], y11);
+
+      z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
+      z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
+      z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
+      z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);
+
+      LLVMBuildStore(builder, x00, xs[0]);
+      LLVMBuildStore(builder, x01, xs[1]);
+      LLVMBuildStore(builder, x10, xs[2]);
+      LLVMBuildStore(builder, x11, xs[3]);
+      LLVMBuildStore(builder, y00, ys[0]);
+      LLVMBuildStore(builder, y01, ys[1]);
+      LLVMBuildStore(builder, y10, ys[2]);
+      LLVMBuildStore(builder, y11, ys[3]);
+      LLVMBuildStore(builder, z00, zs[0]);
+      LLVMBuildStore(builder, z01, zs[1]);
+      LLVMBuildStore(builder, z10, zs[2]);
+      LLVMBuildStore(builder, z11, zs[3]);
+
+      lp_build_else(&edge_if);
+
+      LLVMBuildStore(builder, x0, xs[0]);
+      LLVMBuildStore(builder, x1, xs[1]);
+      LLVMBuildStore(builder, x0, xs[2]);
+      LLVMBuildStore(builder, x1, xs[3]);
+      LLVMBuildStore(builder, y0, ys[0]);
+      LLVMBuildStore(builder, y0, ys[1]);
+      LLVMBuildStore(builder, y1, ys[2]);
+      LLVMBuildStore(builder, y1, ys[3]);
+      LLVMBuildStore(builder, face, zs[0]);
+      LLVMBuildStore(builder, face, zs[1]);
+      LLVMBuildStore(builder, face, zs[2]);
+      LLVMBuildStore(builder, face, zs[3]);
+
+      lp_build_endif(&edge_if);
+
+      x00 = LLVMBuildLoad(builder, xs[0], "");
+      x01 = LLVMBuildLoad(builder, xs[1], "");
+      x10 = LLVMBuildLoad(builder, xs[2], "");
+      x11 = LLVMBuildLoad(builder, xs[3], "");
+      y00 = LLVMBuildLoad(builder, ys[0], "");
+      y01 = LLVMBuildLoad(builder, ys[1], "");
+      y10 = LLVMBuildLoad(builder, ys[2], "");
+      y11 = LLVMBuildLoad(builder, ys[3], "");
+      z00 = LLVMBuildLoad(builder, zs[0], "");
+      z01 = LLVMBuildLoad(builder, zs[1], "");
+      z10 = LLVMBuildLoad(builder, zs[2], "");
+      z11 = LLVMBuildLoad(builder, zs[3], "");
    }
 
    if (linear_mask) {
@@ -937,12 +1104,12 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    /* get x0/x1 texels */
    lp_build_sample_texel_soa(bld,
                              width_vec, height_vec, depth_vec,
-                             x0, y0, z0,
+                             x00, y00, z00,
                              row_stride_vec, img_stride_vec,
                              data_ptr, mipoffsets, neighbors[0][0]);
    lp_build_sample_texel_soa(bld,
                              width_vec, height_vec, depth_vec,
-                             x1, y0, z0,
+                             x01, y01, z01,
                              row_stride_vec, img_stride_vec,
                              data_ptr, mipoffsets, neighbors[0][1]);
 
@@ -973,12 +1140,12 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
       /* get x0/x1 texels at y1 */
       lp_build_sample_texel_soa(bld,
                                 width_vec, height_vec, depth_vec,
-                                x0, y1, z0,
+                                x10, y10, z10,
                                 row_stride_vec, img_stride_vec,
                                 data_ptr, mipoffsets, neighbors[1][0]);
       lp_build_sample_texel_soa(bld,
                                 width_vec, height_vec, depth_vec,
-                                x1, y1, z0,
+                                x11, y11, z11,
                                 row_stride_vec, img_stride_vec,
                                 data_ptr, mipoffsets, neighbors[1][1]);
 
@@ -1012,22 +1179,22 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
          /* get x0/x1/y0/y1 texels at z1 */
          lp_build_sample_texel_soa(bld,
                                    width_vec, height_vec, depth_vec,
-                                   x0, y0, z1,
+                                   x00, y00, z1,
                                    row_stride_vec, img_stride_vec,
                                    data_ptr, mipoffsets, neighbors1[0][0]);
          lp_build_sample_texel_soa(bld,
                                    width_vec, height_vec, depth_vec,
-                                   x1, y0, z1,
+                                   x01, y01, z1,
                                    row_stride_vec, img_stride_vec,
                                    data_ptr, mipoffsets, neighbors1[0][1]);
          lp_build_sample_texel_soa(bld,
                                    width_vec, height_vec, depth_vec,
-                                   x0, y1, z1,
+                                   x10, y10, z1,
                                    row_stride_vec, img_stride_vec,
                                    data_ptr, mipoffsets, neighbors1[1][0]);
          lp_build_sample_texel_soa(bld,
                                    width_vec, height_vec, depth_vec,
-                                   x1, y1, z1,
+                                   x11, y11, z1,
                                    row_stride_vec, img_stride_vec,
                                    data_ptr, mipoffsets, neighbors1[1][1]);
 
@@ -2306,15 +2473,25 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
             use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
          }
       }
+      if (static_texture_state->target == PIPE_TEXTURE_CUBE &&
+          derived_sampler_state.seamless_cube_map &&
+          (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
+           derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
+         /* theoretically possible with AoS filtering but not implemented (complex!) */
+         use_aos = 0;
+      }
 
       if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
           !use_aos && util_format_fits_8unorm(bld.format_desc)) {
          debug_printf("%s: using floating point linear filtering for %s\n",
                       __FUNCTION__, bld.format_desc->short_name);
-         debug_printf("  min_img %d  mag_img %d  mip %d  wraps %d  wrapt %d  wrapr %d\n",
+         debug_printf("  min_img %d  mag_img %d  mip %d  target %d  seamless %d"
+                      "  wraps %d  wrapt %d  wrapr %d\n",
                       derived_sampler_state.min_img_filter,
                       derived_sampler_state.mag_img_filter,
                       derived_sampler_state.min_mip_filter,
+                      static_texture_state->target,
+                      derived_sampler_state.seamless_cube_map,
                       derived_sampler_state.wrap_s,
                       derived_sampler_state.wrap_t,
                       derived_sampler_state.wrap_r);