Diffstat (limited to 'src')
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample.c      93
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c  12
2 files changed, 74 insertions, 31 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 6e5c4a1e48f..95541041e31 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -200,7 +200,7 @@ lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
* Generate code to compute coordinate gradient (rho).
* \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
*
- * The resulting rho is scalar per quad.
+ * The resulting rho has bld->levelf format (per quad or per element).
*/
static LLVMValueRef
lp_build_rho(struct lp_build_sample_context *bld,
@@ -230,13 +230,17 @@ lp_build_rho(struct lp_build_sample_context *bld,
LLVMValueRef first_level, first_level_vec;
unsigned length = coord_bld->type.length;
unsigned num_quads = length / 4;
+ boolean rho_per_quad = levelf_bld->type.length != length;
unsigned i;
LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
LLVMValueRef rho_xvec, rho_yvec;
/* Note that all simplified calculations will only work for isotropic filtering */
- assert(bld->num_lods != length);
+ /*
+ * rho calcs are always per quad except for explicit derivs (excluding
+    * the messy cube maps for now) when per-element lod is requested.
+ */
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, texture_unit);
@@ -247,11 +251,18 @@ lp_build_rho(struct lp_build_sample_context *bld,
if (cube_rho) {
LLVMValueRef cubesize;
LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+
/*
* Cube map code already did everything except the size mul and per-quad extraction.
+    * Luckily cube maps are always square!
*/
- rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- levelf_bld->type, cube_rho, 0);
+ if (rho_per_quad) {
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ levelf_bld->type, cube_rho, 0);
+ }
+ else {
+ rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4);
+ }
if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
rho = lp_build_sqrt(levelf_bld, rho);
}
@@ -290,29 +301,35 @@ lp_build_rho(struct lp_build_sample_context *bld,
rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
}
- rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
- rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- levelf_bld->type, rho_vec, 0);
- /*
- * note that as long as we don't care about per-pixel lod could reduce math
- * more (at some shuffle cost), but for now only do sqrt after packing.
- */
+ rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
+
+ if (rho_per_quad) {
+ /*
+       * Note that in this case (no per-pixel lod) we could reduce the math
+       * further (at some shuffle cost), but for now only do the sqrt after
+       * packing; otherwise we'd also need code different from the
+       * per-pixel lod case.
+ */
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ levelf_bld->type, rho, 0);
+ }
rho = lp_build_sqrt(levelf_bld, rho);
+
}
else {
- rho_vec = ddmax[0];
+ rho = ddmax[0];
if (dims > 1) {
- rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[1]);
+ rho = lp_build_max(coord_bld, rho, ddmax[1]);
if (dims > 2) {
- rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[2]);
+ rho = lp_build_max(coord_bld, rho, ddmax[2]);
}
}
- /*
- * rho_vec now still contains per-pixel rho, convert to scalar per quad
- * since we can't handle per-pixel rho/lod from now on (TODO).
- */
- rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- levelf_bld->type, rho_vec, 0);
+ if (rho_per_quad) {
+ /*
+          * rho contains per-pixel rho values, convert to one rho per quad.
+ */
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ levelf_bld->type, rho, 0);
+ }
}
}
else {
@@ -379,12 +396,25 @@ lp_build_rho(struct lp_build_sample_context *bld,
ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]);
}
+
rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
- rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
+ rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
- rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- levelf_bld->type, rho_vec, 0);
+ if (rho_per_quad) {
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ levelf_bld->type, rho, 0);
+ }
+ else {
+ /*
+          * On some cpus with half-speed 8-wide sqrt (e.g. SNB but not IVB)
+          * doing pack/sqrt/unpack/swizzle might be better in the 8-wide
+          * case; the same is true in the 4-wide case for cpus whose scalar
+          * sqrt is faster than their 4-wide one (where pack/unpack would be
+          * no-ops anyway). (The same really applies to the cube_rho case
+          * above.)
+ */
+ rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
+ }
rho = lp_build_sqrt(levelf_bld, rho);
}
else {
@@ -464,8 +494,13 @@ lp_build_rho(struct lp_build_sample_context *bld,
}
}
}
- rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- levelf_bld->type, rho, 0);
+ if (rho_per_quad) {
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ levelf_bld->type, rho, 0);
+ }
+ else {
+ rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
+ }
}
else {
if (dims <= 1) {
@@ -491,6 +526,9 @@ lp_build_rho(struct lp_build_sample_context *bld,
}
}
}
+ if (!rho_per_quad) {
+ rho = lp_build_broadcast_scalar(levelf_bld, rho);
+ }
}
}
}
@@ -729,8 +767,9 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
/* add shader lod bias */
if (lod_bias) {
- lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
- levelf_bld->type, lod_bias, 0);
+ if (bld->num_lods != bld->coord_type.length)
+ lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+ levelf_bld->type, lod_bias, 0);
lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
}
}
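
For reference, a minimal scalar model (plain C, not gallivm IR, with invented helper names) of the two rho layouts the hunks above choose between. lp_build_pack_aos_scalars(..., 0) produces the per-quad layout: one rho per quad, in a vector of length/4 elements. lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4) produces the per-element (levelf) layout: the quad's rho replicated across its four lanes, keeping the full coord vector length.

static void
pack_per_quad(const float *rho, float *out, unsigned length)
{
   /* like lp_build_pack_aos_scalars(..., 0): take lane 0 of each
      quad, the result has length / 4 elements */
   for (unsigned q = 0; q < length / 4; q++)
      out[q] = rho[q * 4];
}

static void
splat_per_quad(const float *rho, float *out, unsigned length)
{
   /* like lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4):
      broadcast lane 0 across each quad, the result keeps the
      full coord vector length */
   for (unsigned q = 0; q < length / 4; q++)
      for (unsigned i = 0; i < 4; i++)
         out[q * 4 + i] = rho[q * 4];
}
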
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index deb6ef429d5..743dd0a4501 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -1942,7 +1942,9 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
* There are other situations where at least the multiple int lods could be
* avoided like min and max lod being equal.
*/
- if (explicit_lod && lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
+ if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
+ (explicit_lod || lod_bias ||
+ (derivs && static_texture_state->target != PIPE_TEXTURE_CUBE)) &&
((is_fetch && target != PIPE_BUFFER) ||
(!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)))
bld.num_lods = type.length;
@@ -2140,9 +2142,11 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
bld4.levelf_type.length = 1;
bld4.leveli_type = lp_int_type(bld4.levelf_type);
- if (explicit_lod && lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
- ((is_fetch && target != PIPE_BUFFER) ||
- (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)))
+ if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
+ (explicit_lod || lod_bias ||
+ (derivs && static_texture_state->target != PIPE_TEXTURE_CUBE)) &&
+ ((is_fetch && target != PIPE_BUFFER) ||
+ (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)))
bld4.num_lods = type4.length;
else
bld4.num_lods = 1;
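
Both lp_bld_sample_soa.c hunks add the same guard, which boils down to one predicate: compute per-element lods only when some input can actually vary per pixel (explicit lod, shader lod bias, or explicit derivatives on a non-cube target) and the lod result is actually consumed (a texel fetch from a non-buffer resource, or sampling with a mip filter). A simplified sketch, with made-up parameter names standing in for the real texture/sampler state:

static unsigned
choose_num_lods(unsigned vec_length, unsigned per_quad_lods,
                int lod_per_element, int explicit_lod, int lod_bias,
                int noncube_derivs, int is_fetch,
                int target_is_buffer, int mip_filter_none)
{
   /* the lod result is only consumed for non-buffer fetches,
      or for sampling with mipmapping enabled */
   int lod_used = is_fetch ? !target_is_buffer : !mip_filter_none;

   if (lod_per_element &&
       (explicit_lod || lod_bias || noncube_derivs) &&
       lod_used)
      return vec_length;    /* one lod per element (pixel) */

   return per_quad_lods;    /* one lod per quad (1 in the bld4 path) */
}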