3 files changed, 235 insertions, 56 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index dc593aabac4..39c3a2f9d9e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -273,7 +273,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
       cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
       rho = lp_build_mul(rho_bld, cubesize, rho);
    }
-   else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
+   else if (derivs) {
       LLVMValueRef ddmax[3], ddx[3], ddy[3];
       for (i = 0; i < dims; i++) {
          LLVMValueRef floatdim;
@@ -1481,6 +1481,21 @@ lp_build_cube_face(struct lp_build_sample_context *bld,
 }
 
 
+/** Helper for doing 3-wise selection.
+ * Returns sel1 ? val2 : (sel0 ? val0 : val1).
+ */
+static LLVMValueRef
+lp_build_select3(struct lp_build_context *sel_bld,
+                 LLVMValueRef sel0,
+                 LLVMValueRef sel1,
+                 LLVMValueRef val0,
+                 LLVMValueRef val1,
+                 LLVMValueRef val2)
+{
+   LLVMValueRef tmp;
+   tmp = lp_build_select(sel_bld, sel0, val0, val1);
+   return lp_build_select(sel_bld, sel1, val2, tmp);
+}
 
 /**
  * Generate code to do cube face selection and compute per-face texcoords.
@@ -1488,8 +1503,9 @@ lp_build_cube_face(struct lp_build_sample_context *bld,
 void
 lp_build_cube_lookup(struct lp_build_sample_context *bld,
                      LLVMValueRef *coords,
-                     const struct lp_derivatives *derivs, /* optional */
+                     const struct lp_derivatives *derivs_in, /* optional */
                      LLVMValueRef *rho,
+                     struct lp_derivatives *derivs_out, /* optional */
                      boolean need_derivs)
 {
    struct lp_build_context *coord_bld = &bld->coord_bld;
@@ -1512,19 +1528,16 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
        * the edge). Still this is possibly a win over just selecting the same face
        * for all pixels. Unfortunately, something like that doesn't work for
        * explicit derivatives.
-       * TODO: handle explicit derivatives by transforming them alongside coords
-       * somehow.
        */
       struct lp_build_context *cint_bld = &bld->int_coord_bld;
       struct lp_type intctype = cint_bld->type;
       LLVMTypeRef coord_vec_type = coord_bld->vec_type;
       LLVMTypeRef cint_vec_type = cint_bld->vec_type;
-      LLVMValueRef signs, signt, signr, signma;
       LLVMValueRef as, at, ar, face, face_s, face_t;
       LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
       LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
       LLVMValueRef tnegi, rnegi;
-      LLVMValueRef ma, mai, ima;
+      LLVMValueRef ma, mai, signma, signmabit, imahalfpos;
       LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
       LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
                                                      1 << (intctype.width - 1));
@@ -1563,7 +1576,166 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
       maxasat = lp_build_max(coord_bld, as, at);
       ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
 
-      if (need_derivs) {
+      if (need_derivs && (derivs_in ||
+          ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
+           (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX)))) {
+         /*
+          * XXX: This is really really complex.
+          * It is a bit overkill to use this for implicit derivatives as well,
+          * no way this is worth the cost in practice, but seems to be the
+          * only way for getting accurate and per-pixel lod values.
+          */
+         LLVMValueRef ima, imahalf, tmp, ddx[3], ddy[3];
+         LLVMValueRef madx, mady, madxdivma, madydivma;
+         LLVMValueRef sdxi, tdxi, rdxi, sdyi, tdyi, rdyi;
+         LLVMValueRef tdxnegi, rdxnegi, tdynegi, rdynegi;
+         LLVMValueRef sdxnewx, sdxnewy, sdxnewz, tdxnewx, tdxnewy, tdxnewz;
+         LLVMValueRef sdynewx, sdynewy, sdynewz, tdynewx, tdynewy, tdynewz;
+         LLVMValueRef face_sdx, face_tdx, face_sdy, face_tdy;
+         /*
+          * s = 1/2 * ( sc / ma + 1)
+          * t = 1/2 * ( tc / ma + 1)
+          *
+          * s' = 1/2 * (sc' * ma - sc * ma') / ma^2
+          * t' = 1/2 * (tc' * ma - tc * ma') / ma^2
+          *
+          * dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma
+          * dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma
+          * dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma
+          * dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma
+          */
+
+         /* select ma, calculate ima */
+         ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
+         mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
+         signmabit = LLVMBuildAnd(builder, mai, signmask, "");
+         ima = lp_build_div(coord_bld, coord_bld->one, ma);
+         imahalf = lp_build_mul(coord_bld, posHalf, ima);
+         imahalfpos = lp_build_abs(coord_bld, imahalf);
+
+         if (!derivs_in) {
+            ddx[0] = lp_build_ddx(coord_bld, s);
+            ddx[1] = lp_build_ddx(coord_bld, t);
+            ddx[2] = lp_build_ddx(coord_bld, r);
+            ddy[0] = lp_build_ddy(coord_bld, s);
+            ddy[1] = lp_build_ddy(coord_bld, t);
+            ddy[2] = lp_build_ddy(coord_bld, r);
+         }
+         else {
+            ddx[0] = derivs_in->ddx[0];
+            ddx[1] = derivs_in->ddx[1];
+            ddx[2] = derivs_in->ddx[2];
+            ddy[0] = derivs_in->ddy[0];
+            ddy[1] = derivs_in->ddy[1];
+            ddy[2] = derivs_in->ddy[2];
+         }
+
+         /* select major derivatives */
+         madx = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddx[0], ddx[1], ddx[2]);
+         mady = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddy[0], ddy[1], ddy[2]);
+
+         si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
+         ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
+         ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
+
+         sdxi = LLVMBuildBitCast(builder, ddx[0], cint_vec_type, "");
+         tdxi = LLVMBuildBitCast(builder, ddx[1], cint_vec_type, "");
+         rdxi = LLVMBuildBitCast(builder, ddx[2], cint_vec_type, "");
+
+         sdyi = LLVMBuildBitCast(builder, ddy[0], cint_vec_type, "");
+         tdyi = LLVMBuildBitCast(builder, ddy[1], cint_vec_type, "");
+         rdyi = LLVMBuildBitCast(builder, ddy[2], cint_vec_type, "");
+
+         /*
+          * compute all possible new s/t coords, which does the mirroring,
+          * and do the same for derivs minor axes.
+          * snewx = signma * -r;
+          * tnewx = -t;
+          * snewy = s;
+          * tnewy = signma * r;
+          * snewz = signma * s;
+          * tnewz = -t;
+          */
+         tnegi = LLVMBuildXor(builder, ti, signmask, "");
+         rnegi = LLVMBuildXor(builder, ri, signmask, "");
+         tdxnegi = LLVMBuildXor(builder, tdxi, signmask, "");
+         rdxnegi = LLVMBuildXor(builder, rdxi, signmask, "");
+         tdynegi = LLVMBuildXor(builder, tdyi, signmask, "");
+         rdynegi = LLVMBuildXor(builder, rdyi, signmask, "");
+
+         snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
+         tnewx = tnegi;
+         sdxnewx = LLVMBuildXor(builder, signmabit, rdxnegi, "");
+         tdxnewx = tdxnegi;
+         sdynewx = LLVMBuildXor(builder, signmabit, rdynegi, "");
+         tdynewx = tdynegi;
+
+         snewy = si;
+         tnewy = LLVMBuildXor(builder, signmabit, ri, "");
+         sdxnewy = sdxi;
+         tdxnewy = LLVMBuildXor(builder, signmabit, rdxi, "");
+         sdynewy = sdyi;
+         tdynewy = LLVMBuildXor(builder, signmabit, rdyi, "");
+
+         snewz = LLVMBuildXor(builder, signmabit, si, "");
+         tnewz = tnegi;
+         sdxnewz = LLVMBuildXor(builder, signmabit, sdxi, "");
+         tdxnewz = tdxnegi;
+         sdynewz = LLVMBuildXor(builder, signmabit, sdyi, "");
+         tdynewz = tdynegi;
+
+         /* select the mirrored values */
+         face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
+         face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
+         face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
+         face_sdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdxnewx, sdxnewy, sdxnewz);
+         face_tdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdxnewx, tdxnewy, tdxnewz);
+         face_sdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdynewx, sdynewy, sdynewz);
+         face_tdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdynewx, tdynewy, tdynewz);
+
+         face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
+         face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
+         face_sdx = LLVMBuildBitCast(builder, face_sdx, coord_vec_type, "");
+         face_tdx = LLVMBuildBitCast(builder, face_tdx, coord_vec_type, "");
+         face_sdy = LLVMBuildBitCast(builder, face_sdy, coord_vec_type, "");
+         face_tdy = LLVMBuildBitCast(builder, face_tdy, coord_vec_type, "");
+
+         /* deriv math, dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma */
+         madxdivma = lp_build_mul(coord_bld, madx, ima);
+         tmp = lp_build_mul(coord_bld, madxdivma, face_s);
+         tmp = lp_build_sub(coord_bld, face_sdx, tmp);
+         derivs_out->ddx[0] = lp_build_mul(coord_bld, tmp, imahalf);
+
+         /* dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma */
+         tmp = lp_build_mul(coord_bld, madxdivma, face_t);
+         tmp = lp_build_sub(coord_bld, face_tdx, tmp);
+         derivs_out->ddx[1] = lp_build_mul(coord_bld, tmp, imahalf);
+
+         /* dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma */
+         madydivma = lp_build_mul(coord_bld, mady, ima);
+         tmp = lp_build_mul(coord_bld, madydivma, face_s);
+         tmp = lp_build_sub(coord_bld, face_sdy, tmp);
+         derivs_out->ddy[0] = lp_build_mul(coord_bld, tmp, imahalf);
+
+         /* dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma */
+         tmp = lp_build_mul(coord_bld, madydivma, face_t);
+         tmp = lp_build_sub(coord_bld, face_tdy, tmp);
+         derivs_out->ddy[1] = lp_build_mul(coord_bld, tmp, imahalf);
+
+         signma = LLVMBuildLShr(builder, mai, signshift, "");
+         coords[2] = LLVMBuildOr(builder, face, signma, "face");
+
+         /* project coords */
+         face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
+         face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
+
+         coords[0] = lp_build_add(coord_bld, face_s, posHalf);
+         coords[1] = lp_build_add(coord_bld, face_t, posHalf);
+
+         return;
+      }
+
+      else if (need_derivs) {
          LLVMValueRef ddx_ddy[2], tmp[3], rho_vec;
          static const unsigned char swizzle0[] = { /* no-op swizzle */
             0, LP_BLD_SWIZZLE_DONTCARE,
@@ -1590,12 +1762,11 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
           * scale the s/t/r coords pre-select/mirror so we can calculate
           * "reasonable" derivs.
           */
-         ma = lp_build_select(coord_bld, as_ge_at, s, t);
-         ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
-         ima = lp_build_cube_imapos(coord_bld, ma);
-         s = lp_build_mul(coord_bld, s, ima);
-         t = lp_build_mul(coord_bld, t, ima);
-         r = lp_build_mul(coord_bld, r, ima);
+         ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
+         imahalfpos = lp_build_cube_imapos(coord_bld, ma);
+         s = lp_build_mul(coord_bld, s, imahalfpos);
+         t = lp_build_mul(coord_bld, t, imahalfpos);
+         r = lp_build_mul(coord_bld, r, imahalfpos);
 
          /*
           * This isn't quite the same as the "ordinary" (3d deriv) path since we
@@ -1625,56 +1796,41 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
          *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
       }
 
+      if (!need_derivs) {
+         ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
+      }
+      mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
+      signmabit = LLVMBuildAnd(builder, mai, signmask, "");
+
       si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
       ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
       ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
-      signs = LLVMBuildAnd(builder, si, signmask, "");
-      signt = LLVMBuildAnd(builder, ti, signmask, "");
-      signr = LLVMBuildAnd(builder, ri, signmask, "");
 
       /*
-       * compute all possible new s/t coords
-       * snewx = signs * -r;
+       * compute all possible new s/t coords, which does the mirroring
+       * snewx = signma * -r;
        * tnewx = -t;
        * snewy = s;
-       * tnewy = signt * r;
-       * snewz = signr * s;
+       * tnewy = signma * r;
+       * snewz = signma * s;
        * tnewz = -t;
        */
       tnegi = LLVMBuildXor(builder, ti, signmask, "");
       rnegi = LLVMBuildXor(builder, ri, signmask, "");
 
-      snewx = LLVMBuildXor(builder, signs, rnegi, "");
+      snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
       tnewx = tnegi;
 
       snewy = si;
-      tnewy = LLVMBuildXor(builder, signt, ri, "");
+      tnewy = LLVMBuildXor(builder, signmabit, ri, "");
 
-      snewz = LLVMBuildXor(builder, signr, si, "");
+      snewz = LLVMBuildXor(builder, signmabit, si, "");
       tnewz = tnegi;
 
-      /* XXX on x86 unclear if we should cast the values back to float
-       * or not - on some cpus (nehalem) pblendvb has twice the throughput
-       * of blendvps though on others there just might be domain
-       * transition penalties when using it (this depends on what llvm
-       * will chose for the bit ops above so there appears no "right way",
-       * but given the boatload of selects let's just use the int type).
-       */
-
-      /* select/mirror */
-      if (!need_derivs) {
-         ma = lp_build_select(coord_bld, as_ge_at, s, t);
-      }
-      face_s = lp_build_select(cint_bld, as_ge_at, snewx, snewy);
-      face_t = lp_build_select(cint_bld, as_ge_at, tnewx, tnewy);
-      face = lp_build_select(cint_bld, as_ge_at, facex, facey);
-
-      if (!need_derivs) {
-         ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
-      }
-      face_s = lp_build_select(cint_bld, ar_ge_as_at, snewz, face_s);
-      face_t = lp_build_select(cint_bld, ar_ge_as_at, tnewz, face_t);
-      face = lp_build_select(cint_bld, ar_ge_as_at, facez, face);
+      /* select the mirrored values */
+      face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
+      face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
+      face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
 
       face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
       face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
@@ -1684,15 +1840,14 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
        * as long as we ensure vblendvps gets used we can actually
        * skip the comparison and just use sign as a "mask" directly.
        */
-      mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
       signma = LLVMBuildLShr(builder, mai, signshift, "");
       coords[2] = LLVMBuildOr(builder, face, signma, "face");
 
       /* project coords */
       if (!need_derivs) {
-         ima = lp_build_cube_imapos(coord_bld, ma);
-         face_s = lp_build_mul(coord_bld, face_s, ima);
-         face_t = lp_build_mul(coord_bld, face_t, ima);
+         imahalfpos = lp_build_cube_imapos(coord_bld, ma);
+         face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
+         face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
       }
 
       coords[0] = lp_build_add(coord_bld, face_s, posHalf);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 803a99e3b0c..70f03503f0f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -457,8 +457,9 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
 void
 lp_build_cube_lookup(struct lp_build_sample_context *bld,
                      LLVMValueRef *coords,
-                     const struct lp_derivatives *derivs, /* optional */
+                     const struct lp_derivatives *derivs_in, /* optional */
                      LLVMValueRef *rho,
+                     struct lp_derivatives *derivs_out, /* optional */
                      boolean need_derivs);
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 33378bcdcd0..54dee25bfd9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -1387,6 +1387,7 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
    const unsigned target = bld->static_texture_state->target;
    LLVMValueRef first_level, cube_rho = NULL;
    LLVMValueRef lod_ipart = NULL;
+   struct lp_derivatives cube_derivs;
 
    /*
    printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
@@ -1403,7 +1404,8 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
                       mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
                       !bld->static_sampler_state->min_max_lod_equal &&
                       !explicit_lod);
-      lp_build_cube_lookup(bld, coords, derivs, &cube_rho, need_derivs);
+      lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
+      derivs = &cube_derivs;
    }
    else if (target == PIPE_TEXTURE_1D_ARRAY ||
             target == PIPE_TEXTURE_2D_ARRAY) {
@@ -2163,9 +2165,24 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
     * avoided like min and max lod being equal.
     */
    bld.num_mips = bld.num_lods = 1;
-   if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
-       (explicit_lod || lod_bias ||
-        (derivs && static_texture_state->target != PIPE_TEXTURE_CUBE))) {
+
+   if ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
+       (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
+       (static_texture_state->target == PIPE_TEXTURE_CUBE) &&
+       (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
+      /*
+       * special case for using per-pixel lod even for implicit lod,
+       * which is generally never required (ok by APIs) except to please
+       * some (somewhat broken imho) tests (because per-pixel face selection
+       * can cause derivatives to be different for pixels outside the primitive
+       * due to the major axis division even if pre-project derivatives are
+       * looking normal).
+       */
+      bld.num_mips = type.length;
+      bld.num_lods = type.length;
+   }
+   else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
+       (explicit_lod || lod_bias || derivs)) {
       if ((is_fetch && target != PIPE_BUFFER) ||
           (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
          bld.num_mips = type.length;
@@ -2371,9 +2388,15 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
          bld4.texel_type.length = 4;
 
          bld4.num_mips = bld4.num_lods = 1;
+         if ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
+             (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
+             (static_texture_state->target == PIPE_TEXTURE_CUBE) &&
+             (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
+            bld4.num_mips = type4.length;
+            bld4.num_lods = type4.length;
+         }
          if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
-             (explicit_lod || lod_bias ||
-              (derivs && static_texture_state->target != PIPE_TEXTURE_CUBE))) {
+             (explicit_lod || lod_bias || derivs)) {
             if ((is_fetch && target != PIPE_BUFFER) ||
                 (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
                bld4.num_mips = type4.length;