aboutsummaryrefslogtreecommitdiffstats
path: root/src/gallium/auxiliary/gallivm/lp_bld_sample.c
diff options
context:
space:
mode:
authorRoland Scheidegger <[email protected]>2013-04-03 03:26:22 +0200
committerRoland Scheidegger <[email protected]>2013-04-04 01:03:42 +0200
commitf621015cb55ed6901f571710c808113129b1b939 (patch)
tree633c6c8f5d8c6b3f5890180bf0228a9f3a1b7acc /src/gallium/auxiliary/gallivm/lp_bld_sample.c
parentbdfbeb9633eb3f8cf1ad76723f6c3839e57a08a3 (diff)
gallivm: do per-pixel cube face selection (finally!!!)
This proved to be tricky, the problem is that after selection/mirroring we cannot calculate reasonable derivatives (if not all pixels in a quad end up on the same face the derivatives could get "randomly" exceedingly large). However, it is actually quite easy to simply calculate the derivatives before selection/mirroring and then transform them similar to the cube coordinates (they only need selection/projection, but not mirroring as we're not interested in the sign bit, of course). While there is a tiny bit more work to do (need to calculate derivs for 3 coords instead of 2, and additional selects) it also simplifies things somewhat for the coord selection itself (as we save some broadcast aos shuffles, and we don't need to calculate the average vector) - hence if derivatives aren't needed this should actually be faster. Also, this has the benefit that this will (trivially) work for explicit derivatives too, which we completely ignored before that (will be in a separate commit for better trackability). Note that while the way for getting rho looks very different, it should result in "nearly" the same values as before (the "nearly" is only because before the code would choose the face based on an "average" vector and hence the derivatives calculated according to this face, where now (for implicit derivatives) the derivatives are projected on the face selected for the first (top-left) pixel in a quad, so not necessarily the same face). The transformation done might not quite be state-of-the-art, calculating length(dx,dy) as max(dx,dy) certainly isn't either but this stays the same as before (that is I think a better transform would _somehow_ take the "derivative major axis" into account so that derivative changes in the major axis wouldn't get ignored).
Should solve some accuracy problems with cubemaps (can easily be seen with the cubemap demo when switching wrapping/filtering), though we still don't do seamless filtering to fix it completely (so not per-sample but per-pixel is certainly better than per-quad and already sufficient for accurate results with nearest tex filter). As for performance, it seems to be a tiny bit faster too (maybe 3% or so with cubemap demo). Which I'd have expected with nearest/nearest filtering where this will be less instructions, but the difference seems to actually be larger with linear/linear_mipmap_linear where it is slightly more instructions, probably the code appears less serialized allowing better scheduling (on a sandy bridge cpu). It actually seems to be now at least as fast as the old path using a conditional when using 128bit vectors too (that is probably more a result of testing with a newer cpu though), for now that old path is still there but unused. No piglit regressions. Reviewed-by: Jose Fonseca <[email protected]>
Diffstat (limited to 'src/gallium/auxiliary/gallivm/lp_bld_sample.c')
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_sample.c249
1 files changed, 172 insertions, 77 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 9a008971c04..5d5092155cf 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -207,6 +207,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
LLVMValueRef s,
LLVMValueRef t,
LLVMValueRef r,
+ LLVMValueRef cube_rho,
const struct lp_derivatives *derivs)
{
struct gallivm_state *gallivm = bld->gallivm;
@@ -240,8 +241,22 @@ lp_build_rho(struct lp_build_sample_context *bld,
int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
float_size = lp_build_int_to_float(float_size_bld, int_size);
- /* XXX ignoring explicit derivs for cube maps for now */
- if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
+ if (cube_rho) {
+ LLVMValueRef cubesize;
+ LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+ /*
+ * If we have derivs too then we have per-pixel cube_rho - doesn't matter
+ * though until we do per-pixel lod.
+ * Cube map code did already everything except size mul and per-quad extraction.
+ */
+ /* Could optimize this for single quad just skip the broadcast */
+ cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
+ coord_bld->type, float_size, index0);
+ rho_vec = lp_build_mul(coord_bld, cubesize, cube_rho);
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ perquadf_bld->type, rho_vec, 0);
+ }
+ else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
LLVMValueRef ddmax[3];
for (i = 0; i < dims; i++) {
LLVMValueRef ddx, ddy;
@@ -561,6 +576,7 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
LLVMValueRef s,
LLVMValueRef t,
LLVMValueRef r,
+ LLVMValueRef cube_rho,
const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
@@ -594,7 +610,7 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
else {
LLVMValueRef rho;
- rho = lp_build_rho(bld, texture_unit, s, t, r, derivs);
+ rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);
/*
* Compute lod = log2(rho)
@@ -1273,33 +1289,36 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
LLVMValueRef r,
LLVMValueRef *face,
LLVMValueRef *face_s,
- LLVMValueRef *face_t)
+ LLVMValueRef *face_t,
+ LLVMValueRef *rho)
{
struct lp_build_context *coord_bld = &bld->coord_bld;
LLVMBuilderRef builder = bld->gallivm->builder;
struct gallivm_state *gallivm = bld->gallivm;
- LLVMValueRef rx, ry, rz;
- LLVMValueRef tmp[4], rxyz, arxyz;
+ LLVMValueRef si, ti, ri;
+ boolean implicit_derivs = TRUE;
+ boolean need_derivs = TRUE;
- /*
- * Use the average of the four pixel's texcoords to choose the face.
- * Slight simplification just calculate the sum, skip scaling.
- */
- tmp[0] = s;
- tmp[1] = t;
- tmp[2] = r;
- rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
- arxyz = lp_build_abs(&bld->coord_bld, rxyz);
-
- if (coord_bld->type.length > 4) {
+ if (1 || coord_bld->type.length > 4) {
+ /*
+ * Do per-pixel face selection. We cannot however (as we used to do)
+ * simply calculate the derivs afterwards (which is very bogus for
+ * explicit derivs anyway) because the values would be "random" when
+ * not all pixels lie on the same face. Hence just transform the derivs
+ * (or rather only the dmax values), which works both for implicit and
+ * explicit derivatives and doesn't add much math (except need to
+ * calculate derivs for 3 instead of 2 coords and have a couple more selects
+ * but cuts some minor math elsewhere). The derivs don't need mirroring,
+ * just selection, since noone cares about the sign.
+ */
struct lp_build_context *cint_bld = &bld->int_coord_bld;
struct lp_type intctype = cint_bld->type;
- LLVMValueRef signrxs, signrys, signrzs, signrxyz, sign;
- LLVMValueRef arxs, arys, arzs;
- LLVMValueRef arx_ge_ary, maxarxsarys, arz_ge_arx_ary;
+ LLVMValueRef signs, signt, signr, signma;
+ LLVMValueRef as, at, ar;
+ LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
- LLVMValueRef ryneg, rzneg;
- LLVMValueRef ma, ima;
+ LLVMValueRef tnegi, rnegi;
+ LLVMValueRef ma, mai, ima;
LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
1 << (intctype.width - 1));
@@ -1308,54 +1327,102 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
+ LLVMValueRef dmax[3], dmaxsnew, dmaxtnew;
assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
- rx = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
- ry = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
- rz = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
- ryneg = LLVMBuildXor(builder, ry, signmask, "");
- rzneg = LLVMBuildXor(builder, rz, signmask, "");
+ /*
+ * TODO do this only when needed, and implement explicit derivs (trivial).
+ */
+ if (need_derivs && implicit_derivs) {
+ LLVMValueRef ddx_ddy[2], tmp[2];
+ /*
+ * This isn't quite the same as the "ordinary" path since
+ * we need to extract the ds/dt/dr values before further processing.
+ */
+ static const unsigned char swizzle11[] = { /* no-op swizzle */
+ 0, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle12[] = {
+ 2, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle21[] = { /* no-op swizzle */
+ 0, LP_BLD_SWIZZLE_DONTCARE,
+ 2, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle22[] = {
+ 1, LP_BLD_SWIZZLE_DONTCARE,
+ 3, LP_BLD_SWIZZLE_DONTCARE
+ };
+
+ ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
+ ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
+ ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
+ ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
- /* the sign bit comes from the averaged vector (per quad),
- * as does the decision which face to use */
- signrxyz = LLVMBuildBitCast(builder, rxyz, lp_build_vec_type(gallivm, intctype), "");
- signrxyz = LLVMBuildAnd(builder, signrxyz, signmask, "");
+ tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle21);
+ tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle22);
+ dmax[0] = lp_build_max(coord_bld, tmp[0], tmp[1]);
+ dmax[1] = lp_build_swizzle_aos(coord_bld, dmax[0], swizzle12);
- arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0, 4);
- arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1, 4);
- arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2, 4);
+ tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle11);
+ tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle12);
+ dmax[2] = lp_build_max(coord_bld, tmp[0], tmp[1]);
+ }
+ else if (need_derivs) {
+ /* dmax[0] = lp_build_max(coord_bld, derivs->ddx[0], derivs->ddy[0]);
+ dmax[1] = lp_build_max(coord_bld, derivs->ddx[1], derivs->ddy[1]);
+ dmax[2] = lp_build_max(coord_bld, derivs->ddx[2], derivs->ddy[2]); */
+ }
+
+ si = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
+ ti = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
+ ri = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
+
+ /*
+ * get absolute value (for x/y/z face selection) and sign bit
+ * (for mirroring minor coords and pos/neg face selection)
+ * of the original coords.
+ */
+ as = lp_build_abs(&bld->coord_bld, s);
+ at = lp_build_abs(&bld->coord_bld, t);
+ ar = lp_build_abs(&bld->coord_bld, r);
+ signs = LLVMBuildAnd(builder, si, signmask, "");
+ signt = LLVMBuildAnd(builder, ti, signmask, "");
+ signr = LLVMBuildAnd(builder, ri, signmask, "");
/*
- * select x if x >= y else select y
+ * major face determination: select x if x >= y else select y
* select previous result if y >= max(x,y) else select z
*/
- arx_ge_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, arxs, arys);
- maxarxsarys = lp_build_max(coord_bld, arxs, arys);
- arz_ge_arx_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, maxarxsarys, arzs);
+ as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, as, at);
+ maxasat = lp_build_max(coord_bld, as, at);
+ ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, maxasat, ar);
/*
* compute all possible new s/t coords
- * snewx = signrx * -rz;
- * tnewx = -ry;
- * snewy = rx;
- * tnewy = signry * rz;
- * snewz = signrz * rx;
- * tnewz = -ry;
+ * snewx = signs * -r;
+ * tnewx = -t;
+ * snewy = s;
+ * tnewy = signt * r;
+ * snewz = signr * s;
+ * tnewz = -t;
*/
- signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0, 4);
- snewx = LLVMBuildXor(builder, signrxs, rzneg, "");
- tnewx = ryneg;
+ tnegi = LLVMBuildXor(builder, ti, signmask, "");
+ rnegi = LLVMBuildXor(builder, ri, signmask, "");
- signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1, 4);
- snewy = rx;
- tnewy = LLVMBuildXor(builder, signrys, rz, "");
+ snewx = LLVMBuildXor(builder, signs, rnegi, "");
+ tnewx = tnegi;
- signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2, 4);
- snewz = LLVMBuildXor(builder, signrzs, rx, "");
- tnewz = ryneg;
+ snewy = si;
+ tnewy = LLVMBuildXor(builder, signt, ri, "");
+
+ snewz = LLVMBuildXor(builder, signr, si, "");
+ tnewz = tnegi;
/* XXX on x86 unclear if we should cast the values back to float
* or not - on some cpus (nehalem) pblendvb has twice the throughput
@@ -1363,20 +1430,26 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
* transition penalties when using it (this depends on what llvm
* will chose for the bit ops above so there appears no "right way",
* but given the boatload of selects let's just use the int type).
- *
- * Unfortunately we also need the sign bit of the summed coords.
*/
- *face_s = lp_build_select(cint_bld, arx_ge_ary, snewx, snewy);
- *face_t = lp_build_select(cint_bld, arx_ge_ary, tnewx, tnewy);
- ma = lp_build_select(coord_bld, arx_ge_ary, s, t);
- *face = lp_build_select(cint_bld, arx_ge_ary, facex, facey);
- sign = lp_build_select(cint_bld, arx_ge_ary, signrxs, signrys);
-
- *face_s = lp_build_select(cint_bld, arz_ge_arx_ary, *face_s, snewz);
- *face_t = lp_build_select(cint_bld, arz_ge_arx_ary, *face_t, tnewz);
- ma = lp_build_select(coord_bld, arz_ge_arx_ary, ma, r);
- *face = lp_build_select(cint_bld, arz_ge_arx_ary, *face, facez);
- sign = lp_build_select(cint_bld, arz_ge_arx_ary, sign, signrzs);
+
+ /* select/mirror */
+ *face_s = lp_build_select(cint_bld, as_ge_at, snewx, snewy);
+ *face_t = lp_build_select(cint_bld, as_ge_at, tnewx, tnewy);
+ ma = lp_build_select(coord_bld, as_ge_at, s, t);
+ *face = lp_build_select(cint_bld, as_ge_at, facex, facey);
+ if (need_derivs) {
+ dmaxsnew = lp_build_select(coord_bld, as_ge_at, dmax[2], dmax[0]);
+ dmaxtnew = lp_build_select(coord_bld, as_ge_at, dmax[1], dmax[2]);
+ }
+
+ *face_s = lp_build_select(cint_bld, ar_ge_as_at, *face_s, snewz);
+ *face_t = lp_build_select(cint_bld, ar_ge_as_at, *face_t, tnewz);
+ ma = lp_build_select(coord_bld, ar_ge_as_at, ma, r);
+ *face = lp_build_select(cint_bld, ar_ge_as_at, *face, facez);
+ if (need_derivs) {
+ dmaxsnew = lp_build_select(coord_bld, ar_ge_as_at, dmaxsnew, dmax[0]);
+ dmaxtnew = lp_build_select(coord_bld, ar_ge_as_at, dmaxtnew, dmax[1]);
+ }
*face_s = LLVMBuildBitCast(builder, *face_s,
lp_build_vec_type(gallivm, coord_bld->type), "");
@@ -1388,15 +1461,30 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
* as long as we ensure vblendvps gets used we can actually
* skip the comparison and just use sign as a "mask" directly.
*/
- sign = LLVMBuildLShr(builder, sign, signshift, "");
- *face = LLVMBuildOr(builder, *face, sign, "face");
+ mai = LLVMBuildBitCast(builder, ma, lp_build_vec_type(gallivm, intctype), "");
+ signma = LLVMBuildLShr(builder, mai, signshift, "");
+ *face = LLVMBuildOr(builder, *face, signma, "face");
ima = lp_build_cube_imapos(coord_bld, ma);
+ /* project coords */
*face_s = lp_build_mul(coord_bld, *face_s, ima);
*face_s = lp_build_add(coord_bld, *face_s, posHalf);
*face_t = lp_build_mul(coord_bld, *face_t, ima);
*face_t = lp_build_add(coord_bld, *face_t, posHalf);
+
+ /* project derivs */
+ if (need_derivs) {
+ /*
+ * we do some optimization here, since we know it's square
+ * we can do the max before projection (and before size mul,
+ * which the so-called "rho" is missing here).
+ * For explicit derivs this is fully per-pixel vector, for implicit
+ * derivs only the first value per quad contains useful values.
+ */
+ *rho = lp_build_max(coord_bld, dmaxsnew, dmaxtnew);
+ *rho = lp_build_mul(coord_bld, *rho, ima);
+ }
}
else {
@@ -1408,10 +1496,17 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
LLVMValueRef shuffles[4];
LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
+ LLVMValueRef tmp[4], rxyz, arxyz;
struct lp_build_context *float_bld = &bld->float_bld;
assert(bld->coord_bld.type.length == 4);
+ tmp[0] = s;
+ tmp[1] = t;
+ tmp[2] = r;
+ rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
+ arxyz = lp_build_abs(&bld->coord_bld, rxyz);
+
shuffles[0] = lp_build_const_int32(gallivm, 0);
shuffles[1] = lp_build_const_int32(gallivm, 1);
shuffles[2] = lp_build_const_int32(gallivm, 0);
@@ -1450,14 +1545,14 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
{
/* +/- X face */
LLVMValueRef sign, ima;
- rx = LLVMBuildExtractElement(builder, rxyz,
+ si = LLVMBuildExtractElement(builder, rxyz,
lp_build_const_int32(gallivm, 0), "");
/* +/- X face */
- sign = lp_build_sgn(float_bld, rx);
+ sign = lp_build_sgn(float_bld, si);
ima = lp_build_cube_imaneg(coord_bld, s);
*face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
*face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
- *face = lp_build_cube_face(bld, rx,
+ *face = lp_build_cube_face(bld, si,
PIPE_TEX_FACE_POS_X,
PIPE_TEX_FACE_NEG_X);
LLVMBuildStore(builder, *face_s, face_s_var);
@@ -1472,13 +1567,13 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
{
LLVMValueRef sign, ima;
/* +/- Y face */
- ry = LLVMBuildExtractElement(builder, rxyz,
+ ti = LLVMBuildExtractElement(builder, rxyz,
lp_build_const_int32(gallivm, 1), "");
- sign = lp_build_sgn(float_bld, ry);
+ sign = lp_build_sgn(float_bld, ti);
ima = lp_build_cube_imaneg(coord_bld, t);
*face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
*face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
- *face = lp_build_cube_face(bld, ry,
+ *face = lp_build_cube_face(bld, ti,
PIPE_TEX_FACE_POS_Y,
PIPE_TEX_FACE_NEG_Y);
LLVMBuildStore(builder, *face_s, face_s_var);
@@ -1489,13 +1584,13 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
{
/* +/- Z face */
LLVMValueRef sign, ima;
- rz = LLVMBuildExtractElement(builder, rxyz,
+ ri = LLVMBuildExtractElement(builder, rxyz,
lp_build_const_int32(gallivm, 2), "");
- sign = lp_build_sgn(float_bld, rz);
+ sign = lp_build_sgn(float_bld, ri);
ima = lp_build_cube_imaneg(coord_bld, r);
*face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
*face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
- *face = lp_build_cube_face(bld, rz,
+ *face = lp_build_cube_face(bld, ri,
PIPE_TEX_FACE_POS_Z,
PIPE_TEX_FACE_NEG_Z);
LLVMBuildStore(builder, *face_s, face_s_var);