Diffstat (limited to 'src/gallium')
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_sample.c | 113
1 file changed, 74 insertions, 39 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index e1cfd78e885..9b0a92c9cb9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -232,6 +232,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
    unsigned length = coord_bld->type.length;
    unsigned num_quads = length / 4;
    boolean rho_per_quad = rho_bld->type.length != length;
+   boolean no_rho_opt = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1);
    unsigned i;
    LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
    LLVMValueRef rho_xvec, rho_yvec;
@@ -264,12 +265,13 @@ lp_build_rho(struct lp_build_sample_context *bld,
       else {
          rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4);
       }
-      if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
-         rho = lp_build_sqrt(rho_bld, rho);
-      }
       /* Could optimize this for single quad just skip the broadcast */
       cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                             rho_bld->type, float_size, index0);
+      if (no_rho_opt) {
+         /* skipping sqrt hence returning rho squared */
+         cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
+      }
       rho = lp_build_mul(rho_bld, cubesize, rho);
    }
    else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
@@ -281,7 +283,11 @@ lp_build_rho(struct lp_build_sample_context *bld,
 
          floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                                coord_bld->type, float_size, indexi);
-         if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+         /*
+          * note that for the rho_per_quad case we could reduce math (at some
+          * shuffle cost), but for now use the same code as the per-pixel lod case.
+          */
+         if (no_rho_opt) {
             ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
             ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
             ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
@@ -295,7 +301,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
             ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
          }
       }
-      if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+      if (no_rho_opt) {
          rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
          rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
          if (dims > 2) {
@@ -303,19 +309,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
             rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
          }
          rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
-
-         if (rho_per_quad) {
-            /*
-             * note for this case without per-pixel lod could reduce math more
-             * (at some shuffle cost), but for now only do sqrt after packing,
-             * otherwise would also need different code to per-pixel lod case.
-             */
-            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                            rho_bld->type, rho, 0);
-         }
-         rho = lp_build_sqrt(rho_bld, rho);
-
-      }
+         /* skipping sqrt hence returning rho squared */
+      }
       else {
          rho = ddmax[0];
          if (dims > 1) {
@@ -324,13 +319,13 @@ lp_build_rho(struct lp_build_sample_context *bld,
                rho = lp_build_max(coord_bld, rho, ddmax[2]);
             }
          }
-         if (rho_per_quad) {
-            /*
-             * rho_vec contains per-pixel rho, convert to scalar per quad.
-             */
-            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                            rho_bld->type, rho, 0);
-         }
+      }
+      if (rho_per_quad) {
+         /*
+          * rho_vec contains per-pixel rho, convert to scalar per quad.
+          */
+         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+                                         rho_bld->type, rho, 0);
       }
    }
    else {
@@ -362,7 +357,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
          }
       }
 
-      if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+      if (no_rho_opt) {
          static const unsigned char swizzle01[] = { /* no-op swizzle */
             0, 1, LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
          };
@@ -407,16 +402,9 @@ lp_build_rho(struct lp_build_sample_context *bld,
                                             rho_bld->type, rho, 0);
          }
          else {
-            /*
-             * on some cpus with half-speed 8-wide sqrt (e.g. SNB but not IVB)
-             * doing pack/sqrt/unpack/swizzle might be better for 8-wide case,
-             * same is true for cpus having faster scalars than 4-wide vecs
-             * for 4-wide case (where pack/unpack would be no-ops anyway).
-             * (Same is true really for cube_rho case above.)
-             */
             rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
          }
-         rho = lp_build_sqrt(rho_bld, rho);
+         /* skipping sqrt hence returning rho squared */
       }
       else {
          ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
@@ -636,7 +624,7 @@ lp_build_brilinear_rho(struct lp_build_context *bld,
 
    /*
     * The pre factor will make the intersections with the exact powers of two
-    * happen precisely where we want then to be, which means that the integer
+    * happen precisely where we want them to be, which means that the integer
     * part will not need any post adjustments.
     */
    rho = lp_build_mul(bld, rho,
@@ -666,6 +654,34 @@
 
 
 /**
+ * Fast implementation of iround(log2(sqrt(x))), based on
+ * log2(x^n) == n*log2(x).
+ *
+ * Gives accurate results all the time.
+ * (Could be trivially extended to handle other power-of-two roots.)
+ */
+static LLVMValueRef
+lp_build_ilog2_sqrt(struct lp_build_context *bld,
+                    LLVMValueRef x)
+{
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   LLVMValueRef ipart;
+   struct lp_type i_type = lp_int_type(bld->type);
+   LLVMValueRef one = lp_build_const_int_vec(bld->gallivm, i_type, 1);
+
+   assert(bld->type.floating);
+
+   assert(lp_check_value(bld->type, x));
+
+   /* ipart = log2(x) + 0.5 = 0.5*(log2(x^2) + 1.0) */
+   ipart = lp_build_extract_exponent(bld, x, 1);
+   ipart = LLVMBuildAShr(builder, ipart, one, "");
+
+   return ipart;
+}
+
+
+/**
  * Generate code to compute texture level of detail (lambda).
  * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
  * \param lod_bias optional float vector with the shader lod bias
@@ -740,6 +756,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
    }
    else {
       LLVMValueRef rho;
+      boolean rho_squared = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
+                            (bld->dims > 1);
 
       rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);
 
@@ -760,16 +778,28 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
       if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
           mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
         /*
-         * Don't actually need both all the time, ipart is needed
-         * for nearest mipfilter, pos_or_zero if min != mag.
+         * Don't actually need both values all the time, lod_ipart is
+         * needed for nearest mipfilter, lod_positive if min != mag.
          */
-         *out_lod_ipart = lp_build_ilog2(lodf_bld, rho);
+         if (rho_squared) {
+            *out_lod_ipart = lp_build_ilog2_sqrt(lodf_bld, rho);
+         }
+         else {
+            *out_lod_ipart = lp_build_ilog2(lodf_bld, rho);
+         }
          *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                           rho, lodf_bld->one);
          return;
       }
       if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
-          !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
+          !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR) &&
+          !rho_squared) {
+         /*
+          * This can't work if rho is squared. Not sure if it could be
+          * fixed while keeping it worthwhile; could also do sqrt here,
+          * but brilinear plus no_rho_opt seems like a combination that
+          * makes little sense anyway, so just use the ordinary path below.
+          */
          lp_build_brilinear_rho(lodf_bld, rho, BRILINEAR_FACTOR,
                                 out_lod_ipart, out_lod_fpart);
          *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
@@ -784,6 +814,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
       else {
          lod = lp_build_fast_log2(lodf_bld, rho);
       }
+      if (rho_squared) {
+         /* log2(x) == 0.5*log2(x^2) */
+         lod = lp_build_mul(lodf_bld, lod,
+                            lp_build_const_vec(bld->gallivm, lodf_bld->type, 0.5F));
+      }
 
       /* add shader lod bias */
       if (lod_bias) {
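
Aside (not part of the commit): the trick that the new lp_build_ilog2_sqrt() helper emits as LLVM IR can be illustrated with a small scalar C sketch. It assumes a normal, positive IEEE-754 binary32 input, and the names used (ilog2_sqrt, the demo rho value) are made up for illustration. Because lp_build_rho() now hands back rho squared in the no_rho_opt path, iround(log2(sqrt(x))) is read straight off the float exponent of x = rho*rho, so no square root is ever taken.

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Scalar equivalent of what lp_build_ilog2_sqrt() generates:
 * iround(log2(sqrt(x))) for x = rho^2, using only the exponent bits. */
static int
ilog2_sqrt(float x)
{
   uint32_t bits;
   int exp_plus_one;

   assert(x > 0.0f && isfinite(x));
   memcpy(&bits, &x, sizeof bits);               /* reinterpret float as bits */
   /* biased exponent minus (127 - 1) == floor(log2(x)) + 1,
    * mirroring lp_build_extract_exponent(bld, x, 1) */
   exp_plus_one = (int)((bits >> 23) & 0xff) - 126;
   /* arithmetic shift right by one, like the LLVMBuildAShr in the patch:
    * floor((floor(log2(x)) + 1) / 2) == floor(0.5*log2(x) + 0.5) */
   return exp_plus_one >> 1;
}

int
main(void)
{
   float rho = 12.5f;              /* some made-up per-pixel rho */
   float rho2 = rho * rho;         /* what lp_build_rho() now returns */
   int reference = (int)floorf(0.5f * log2f(rho2) + 0.5f);

   /* both print 4: exponent trick vs. explicit 0.5*log2 + round */
   printf("%d %d\n", ilog2_sqrt(rho2), reference);
   return 0;
}

The same identity explains the PIPE_TEX_MIPFILTER_LINEAR path in the patch: since log2(rho^2) == 2*log2(rho), multiplying the computed lod by 0.5 recovers log2(rho) without ever evaluating sqrt(rho).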