/************************************************************************** * * Copyright 2009-2010 VMware, Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sub license, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * **************************************************************************/ /** * @file * Depth/stencil testing to LLVM IR translation. * * To be done accurately/efficiently the depth/stencil test must be done with * the same type/format of the depth/stencil buffer, which implies massaging * the incoming depths to fit into place. Using a more straightforward * type/format for depth/stencil values internally and only convert when * flushing would avoid this, but it would most likely result in depth fighting * artifacts. 
 *
 * Since we're using linear layout for everything, but we need to deal with
 * 2x2 quads, we need to load/store multiple values and swizzle them into
 * place (we could avoid this by doing depth/stencil testing in linear format,
 * which would be easy for late depth/stencil test as we could do that after
 * the fragment shader loop just as we do for color buffers, but more tricky
 * for early depth test as we'd need both masks and interpolated depth in
 * linear format).
 *
 *
 * @author Jose Fonseca
 * @author Brian Paul
 */

#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/u_cpu_detect.h"

#include "gallivm/lp_bld_type.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_bitarit.h"
#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_conv.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_debug.h"
#include "gallivm/lp_bld_swizzle.h"
#include "gallivm/lp_bld_pack.h"

#include "lp_bld_depth.h"


/** Used to select fields from pipe_stencil_state (fail_op/zfail_op/zpass_op) */
enum stencil_op {
   S_FAIL_OP,
   Z_FAIL_OP,
   Z_PASS_OP
};


/**
 * Do the stencil test comparison (compare FB stencil values against ref value).
 * This will be used twice when generating two-sided stencil code.
 *
 * \param bld          build context; bld->type is the per-element stencil type
 * \param stencil      the front/back stencil state
 * \param stencilRef   the stencil reference value, replicated as a vector
 * \param stencilVals  vector of stencil values from framebuffer
 * \return vector mask of pass/fail values (~0 or 0)
 */
static LLVMValueRef
lp_build_stencil_test_single(struct lp_build_context *bld,
                             const struct pipe_stencil_state *stencil,
                             LLVMValueRef stencilRef,
                             LLVMValueRef stencilVals)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   /* Stencil buffers are assumed 8-bit here. XXX fix */
   const unsigned stencilMax = 255;
   struct lp_type type = bld->type;
   LLVMValueRef res;

   /*
    * SSE2 has intrinsics for signed comparisons, but not unsigned ones. Values
    * are between 0..255 so ensure we generate the fastest comparisons for
    * wider elements.
    */
   if (type.width <= 8) {
      assert(!type.sign);
   } else {
      assert(type.sign);
   }

   assert(stencil->enabled);

   /* Only mask when the valuemask actually excludes bits (common case skips
    * the two ANDs entirely). */
   if (stencil->valuemask != stencilMax) {
      /* compute stencilRef = stencilRef & valuemask */
      LLVMValueRef valuemask = lp_build_const_int_vec(bld->gallivm, type, stencil->valuemask);
      stencilRef = LLVMBuildAnd(builder, stencilRef, valuemask, "");
      /* compute stencilVals = stencilVals & valuemask */
      stencilVals = LLVMBuildAnd(builder, stencilVals, valuemask, "");
   }

   /* Per-element comparison using the state's PIPE_FUNC_x comparator;
    * note the operand order: ref is compared against the fb values. */
   res = lp_build_cmp(bld, stencil->func, stencilRef, stencilVals);

   return res;
}


/**
 * Do the one or two-sided stencil test comparison.
 * \sa lp_build_stencil_test_single
 * \param front_facing  an integer vector mask, indicating front (~0) or back
 *                      (0) facing polygon. If NULL, assume front-facing.
 * \return vector mask of pass/fail values (~0 or 0)
 */
static LLVMValueRef
lp_build_stencil_test(struct lp_build_context *bld,
                      const struct pipe_stencil_state stencil[2],
                      LLVMValueRef stencilRefs[2],
                      LLVMValueRef stencilVals,
                      LLVMValueRef front_facing)
{
   LLVMValueRef res;

   assert(stencil[0].enabled);

   /* do front face test */
   res = lp_build_stencil_test_single(bld, &stencil[0],
                                      stencilRefs[0], stencilVals);

   if (stencil[1].enabled && front_facing != NULL) {
      /* do back face test */
      LLVMValueRef back_res;

      back_res = lp_build_stencil_test_single(bld, &stencil[1],
                                              stencilRefs[1], stencilVals);

      /* select per element between the front and back results */
      res = lp_build_select(bld, front_facing, res, back_res);
   }

   return res;
}


/**
 * Apply the stencil operator (add/sub/keep/etc) to the given vector
 * of stencil values.
 * \return new stencil values vector
 */
static LLVMValueRef
lp_build_stencil_op_single(struct lp_build_context *bld,
                           const struct pipe_stencil_state *stencil,
                           enum stencil_op op,
                           LLVMValueRef stencilRef,
                           LLVMValueRef stencilVals)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_type type = bld->type;
   LLVMValueRef res;
   /* 8-bit stencil assumed: clamp/wrap ceiling is 0xff */
   LLVMValueRef max = lp_build_const_int_vec(bld->gallivm, type, 0xff);
   unsigned stencil_op;

   assert(type.sign);

   /* Pick the PIPE_STENCIL_OP_x value for the requested stage */
   switch (op) {
   case S_FAIL_OP:
      stencil_op = stencil->fail_op;
      break;
   case Z_FAIL_OP:
      stencil_op = stencil->zfail_op;
      break;
   case Z_PASS_OP:
      stencil_op = stencil->zpass_op;
      break;
   default:
      assert(0 && "Invalid stencil_op mode");
      stencil_op = PIPE_STENCIL_OP_KEEP;
   }

   switch (stencil_op) {
   case PIPE_STENCIL_OP_KEEP:
      res = stencilVals;
      /* we can return early for this case */
      return res;
   case PIPE_STENCIL_OP_ZERO:
      res = bld->zero;
      break;
   case PIPE_STENCIL_OP_REPLACE:
      res = stencilRef;
      break;
   case PIPE_STENCIL_OP_INCR:
      /* saturating increment: clamp at 0xff */
      res = lp_build_add(bld, stencilVals, bld->one);
      res = lp_build_min(bld, res, max);
      break;
   case PIPE_STENCIL_OP_DECR:
      /* saturating decrement: clamp at 0 */
      res = lp_build_sub(bld, stencilVals, bld->one);
      res = lp_build_max(bld, res, bld->zero);
      break;
   case PIPE_STENCIL_OP_INCR_WRAP:
      /* wrapping increment: keep only the low 8 bits */
      res = lp_build_add(bld, stencilVals, bld->one);
      res = LLVMBuildAnd(builder, res, max, "");
      break;
   case PIPE_STENCIL_OP_DECR_WRAP:
      /* wrapping decrement: keep only the low 8 bits */
      res = lp_build_sub(bld, stencilVals, bld->one);
      res = LLVMBuildAnd(builder, res, max, "");
      break;
   case PIPE_STENCIL_OP_INVERT:
      /* bitwise invert, masked back to 8 bits */
      res = LLVMBuildNot(builder, stencilVals, "");
      res = LLVMBuildAnd(builder, res, max, "");
      break;
   default:
      assert(0 && "bad stencil op mode");
      res = bld->undef;
   }

   return res;
}


/**
 * Do the one or two-sided stencil test op/update.
 *
 * \param mask  vector mask of fragments for which the op should take effect;
 *              masked-out lanes keep their original stencil values.
 * \return new stencil values vector
 */
static LLVMValueRef
lp_build_stencil_op(struct lp_build_context *bld,
                    const struct pipe_stencil_state stencil[2],
                    enum stencil_op op,
                    LLVMValueRef stencilRefs[2],
                    LLVMValueRef stencilVals,
                    LLVMValueRef mask,
                    LLVMValueRef front_facing)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;

   assert(stencil[0].enabled);

   /* do front face op */
   res = lp_build_stencil_op_single(bld, &stencil[0], op,
                                    stencilRefs[0], stencilVals);

   if (stencil[1].enabled && front_facing != NULL) {
      /* do back face op */
      LLVMValueRef back_res;

      back_res = lp_build_stencil_op_single(bld, &stencil[1], op,
                                            stencilRefs[1], stencilVals);

      res = lp_build_select(bld, front_facing, res, back_res);
   }

   /* Writemask handling: only fold the writemask into the fragment mask
    * when some writemask (front, or the active back one) excludes bits. */
   if (stencil[0].writemask != 0xff ||
       (stencil[1].enabled && front_facing != NULL && stencil[1].writemask != 0xff)) {
      /* mask &= stencil[0].writemask */
      LLVMValueRef writemask = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                      stencil[0].writemask);
      if (stencil[1].enabled && stencil[1].writemask != stencil[0].writemask && front_facing != NULL) {
         LLVMValueRef back_writemask = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                              stencil[1].writemask);
         writemask = lp_build_select(bld, front_facing, writemask, back_writemask);
      }

      mask = LLVMBuildAnd(builder, mask, writemask, "");
      /* res = (res & mask) | (stencilVals & ~mask) */
      res = lp_build_select_bitwise(bld, mask, res, stencilVals);
   }
   else {
      /* res = mask ? res : stencilVals */
      res = lp_build_select(bld, mask, res, stencilVals);
   }

   return res;
}


/**
 * Return a type that matches the depth/stencil format.
 */
struct lp_type
lp_depth_type(const struct util_format_description *format_desc,
              unsigned length)
{
   struct lp_type type;
   unsigned z_swizzle;

   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
   assert(format_desc->block.width == 1);
   assert(format_desc->block.height == 1);

   /* Element width covers the whole depth/stencil block */
   memset(&type, 0, sizeof type);
   type.width = format_desc->block.bits;

   z_swizzle = format_desc->swizzle[0];
   if (z_swizzle < 4) {
      if (format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
         /* only 32-bit float Z in channel 0 is supported */
         type.floating = TRUE;
         assert(z_swizzle == 0);
         assert(format_desc->channel[z_swizzle].size == 32);
      }
      else if(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
         assert(format_desc->block.bits <= 32);
         assert(format_desc->channel[z_swizzle].normalized);
         if (format_desc->channel[z_swizzle].size < format_desc->block.bits) {
            /* Prefer signed integers when possible, as SSE has less support
             * for unsigned comparison;
             */
            type.sign = TRUE;
         }
      }
      else
         assert(0);
   }

   type.length = length;

   return type;
}


/**
 * Compute bitmask and bit shift to apply to the incoming fragment Z values
 * and the Z buffer values needed before doing the Z comparison.
 *
 * Note that we leave the Z bits in the position that we find them
 * in the Z buffer (typically 0xffffff00 or 0x00ffffff).  That lets us
 * get by with fewer bit twiddling steps.
 *
 * \param shift  [out] bit shift of the Z channel within the block
 * \param width  [out] bit width of the Z channel
 * \param mask   [out] mask selecting the Z bits (in place, not shifted down)
 * \return FALSE if the format has no Z channel
 */
static boolean
get_z_shift_and_mask(const struct util_format_description *format_desc,
                     unsigned *shift, unsigned *width, unsigned *mask)
{
   unsigned total_bits;
   unsigned z_swizzle;

   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
   assert(format_desc->block.width == 1);
   assert(format_desc->block.height == 1);

   /* 64bit d/s format is special already extracted 32 bits */
   total_bits = format_desc->block.bits > 32 ?
32 : format_desc->block.bits; z_swizzle = format_desc->swizzle[0]; if (z_swizzle == PIPE_SWIZZLE_NONE) return FALSE; *width = format_desc->channel[z_swizzle].size; /* & 31 is for the same reason as the 32-bit limit above */ *shift = format_desc->channel[z_swizzle].shift & 31; if (*width == total_bits) { *mask = 0xffffffff; } else { *mask = ((1 << *width) - 1) << *shift; } return TRUE; } /** * Compute bitmask and bit shift to apply to the framebuffer pixel values * to put the stencil bits in the least significant position. * (i.e. 0x000000ff) */ static boolean get_s_shift_and_mask(const struct util_format_description *format_desc, unsigned *shift, unsigned *mask) { unsigned s_swizzle; unsigned sz; s_swizzle = format_desc->swizzle[1]; if (s_swizzle == PIPE_SWIZZLE_NONE) return FALSE; /* just special case 64bit d/s format */ if (format_desc->block.bits > 32) { /* XXX big-endian? */ assert(format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); *shift = 0; *mask = 0xff; return TRUE; } *shift = format_desc->channel[s_swizzle].shift; sz = format_desc->channel[s_swizzle].size; *mask = (1U << sz) - 1U; return TRUE; } /** * Perform the occlusion test and increase the counter. * Test the depth mask. Add the number of channel which has none zero mask * into the occlusion counter. e.g. maskvalue is {-1, -1, -1, -1}. * The counter will add 4. * TODO: could get that out of the fs loop. * * \param type holds element type of the mask vector. * \param maskvalue is the depth test mask. * \param counter is a pointer of the uint32 counter. 
 */
void
lp_build_occlusion_count(struct gallivm_state *gallivm,
                         struct lp_type type,
                         LLVMValueRef maskvalue,
                         LLVMValueRef counter)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMContextRef context = gallivm->context;
   /* per-element 1, used to reduce each mask lane to a single bit (scalar path) */
   LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1);
   LLVMValueRef count, newcount;

   assert(type.length <= 16);
   assert(type.floating);

   if(util_cpu_caps.has_sse && type.length == 4) {
      /* 4-wide SSE: movmskps extracts the 4 sign bits, ctpop counts them */
      const char *movmskintr = "llvm.x86.sse.movmsk.ps";
      const char *popcntintr = "llvm.ctpop.i32";
      LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
                                           lp_build_vec_type(gallivm, type), "");
      bits = lp_build_intrinsic_unary(builder, movmskintr,
                                      LLVMInt32TypeInContext(context), bits);
      count = lp_build_intrinsic_unary(builder, popcntintr,
                                       LLVMInt32TypeInContext(context), bits);
      count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
   }
   else if(util_cpu_caps.has_avx && type.length == 8) {
      /* 8-wide AVX: same scheme with the 256-bit movmsk variant */
      const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
      const char *popcntintr = "llvm.ctpop.i32";
      LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
                                           lp_build_vec_type(gallivm, type), "");
      bits = lp_build_intrinsic_unary(builder, movmskintr,
                                      LLVMInt32TypeInContext(context), bits);
      count = lp_build_intrinsic_unary(builder, popcntintr,
                                       LLVMInt32TypeInContext(context), bits);
      count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
   }
   else {
      /* Generic path: mask each lane down to bit 0, gather one byte per lane
       * via a shuffle, reinterpret the bytes as a single wide integer, and
       * popcount that. */
      unsigned i;
      LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
      LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8);
      LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4);
      LLVMValueRef shufflev, countd;
      LLVMValueRef shuffles[16];
      const char *popcntintr = NULL;

      countv = LLVMBuildBitCast(builder, countv, i8vntype, "");

      /* pick the least significant byte of each 32-bit lane
       * (NOTE(review): assumes little-endian byte order — the existing
       * XXX big-endian remark elsewhere in this file suggests the same) */
      for (i = 0; i < type.length; i++) {
         shuffles[i] = lp_build_const_int32(gallivm, 4*i);
      }

      shufflev = LLVMConstVector(shuffles, type.length);
      countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype),
                                      shufflev, "");
      countd = LLVMBuildBitCast(builder, countd, counttype, "countd");

      /*
       * XXX FIXME
       * this is bad on cpus without popcount (on x86 supported by intel
       * nehalem, amd barcelona, and up - not tied to sse42).
       * Would be much faster to just sum the 4 elements of the vector with
       * some horizontal add (shuffle/add/shuffle/add after the initial and).
       */
      switch (type.length) {
      case 4:
         popcntintr = "llvm.ctpop.i32";
         break;
      case 8:
         popcntintr = "llvm.ctpop.i64";
         break;
      case 16:
         popcntintr = "llvm.ctpop.i128";
         break;
      default:
         assert(0);
      }
      count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd);

      /* normalize the popcount result to i64 for the accumulate below */
      if (type.length > 8) {
         count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 64), "");
      }
      else if (type.length < 8) {
         count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
      }
   }
   /* counter += count (read-modify-write of the in-memory counter) */
   newcount = LLVMBuildLoad(builder, counter, "origcount");
   newcount = LLVMBuildAdd(builder, newcount, count, "newcount");
   LLVMBuildStore(builder, newcount, counter);
}


/**
 * Load depth/stencil values.
 * The stored values are linear, swizzle them.
* * \param type the data type of the fragment depth/stencil values * \param format_desc description of the depth/stencil surface * \param is_1d whether this resource has only one dimension * \param loop_counter the current loop iteration * \param depth_ptr pointer to the depth/stencil values of this 4x4 block * \param depth_stride stride of the depth/stencil buffer * \param z_fb contains z values loaded from fb (may include padding) * \param s_fb contains s values loaded from fb (may include padding) */ void lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm, struct lp_type z_src_type, const struct util_format_description *format_desc, boolean is_1d, LLVMValueRef depth_ptr, LLVMValueRef depth_stride, LLVMValueRef *z_fb, LLVMValueRef *s_fb, LLVMValueRef loop_counter) { LLVMBuilderRef builder = gallivm->builder; LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4]; LLVMValueRef zs_dst1, zs_dst2; LLVMValueRef zs_dst_ptr; LLVMValueRef depth_offset1, depth_offset2; LLVMTypeRef load_ptr_type; unsigned depth_bytes = format_desc->block.bits / 8; struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length); struct lp_type zs_load_type = zs_type; zs_load_type.length = zs_load_type.length / 2; load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0); if (z_src_type.length == 4) { unsigned i; LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter, lp_build_const_int32(gallivm, 1), ""); LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter, lp_build_const_int32(gallivm, 2), ""); LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb, depth_stride, ""); depth_offset1 = LLVMBuildMul(builder, looplsb, lp_build_const_int32(gallivm, depth_bytes * 2), ""); depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, ""); /* just concatenate the loaded 2x2 values into 4-wide vector */ for (i = 0; i < 4; i++) { shuffles[i] = lp_build_const_int32(gallivm, i); } } else { unsigned i; LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter, 
lp_build_const_int32(gallivm, 1), ""); assert(z_src_type.length == 8); depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, ""); /* * We load 2x4 values, and need to swizzle them (order * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately. */ for (i = 0; i < 8; i++) { shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2); } } depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, ""); /* Load current z/stencil values from z/stencil buffer */ zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, ""); zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, ""); zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, ""); if (is_1d) { zs_dst2 = lp_build_undef(gallivm, zs_load_type); } else { zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, ""); zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, ""); zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, ""); } *z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2, LLVMConstVector(shuffles, zs_type.length), ""); *s_fb = *z_fb; if (format_desc->block.bits < z_src_type.width) { /* Extend destination ZS values (e.g., when reading from Z16_UNORM) */ *z_fb = LLVMBuildZExt(builder, *z_fb, lp_build_int_vec_type(gallivm, z_src_type), ""); } else if (format_desc->block.bits > 32) { /* rely on llvm to handle too wide vector we have here nicely */ unsigned i; struct lp_type typex2 = zs_type; struct lp_type s_type = zs_type; LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 4]; LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 4]; LLVMValueRef tmp; typex2.width = typex2.width / 2; typex2.length = typex2.length * 2; s_type.width = s_type.width / 2; s_type.floating = 0; tmp = LLVMBuildBitCast(builder, *z_fb, lp_build_vec_type(gallivm, typex2), ""); for (i = 0; i < zs_type.length; i++) { shuffles1[i] = lp_build_const_int32(gallivm, i * 2); shuffles2[i] = lp_build_const_int32(gallivm, i * 2 + 1); } *z_fb = LLVMBuildShuffleVector(builder, tmp, tmp, 
LLVMConstVector(shuffles1, zs_type.length), ""); *s_fb = LLVMBuildShuffleVector(builder, tmp, tmp, LLVMConstVector(shuffles2, zs_type.length), ""); *s_fb = LLVMBuildBitCast(builder, *s_fb, lp_build_vec_type(gallivm, s_type), ""); lp_build_name(*s_fb, "s_dst"); } lp_build_name(*z_fb, "z_dst"); lp_build_name(*s_fb, "s_dst"); lp_build_name(*z_fb, "z_dst"); } /** * Store depth/stencil values. * Incoming values are swizzled (typically n 2x2 quads), stored linear. * If there's a mask it will do select/store otherwise just store. * * \param type the data type of the fragment depth/stencil values * \param format_desc description of the depth/stencil surface * \param is_1d whether this resource has only one dimension * \param mask the alive/dead pixel mask for the quad (vector) * \param z_fb z values read from fb (with padding) * \param s_fb s values read from fb (with padding) * \param loop_counter the current loop iteration * \param depth_ptr pointer to the depth/stencil values of this 4x4 block * \param depth_stride stride of the depth/stencil buffer * \param z_value the depth values to store (with padding) * \param s_value the stencil values to store (with padding) */ void lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm, struct lp_type z_src_type, const struct util_format_description *format_desc, boolean is_1d, struct lp_build_mask_context *mask, LLVMValueRef z_fb, LLVMValueRef s_fb, LLVMValueRef loop_counter, LLVMValueRef depth_ptr, LLVMValueRef depth_stride, LLVMValueRef z_value, LLVMValueRef s_value) { struct lp_build_context z_bld; LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4]; LLVMBuilderRef builder = gallivm->builder; LLVMValueRef mask_value = NULL; LLVMValueRef zs_dst1, zs_dst2; LLVMValueRef zs_dst_ptr1, zs_dst_ptr2; LLVMValueRef depth_offset1, depth_offset2; LLVMTypeRef load_ptr_type; unsigned depth_bytes = format_desc->block.bits / 8; struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length); struct lp_type z_type = zs_type; 
   struct lp_type zs_load_type = zs_type;

   /* each of the two stores covers half the lanes (one row of the quad(s)) */
   zs_load_type.length = zs_load_type.length / 2;
   load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);

   z_type.width = z_src_type.width;

   lp_build_context_init(&z_bld, gallivm, z_type);

   /*
    * This is far from ideal, at least for late depth write we should do this
    * outside the fs loop to avoid all the swizzle stuff.
    */
   if (z_src_type.length == 4) {
      /* loop bit 0 selects the horizontal 2x1 pair, bit 1 the row pair */
      LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
                                          lp_build_const_int32(gallivm, 1), "");
      LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
                                          lp_build_const_int32(gallivm, 2), "");
      LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
                                          depth_stride, "");
      depth_offset1 = LLVMBuildMul(builder, looplsb,
                                   lp_build_const_int32(gallivm, depth_bytes * 2), "");
      depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
   }
   else {
      unsigned i;
      LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
                                         lp_build_const_int32(gallivm, 1), "");
      assert(z_src_type.length == 8);
      depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");
      /*
       * We load 2x4 values, and need to swizzle them (order
       * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
       */
      for (i = 0; i < 8; i++) {
         shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
      }
   }

   depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");

   /* pointers to the two linear rows being written */
   zs_dst_ptr1 = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
   zs_dst_ptr1 = LLVMBuildBitCast(builder, zs_dst_ptr1, load_ptr_type, "");
   zs_dst_ptr2 = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
   zs_dst_ptr2 = LLVMBuildBitCast(builder, zs_dst_ptr2, load_ptr_type, "");

   if (format_desc->block.bits > 32) {
      s_value = LLVMBuildBitCast(builder, s_value, z_bld.vec_type, "");
   }

   if (mask) {
      /* keep the framebuffer values for dead fragments */
      mask_value = lp_build_mask_value(mask);
      z_value = lp_build_select(&z_bld, mask_value, z_value, z_fb);
      if (format_desc->block.bits > 32) {
         s_fb = LLVMBuildBitCast(builder, s_fb, z_bld.vec_type, "");
         s_value = lp_build_select(&z_bld, mask_value, s_value, s_fb);
      }
   }

   if (zs_type.width < z_src_type.width) {
      /* Truncate ZS values (e.g., when writing to Z16_UNORM) */
      z_value = LLVMBuildTrunc(builder, z_value,
                               lp_build_int_vec_type(gallivm, zs_type), "");
   }

   if (format_desc->block.bits <= 32) {
      /* z_value already carries combined z+s bits; split it into two rows */
      if (z_src_type.length == 4) {
         zs_dst1 = lp_build_extract_range(gallivm, z_value, 0, 2);
         zs_dst2 = lp_build_extract_range(gallivm, z_value, 2, 2);
      }
      else {
         assert(z_src_type.length == 8);
         zs_dst1 = LLVMBuildShuffleVector(builder, z_value, z_value,
                                          LLVMConstVector(&shuffles[0],
                                                          zs_load_type.length), "");
         zs_dst2 = LLVMBuildShuffleVector(builder, z_value, z_value,
                                          LLVMConstVector(&shuffles[4],
                                                          zs_load_type.length), "");
      }
   }
   else {
      /* 64-bit z/s: re-interleave separate z and s vectors element-wise */
      if (z_src_type.length == 4) {
         zs_dst1 = lp_build_interleave2(gallivm, z_type,
                                        z_value, s_value, 0);
         zs_dst2 = lp_build_interleave2(gallivm, z_type,
                                        z_value, s_value, 1);
      }
      else {
         unsigned i;
         LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 2];
         assert(z_src_type.length == 8);
         /* interleave z (from first operand) and s (from second operand),
          * undoing the 0,1,4,5,2,3,6,7 quad swizzle at the same time */
         for (i = 0; i < 8; i++) {
            shuffles[i*2] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
            shuffles[i*2+1] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2 +
                                                   z_src_type.length);
         }
         zs_dst1 = LLVMBuildShuffleVector(builder, z_value, s_value,
                                          LLVMConstVector(&shuffles[0],
                                                          z_src_type.length), "");
         zs_dst2 = LLVMBuildShuffleVector(builder, z_value, s_value,
                                          LLVMConstVector(&shuffles[8],
                                                          z_src_type.length), "");
      }
      zs_dst1 = LLVMBuildBitCast(builder, zs_dst1,
                                 lp_build_vec_type(gallivm, zs_load_type), "");
      zs_dst2 = LLVMBuildBitCast(builder, zs_dst2,
                                 lp_build_vec_type(gallivm, zs_load_type), "");
   }

   LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1);
   if (!is_1d) {
      /* 1D resources have no second row */
      LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2);
   }
}


/**
 * Generate code for performing depth and/or stencil tests.
 * We operate on a vector of values (typically n 2x2 quads).
 *
 * \param depth the depth test state
 * \param stencil the front/back stencil state
 * \param type the data type of the fragment depth/stencil values
 * \param format_desc description of the depth/stencil surface
 * \param mask the alive/dead pixel mask for the quad (vector)
 * \param stencil_refs the front/back stencil ref values (scalar)
 * \param z_src the incoming depth/stencil values (n 2x2 quad values, float32)
 * \param zs_dst the depth/stencil values in framebuffer
 * \param face contains boolean value indicating front/back facing polygon
 */
void
lp_build_depth_stencil_test(struct gallivm_state *gallivm,
                            const struct pipe_depth_state *depth,
                            const struct pipe_stencil_state stencil[2],
                            struct lp_type z_src_type,
                            const struct util_format_description *format_desc,
                            struct lp_build_mask_context *mask,
                            LLVMValueRef stencil_refs[2],
                            LLVMValueRef z_src,
                            LLVMValueRef z_fb,
                            LLVMValueRef s_fb,
                            LLVMValueRef face,
                            LLVMValueRef *z_value,
                            LLVMValueRef *s_value,
                            boolean do_branch)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type z_type;
   struct lp_build_context z_bld;
   struct lp_build_context s_bld;
   struct lp_type s_type;
   unsigned z_shift = 0, z_width = 0, z_mask = 0;
   LLVMValueRef z_dst = NULL;
   LLVMValueRef stencil_vals = NULL;
   LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
   LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
   LLVMValueRef
      current_mask = lp_build_mask_value(mask);
   LLVMValueRef front_facing = NULL;
   boolean have_z, have_s;

   /*
    * Depths are expected to be between 0 and 1, even if they are stored in
    * floats. Setting these bits here will ensure that the lp_build_conv() call
    * below won't try to unnecessarily clamp the incoming values.
    */
   if(z_src_type.floating) {
      z_src_type.sign = FALSE;
      z_src_type.norm = TRUE;
   }
   else {
      assert(!z_src_type.sign);
      assert(z_src_type.norm);
   }

   /* Pick the type matching the depth-stencil format. */
   z_type = lp_depth_type(format_desc, z_src_type.length);

   /* Pick the intermediate type for depth operations. */
   z_type.width = z_src_type.width;
   assert(z_type.length == z_src_type.length);

   /* FIXME: for non-float depth/stencil might generate better code
    * if we'd always split it up to use 128bit operations.
    * For stencil we'd almost certainly want to pack to 8xi16 values,
    * for z just run twice.
    */

   /* Sanity checking */
   {
      const unsigned z_swizzle = format_desc->swizzle[0];
      const unsigned s_swizzle = format_desc->swizzle[1];

      assert(z_swizzle != PIPE_SWIZZLE_NONE ||
             s_swizzle != PIPE_SWIZZLE_NONE);

      assert(depth->enabled || stencil[0].enabled);

      assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
      assert(format_desc->block.width == 1);
      assert(format_desc->block.height == 1);

      if (stencil[0].enabled) {
         assert(s_swizzle < 4);
         assert(format_desc->channel[s_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
         assert(format_desc->channel[s_swizzle].pure_integer);
         assert(!format_desc->channel[s_swizzle].normalized);
         assert(format_desc->channel[s_swizzle].size == 8);
      }

      if (depth->enabled) {
         assert(z_swizzle < 4);
         if (z_type.floating) {
            assert(z_swizzle == 0);
            assert(format_desc->channel[z_swizzle].type ==
                   UTIL_FORMAT_TYPE_FLOAT);
            assert(format_desc->channel[z_swizzle].size == 32);
         }
         else {
            assert(format_desc->channel[z_swizzle].type ==
                   UTIL_FORMAT_TYPE_UNSIGNED);
            assert(format_desc->channel[z_swizzle].normalized);
            assert(!z_type.fixed);
         }
      }
   }

   /* Setup build context for Z vals */
   lp_build_context_init(&z_bld, gallivm, z_type);

   /* Setup build context for stencil vals */
   s_type = lp_int_type(z_type);
   lp_build_context_init(&s_bld, gallivm, s_type);

   /* Compute and apply the Z/stencil bitmasks and shifts.
    */
   {
      unsigned s_shift, s_mask;

      z_dst = z_fb;
      stencil_vals = s_fb;

      have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);
      have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask);

      if (have_z) {
         if (z_mask != 0xffffffff) {
            z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask);
         }

         /*
          * Align the framebuffer Z 's LSB to the right.
          */
         if (z_shift) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
            z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst");
         } else if (z_bitmask) {
            z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst");
         } else {
            lp_build_name(z_dst, "z_dst");
         }
      }

      if (have_s) {
         if (s_shift) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift);
            stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, "");
            stencil_shift = shift;  /* used below */
         }

         if (s_mask != 0xffffffff) {
            LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask);
            stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
         }

         lp_build_name(stencil_vals, "s_dst");
      }
   }

   if (stencil[0].enabled) {

      if (face) {
         if (0) {
            /*
             * XXX: the scalar expansion below produces atrocious code
             * (basically producing a 64bit scalar value, then moving the 2
             * 32bit pieces separately to simd, plus 4 shuffles, which is
             * seriously lame). But the scalar-simd transitions are always
             * tricky, so no big surprise there.
             * This here would be way better, however llvm has some serious
             * trouble later using it in the select, probably because it will
             * recognize the expression as constant and move the simd value
             * away (out of the loop) - and then it will suddenly try
             * constructing i1 high-bit masks out of it later...
             * (Try piglit stencil-twoside.)
             * Note this is NOT due to using SExt/Trunc, it fails exactly the
             * same even when using native compare/select.
             * I cannot reproduce this problem when using stand-alone compiler
             * though, suggesting some problem with optimization passes...
             * (With stand-alone compilation, the construction of this mask
             * value, no matter if the easy 3 instruction here or the complex
             * 16+ one below, never gets separated from where it's used.)
             * The scalar code still has the same problem, but the generated
             * code looks a bit better at least for some reason, even if
             * mostly by luck (the fundamental issue clearly is the same).
             */
            front_facing = lp_build_broadcast(gallivm, s_bld.vec_type, face);
            /* front_facing = face != 0 ? ~0 : 0 */
            front_facing = lp_build_compare(gallivm, s_bld.type,
                                            PIPE_FUNC_NOTEQUAL,
                                            front_facing, s_bld.zero);
         } else {
            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);

            /* front_facing = face != 0 ? ~0 : 0 */
            front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
            front_facing = LLVMBuildSExt(builder, front_facing,
                                         LLVMIntTypeInContext(gallivm->context,
                                                s_bld.type.length*s_bld.type.width),
                                         "");
            front_facing = LLVMBuildBitCast(builder, front_facing,
                                            s_bld.int_vec_type, "");
         }
      }

      /* pass/fail mask for the (possibly two-sided) stencil comparison */
      s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
                                          stencil_refs, stencil_vals,
                                          front_facing);

      /* apply stencil-fail operator */
      {
         /* fragments that are alive but failed the stencil test */
         LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, current_mask, s_pass_mask);
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
                                            stencil_refs, stencil_vals,
                                            s_fail_mask, front_facing);
      }
   }

   if (depth->enabled) {
      /*
       * Convert fragment Z to the desired type, aligning the LSB to the right.
       */

      assert(z_type.width == z_src_type.width);
      assert(z_type.length == z_src_type.length);
      assert(lp_check_value(z_src_type, z_src));
      if (z_src_type.floating) {
         /*
          * Convert from floating point values
          */

         if (!z_type.floating) {
            z_src = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                            z_src_type,
                                                            z_width,
                                                            z_src);
         }
      } else {
         /*
          * Convert from unsigned normalized values.
          */

         assert(!z_src_type.sign);
         assert(!z_src_type.fixed);
         assert(z_src_type.norm);
         assert(!z_type.floating);
         if (z_src_type.width > z_width) {
            /* drop the extra fractional bits to match the buffer's Z width */
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type,
                                                        z_src_type.width - z_width);
            z_src = LLVMBuildLShr(builder, z_src, shift, "");
         }
      }
      assert(lp_check_value(z_type, z_src));

      lp_build_name(z_src, "z_src");

      /* compare src Z to dst Z, returning 'pass' mask */
      z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);

      /* mask off bits that failed stencil test */
      if (s_pass_mask) {
         current_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");
      }

      if (!stencil[0].enabled) {
         /* We can potentially skip all remaining operations here, but only
          * if stencil is disabled because we still need to update the stencil
          * buffer values.  Don't need to update Z buffer values.
          */
         lp_build_mask_update(mask, z_pass);

         if (do_branch) {
            lp_build_mask_check(mask);
         }
      }

      if (depth->writemask) {
         LLVMValueRef z_pass_mask;

         /* mask off bits that failed Z test */
         z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");

         /* Mix the old and new Z buffer values.
          * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
          */
         z_dst = lp_build_select(&z_bld, z_pass_mask, z_src, z_dst);
      }

      if (stencil[0].enabled) {
         /* update stencil buffer values according to z pass/fail result */
         LLVMValueRef z_fail_mask, z_pass_mask;

         /* apply Z-fail operator */
         z_fail_mask = lp_build_andnot(&s_bld, current_mask, z_pass);
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
                                            stencil_refs, stencil_vals,
                                            z_fail_mask, front_facing);

         /* apply Z-pass operator */
         z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                            stencil_refs, stencil_vals,
                                            z_pass_mask, front_facing);
      }
   }
   else {
      /* No depth test: apply Z-pass operator to stencil buffer values which
       * passed the stencil test.
       */
      s_pass_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");
      stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                         stencil_refs, stencil_vals,
                                         s_pass_mask, front_facing);
   }

   /* Put Z and stencil bits in the right place */
   if (have_z && z_shift) {
      LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
      z_dst = LLVMBuildShl(builder, z_dst, shift, "");
   }

   if (stencil_vals && stencil_shift)
      stencil_vals = LLVMBuildShl(builder, stencil_vals,
                                  stencil_shift, "");

   /* Finally, merge the z/stencil values */
   if (format_desc->block.bits <= 32) {
      if (have_z && have_s)
         *z_value = LLVMBuildOr(builder, z_dst, stencil_vals, "");
      else if (have_z)
         *z_value = z_dst;
      else
         *z_value = stencil_vals;
      /* combined buffer: z and s outputs are the same packed vector */
      *s_value = *z_value;
   }
   else {
      *z_value = z_dst;
      *s_value = stencil_vals;
   }

   if (s_pass_mask)
      lp_build_mask_update(mask, s_pass_mask);

   if (depth->enabled && stencil[0].enabled)
      lp_build_mask_update(mask, z_pass);
}