diff options
Diffstat (limited to 'src/gallium/drivers/llvmpipe')
43 files changed, 2789 insertions, 1861 deletions
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile index 55b877b4ab9..08da2286b05 100644 --- a/src/gallium/drivers/llvmpipe/Makefile +++ b/src/gallium/drivers/llvmpipe/Makefile @@ -28,8 +28,6 @@ C_SOURCES = \ lp_scene_queue.c \ lp_screen.c \ lp_setup.c \ - lp_setup_coef.c \ - lp_setup_coef_intrin.c \ lp_setup_line.c \ lp_setup_point.c \ lp_setup_tri.c \ @@ -38,6 +36,7 @@ C_SOURCES = \ lp_state_clip.c \ lp_state_derived.c \ lp_state_fs.c \ + lp_state_setup.c \ lp_state_gs.c \ lp_state_rasterizer.c \ lp_state_sampler.c \ @@ -63,12 +62,12 @@ PROGS := lp_test_format \ # Need this for the lp_test_*.o files CLEAN_EXTRA = *.o +include ../../Makefile.template + lp_test_sincos.o : sse_mathfun.h PROGS_DEPS := ../../auxiliary/libgallium.a -include ../../Makefile.template - lp_tile_soa.c: lp_tile_soa.py ../../auxiliary/util/u_format_parse.py ../../auxiliary/util/u_format_pack.py ../../auxiliary/util/u_format.csv python lp_tile_soa.py ../../auxiliary/util/u_format.csv > $@ diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript index 650435f0f19..49950153a4f 100644 --- a/src/gallium/drivers/llvmpipe/SConscript +++ b/src/gallium/drivers/llvmpipe/SConscript @@ -27,13 +27,7 @@ env.Depends('lp_tile_soa.c', [ ]) -# Only enable SSSE3 for lp_tile_soa_sse3.c -ssse3_env = env.Clone() -if env['gcc'] \ - and distutils.version.LooseVersion(env['CCVERSION']) >= distutils.version.LooseVersion('4.3') \ - and env['machine'] in ('x86', 'x86_64') : - ssse3_env.Append(CCFLAGS = ['-mssse3']) -lp_tile_soa_os = ssse3_env.SharedObject('lp_tile_soa.c') +lp_tile_soa_os = env.SharedObject('lp_tile_soa.c') llvmpipe = env.ConvenienceLibrary( @@ -64,13 +58,12 @@ llvmpipe = env.ConvenienceLibrary( 'lp_setup_line.c', 'lp_setup_point.c', 'lp_setup_tri.c', - 'lp_setup_coef.c', - 'lp_setup_coef_intrin.c', 'lp_setup_vbuf.c', 'lp_state_blend.c', 'lp_state_clip.c', 'lp_state_derived.c', 'lp_state_fs.c', + 'lp_state_setup.c', 'lp_state_gs.c', 'lp_state_rasterizer.c', 'lp_state_sampler.c', diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c index e28efe778f9..e50643790c8 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c @@ -48,7 +48,8 @@ lp_build_alpha_test(LLVMBuilderRef builder, struct lp_type type, struct lp_build_mask_context *mask, LLVMValueRef alpha, - LLVMValueRef ref) + LLVMValueRef ref, + boolean do_branch) { struct lp_build_context bld; LLVMValueRef test; @@ -60,4 +61,7 @@ lp_build_alpha_test(LLVMBuilderRef builder, lp_build_name(test, "alpha_mask"); lp_build_mask_update(mask, test); + + if (do_branch) + lp_build_mask_check(mask); } diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h index 44603b418c0..27ca8aad4d4 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h +++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h @@ -48,7 +48,8 @@ lp_build_alpha_test(LLVMBuilderRef builder, struct lp_type type, struct lp_build_mask_context *mask, LLVMValueRef alpha, - LLVMValueRef ref); + LLVMValueRef ref, + boolean do_branch); #endif /* !LP_BLD_ALPHA_H */ diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c index 7561899a74e..7eb76d4fb31 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright 2009 VMware, Inc. + * Copyright 2009-2010 VMware, Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -53,15 +53,8 @@ * ... ... ... ... ... ... ... ... ... * * - * Stencil test: - * Two-sided stencil test is supported but probably not as efficient as - * it could be. Currently, we use if/then/else constructs to do the - * operations for front vs. back-facing polygons. We could probably do - * both the front and back arithmetic then use a Select() instruction to - * choose the result depending on polyon orientation. We'd have to - * measure performance both ways and see which is better. - * * @author Jose Fonseca <[email protected]> + * @author Brian Paul <[email protected]> */ #include "pipe/p_state.h" @@ -71,6 +64,7 @@ #include "gallivm/lp_bld_arit.h" #include "gallivm/lp_bld_bitarit.h" #include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_conv.h" #include "gallivm/lp_bld_logic.h" #include "gallivm/lp_bld_flow.h" #include "gallivm/lp_bld_intr.h" @@ -128,57 +122,32 @@ lp_build_stencil_test_single(struct lp_build_context *bld, /** * Do the one or two-sided stencil test comparison. * \sa lp_build_stencil_test_single - * \param face an integer indicating front (+) or back (-) facing polygon. - * If NULL, assume front-facing. + * \param front_facing an integer vector mask, indicating front (~0) or back + * (0) facing polygon. If NULL, assume front-facing. */ static LLVMValueRef lp_build_stencil_test(struct lp_build_context *bld, const struct pipe_stencil_state stencil[2], LLVMValueRef stencilRefs[2], LLVMValueRef stencilVals, - LLVMValueRef face) + LLVMValueRef front_facing) { LLVMValueRef res; assert(stencil[0].enabled); - if (stencil[1].enabled && face) { - /* do two-sided test */ - struct lp_build_flow_context *flow_ctx; - struct lp_build_if_state if_ctx; - LLVMValueRef front_facing; - LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0); - LLVMValueRef result = bld->undef; - - flow_ctx = lp_build_flow_create(bld->builder); - lp_build_flow_scope_begin(flow_ctx); + /* do front face test */ + res = lp_build_stencil_test_single(bld, &stencil[0], + stencilRefs[0], stencilVals); - lp_build_flow_scope_declare(flow_ctx, &result); + if (stencil[1].enabled && front_facing) { + /* do back face test */ + LLVMValueRef back_res; - /* front_facing = face > 0.0 */ - front_facing = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, zero, ""); - - lp_build_if(&if_ctx, flow_ctx, bld->builder, front_facing); - { - result = lp_build_stencil_test_single(bld, &stencil[0], - stencilRefs[0], stencilVals); - } - lp_build_else(&if_ctx); - { - result = lp_build_stencil_test_single(bld, &stencil[1], - stencilRefs[1], stencilVals); - } - lp_build_endif(&if_ctx); + back_res = lp_build_stencil_test_single(bld, &stencil[1], + stencilRefs[1], stencilVals); - lp_build_flow_scope_end(flow_ctx); - lp_build_flow_destroy(flow_ctx); - - res = result; - } - else { - /* do single-side test */ - res = lp_build_stencil_test_single(bld, &stencil[0], - stencilRefs[0], stencilVals); + res = lp_build_select(bld, front_facing, res, back_res); } return res; @@ -195,14 +164,12 @@ lp_build_stencil_op_single(struct lp_build_context *bld, const struct pipe_stencil_state *stencil, enum stencil_op op, LLVMValueRef stencilRef, - LLVMValueRef stencilVals, - LLVMValueRef mask) + LLVMValueRef stencilVals) { - const unsigned stencilMax = 255; /* XXX fix */ struct lp_type type = bld->type; LLVMValueRef res; - LLVMValueRef max = lp_build_const_int_vec(type, stencilMax); + LLVMValueRef max = lp_build_const_int_vec(type, 0xff); unsigned stencil_op; assert(type.sign); @@ -255,19 +222,7 @@ lp_build_stencil_op_single(struct lp_build_context *bld, break; default: assert(0 && "bad stencil op mode"); - res = NULL; - } - - if (stencil->writemask != stencilMax) { - /* mask &= stencil->writemask */ - LLVMValueRef writemask = lp_build_const_int_vec(type, stencil->writemask); - mask = LLVMBuildAnd(bld->builder, mask, writemask, ""); - /* res = (res & mask) | (stencilVals & ~mask) */ - res = lp_build_select_bitwise(bld, writemask, res, stencilVals); - } - else { - /* res = mask ? res : stencilVals */ - res = lp_build_select(bld, mask, res, stencilVals); + res = bld->undef; } return res; @@ -284,49 +239,40 @@ lp_build_stencil_op(struct lp_build_context *bld, LLVMValueRef stencilRefs[2], LLVMValueRef stencilVals, LLVMValueRef mask, - LLVMValueRef face) + LLVMValueRef front_facing) { - assert(stencil[0].enabled); - - if (stencil[1].enabled && face) { - /* do two-sided op */ - struct lp_build_flow_context *flow_ctx; - struct lp_build_if_state if_ctx; - LLVMValueRef front_facing; - LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0); - LLVMValueRef result = bld->undef; + LLVMValueRef res; - flow_ctx = lp_build_flow_create(bld->builder); - lp_build_flow_scope_begin(flow_ctx); + assert(stencil[0].enabled); - lp_build_flow_scope_declare(flow_ctx, &result); + /* do front face op */ + res = lp_build_stencil_op_single(bld, &stencil[0], op, + stencilRefs[0], stencilVals); - /* front_facing = face > 0.0 */ - front_facing = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, zero, ""); + if (stencil[1].enabled && front_facing) { + /* do back face op */ + LLVMValueRef back_res; - lp_build_if(&if_ctx, flow_ctx, bld->builder, front_facing); - { - result = lp_build_stencil_op_single(bld, &stencil[0], op, - stencilRefs[0], stencilVals, mask); - } - lp_build_else(&if_ctx); - { - result = lp_build_stencil_op_single(bld, &stencil[1], op, - stencilRefs[1], stencilVals, mask); - } - lp_build_endif(&if_ctx); + back_res = lp_build_stencil_op_single(bld, &stencil[1], op, + stencilRefs[1], stencilVals); - lp_build_flow_scope_end(flow_ctx); - lp_build_flow_destroy(flow_ctx); + res = lp_build_select(bld, front_facing, res, back_res); + } - return result; + if (stencil->writemask != 0xff) { + /* mask &= stencil->writemask */ + LLVMValueRef writemask = lp_build_const_int_vec(bld->type, stencil->writemask); + mask = LLVMBuildAnd(bld->builder, mask, writemask, ""); + /* res = (res & mask) | (stencilVals & ~mask) */ + res = lp_build_select_bitwise(bld, writemask, res, stencilVals); } else { - /* do single-sided op */ - return lp_build_stencil_op_single(bld, &stencil[0], op, - stencilRefs[0], stencilVals, mask); + /* res = mask ? res : stencilVals */ + res = lp_build_select(bld, mask, res, stencilVals); } + + return res; } @@ -358,8 +304,13 @@ lp_depth_type(const struct util_format_description *format_desc, } else if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) { assert(format_desc->block.bits <= 32); - if(format_desc->channel[swizzle].normalized) - type.norm = TRUE; + assert(format_desc->channel[swizzle].normalized); + if (format_desc->channel[swizzle].size < format_desc->block.bits) { + /* Prefer signed integers when possible, as SSE has less support + * for unsigned comparison; + */ + type.sign = TRUE; + } } else assert(0); @@ -381,7 +332,7 @@ lp_depth_type(const struct util_format_description *format_desc, */ static boolean get_z_shift_and_mask(const struct util_format_description *format_desc, - unsigned *shift, unsigned *mask) + unsigned *shift, unsigned *width, unsigned *mask) { const unsigned total_bits = format_desc->block.bits; unsigned z_swizzle; @@ -397,12 +348,14 @@ get_z_shift_and_mask(const struct util_format_description *format_desc, if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE) return FALSE; + *width = format_desc->channel[z_swizzle].size; + padding_right = 0; for (chan = 0; chan < z_swizzle; ++chan) padding_right += format_desc->channel[chan].size; padding_left = - total_bits - (padding_right + format_desc->channel[z_swizzle].size); + total_bits - (padding_right + *width); if (padding_left || padding_right) { unsigned long long mask_left = (1ULL << (total_bits - padding_left)) - 1; @@ -413,7 +366,7 @@ get_z_shift_and_mask(const struct util_format_description *format_desc, *mask = 0xffffffff; } - *shift = padding_left; + *shift = padding_right; return TRUE; } @@ -457,7 +410,7 @@ get_s_shift_and_mask(const struct util_format_description *format_desc, * \param maskvalue is the depth test mask. * \param counter is a pointer of the uint32 counter. */ -static void +void lp_build_occlusion_count(LLVMBuilderRef builder, struct lp_type type, LLVMValueRef maskvalue, @@ -494,31 +447,57 @@ lp_build_occlusion_count(LLVMBuilderRef builder, * \param format_desc description of the depth/stencil surface * \param mask the alive/dead pixel mask for the quad (vector) * \param stencil_refs the front/back stencil ref values (scalar) - * \param z_src the incoming depth/stencil values (a 2x2 quad) + * \param z_src the incoming depth/stencil values (a 2x2 quad, float32) * \param zs_dst_ptr pointer to depth/stencil values in framebuffer - * \param facing contains float value indicating front/back facing polygon + * \param facing contains boolean value indicating front/back facing polygon */ void lp_build_depth_stencil_test(LLVMBuilderRef builder, const struct pipe_depth_state *depth, const struct pipe_stencil_state stencil[2], - struct lp_type type, + struct lp_type z_src_type, const struct util_format_description *format_desc, struct lp_build_mask_context *mask, LLVMValueRef stencil_refs[2], LLVMValueRef z_src, LLVMValueRef zs_dst_ptr, LLVMValueRef face, - LLVMValueRef counter) + LLVMValueRef *zs_value, + boolean do_branch) { - struct lp_build_context bld; - struct lp_build_context sbld; + struct lp_type z_type; + struct lp_build_context z_bld; + struct lp_build_context s_bld; struct lp_type s_type; + unsigned z_shift = 0, z_width = 0, z_mask = 0; LLVMValueRef zs_dst, z_dst = NULL; LLVMValueRef stencil_vals = NULL; LLVMValueRef z_bitmask = NULL, stencil_shift = NULL; LLVMValueRef z_pass = NULL, s_pass_mask = NULL; - LLVMValueRef orig_mask = mask->value; + LLVMValueRef orig_mask = lp_build_mask_value(mask); + LLVMValueRef front_facing = NULL; + + + /* + * Depths are expected to be between 0 and 1, even if they are stored in + * floats. Setting these bits here will ensure that the lp_build_conv() call + * below won't try to unnecessarily clamp the incoming values. + */ + if(z_src_type.floating) { + z_src_type.sign = FALSE; + z_src_type.norm = TRUE; + } + else { + assert(!z_src_type.sign); + assert(z_src_type.norm); + } + + /* Pick the depth type. */ + z_type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length); + + /* FIXME: Cope with a depth test type with a different bit width. */ + assert(z_type.width == z_src_type.width); + assert(z_type.length == z_src_type.length); /* Sanity checking */ { @@ -540,8 +519,8 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, } assert(z_swizzle < 4); - assert(format_desc->block.bits == type.width); - if (type.floating) { + assert(format_desc->block.bits == z_type.width); + if (z_type.floating) { assert(z_swizzle == 0); assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT); @@ -552,54 +531,56 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED); assert(format_desc->channel[z_swizzle].normalized); - assert(!type.fixed); - assert(!type.sign); - assert(type.norm); + assert(!z_type.fixed); } } /* Setup build context for Z vals */ - lp_build_context_init(&bld, builder, type); + lp_build_context_init(&z_bld, builder, z_type); /* Setup build context for stencil vals */ - s_type = lp_type_int_vec(type.width); - lp_build_context_init(&sbld, builder, s_type); + s_type = lp_type_int_vec(z_type.width); + lp_build_context_init(&s_bld, builder, s_type); /* Load current z/stencil value from z/stencil buffer */ + zs_dst_ptr = LLVMBuildBitCast(builder, + zs_dst_ptr, + LLVMPointerType(z_bld.vec_type, 0), ""); zs_dst = LLVMBuildLoad(builder, zs_dst_ptr, ""); - lp_build_name(zs_dst, "zsbufval"); + lp_build_name(zs_dst, "zs_dst"); /* Compute and apply the Z/stencil bitmasks and shifts. */ { - unsigned z_shift, z_mask; unsigned s_shift, s_mask; - if (get_z_shift_and_mask(format_desc, &z_shift, &z_mask)) { - if (z_shift) { - LLVMValueRef shift = lp_build_const_int_vec(type, z_shift); - z_src = LLVMBuildLShr(builder, z_src, shift, ""); - } - + if (get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask)) { if (z_mask != 0xffffffff) { - LLVMValueRef mask = lp_build_const_int_vec(type, z_mask); - z_src = LLVMBuildAnd(builder, z_src, mask, ""); - z_dst = LLVMBuildAnd(builder, zs_dst, mask, ""); - z_bitmask = mask; /* used below */ + z_bitmask = lp_build_const_int_vec(z_type, z_mask); } - else { + + /* + * Align the framebuffer Z 's LSB to the right. + */ + if (z_shift) { + LLVMValueRef shift = lp_build_const_int_vec(z_type, z_shift); + z_dst = LLVMBuildLShr(builder, zs_dst, shift, "z_dst"); + } else if (z_bitmask) { + /* TODO: Instead of loading a mask from memory and ANDing, it's + * probably faster to just shake the bits with two shifts. */ + z_dst = LLVMBuildAnd(builder, zs_dst, z_bitmask, "z_dst"); + } else { z_dst = zs_dst; + lp_build_name(z_dst, "z_dst"); } - - lp_build_name(z_dst, "zsbuf.z"); } if (get_s_shift_and_mask(format_desc, &s_shift, &s_mask)) { if (s_shift) { - LLVMValueRef shift = lp_build_const_int_vec(type, s_shift); + LLVMValueRef shift = lp_build_const_int_vec(s_type, s_shift); stencil_vals = LLVMBuildLShr(builder, zs_dst, shift, ""); stencil_shift = shift; /* used below */ } @@ -608,35 +589,85 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, } if (s_mask != 0xffffffff) { - LLVMValueRef mask = lp_build_const_int_vec(type, s_mask); + LLVMValueRef mask = lp_build_const_int_vec(s_type, s_mask); stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, ""); } - lp_build_name(stencil_vals, "stencil"); + lp_build_name(stencil_vals, "s_dst"); } } - if (stencil[0].enabled) { + + if (face) { + LLVMValueRef zero = LLVMConstInt(LLVMInt32Type(), 0, 0); + + /* front_facing = face != 0 ? ~0 : 0 */ + front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, ""); + front_facing = LLVMBuildSExt(builder, front_facing, + LLVMIntType(s_bld.type.length*s_bld.type.width), + ""); + front_facing = LLVMBuildBitCast(builder, front_facing, + s_bld.int_vec_type, ""); + } + /* convert scalar stencil refs into vectors */ - stencil_refs[0] = lp_build_broadcast_scalar(&bld, stencil_refs[0]); - stencil_refs[1] = lp_build_broadcast_scalar(&bld, stencil_refs[1]); + stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]); + stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]); - s_pass_mask = lp_build_stencil_test(&sbld, stencil, - stencil_refs, stencil_vals, face); + s_pass_mask = lp_build_stencil_test(&s_bld, stencil, + stencil_refs, stencil_vals, + front_facing); /* apply stencil-fail operator */ { - LLVMValueRef s_fail_mask = lp_build_andnot(&bld, orig_mask, s_pass_mask); - stencil_vals = lp_build_stencil_op(&sbld, stencil, S_FAIL_OP, + LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, orig_mask, s_pass_mask); + stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP, stencil_refs, stencil_vals, - s_fail_mask, face); + s_fail_mask, front_facing); } } if (depth->enabled) { + /* + * Convert fragment Z to the desired type, aligning the LSB to the right. + */ + + assert(z_type.width == z_src_type.width); + assert(z_type.length == z_src_type.length); + assert(lp_check_value(z_src_type, z_src)); + if (z_src_type.floating) { + /* + * Convert from floating point values + */ + + if (!z_type.floating) { + z_src = lp_build_clamped_float_to_unsigned_norm(builder, + z_src_type, + z_width, + z_src); + } + } else { + /* + * Convert from unsigned normalized values. + */ + + assert(!z_src_type.sign); + assert(!z_src_type.fixed); + assert(z_src_type.norm); + assert(!z_type.floating); + if (z_src_type.width > z_width) { + LLVMValueRef shift = lp_build_const_int_vec(z_src_type, + z_src_type.width - z_width); + z_src = LLVMBuildLShr(builder, z_src, shift, ""); + } + } + assert(lp_check_value(z_type, z_src)); + + lp_build_name(z_src, "z_src"); + /* compare src Z to dst Z, returning 'pass' mask */ - z_pass = lp_build_cmp(&bld, depth->func, z_src, z_dst); + z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst); if (!stencil[0].enabled) { /* We can potentially skip all remaining operations here, but only @@ -644,28 +675,28 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, * buffer values. Don't need to update Z buffer values. */ lp_build_mask_update(mask, z_pass); + + if (do_branch) { + lp_build_mask_check(mask); + do_branch = FALSE; + } } if (depth->writemask) { - LLVMValueRef zselectmask = mask->value; + LLVMValueRef zselectmask; /* mask off bits that failed Z test */ - zselectmask = LLVMBuildAnd(builder, zselectmask, z_pass, ""); + zselectmask = LLVMBuildAnd(builder, orig_mask, z_pass, ""); /* mask off bits that failed stencil test */ if (s_pass_mask) { zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, ""); } - /* if combined Z/stencil format, mask off the stencil bits */ - if (z_bitmask) { - zselectmask = LLVMBuildAnd(builder, zselectmask, z_bitmask, ""); - } - /* Mix the old and new Z buffer values. - * z_dst[i] = (zselectmask[i] & z_src[i]) | (~zselectmask[i] & z_dst[i]) + * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i] */ - z_dst = lp_build_select_bitwise(&bld, zselectmask, z_src, z_dst); + z_dst = lp_build_select(&z_bld, zselectmask, z_src, z_dst); } if (stencil[0].enabled) { @@ -673,33 +704,35 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, LLVMValueRef z_fail_mask, z_pass_mask; /* apply Z-fail operator */ - z_fail_mask = lp_build_andnot(&bld, orig_mask, z_pass); - stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_FAIL_OP, + z_fail_mask = lp_build_andnot(&z_bld, orig_mask, z_pass); + stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP, stencil_refs, stencil_vals, - z_fail_mask, face); + z_fail_mask, front_facing); /* apply Z-pass operator */ - z_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, z_pass, ""); - stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP, + z_pass_mask = LLVMBuildAnd(z_bld.builder, orig_mask, z_pass, ""); + stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP, stencil_refs, stencil_vals, - z_pass_mask, face); + z_pass_mask, front_facing); } } else { /* No depth test: apply Z-pass operator to stencil buffer values which * passed the stencil test. */ - s_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, s_pass_mask, ""); - stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP, + s_pass_mask = LLVMBuildAnd(s_bld.builder, orig_mask, s_pass_mask, ""); + stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP, stencil_refs, stencil_vals, - s_pass_mask, face); + s_pass_mask, front_facing); } - /* The Z bits are already in the right place but we may need to shift the - * stencil bits before ORing Z with Stencil to make the final pixel value. - */ + /* Put Z and ztencil bits in the right place */ + if (z_dst && z_shift) { + LLVMValueRef shift = lp_build_const_int_vec(z_type, z_shift); + z_dst = LLVMBuildShl(builder, z_dst, shift, ""); + } if (stencil_vals && stencil_shift) - stencil_vals = LLVMBuildShl(bld.builder, stencil_vals, + stencil_vals = LLVMBuildShl(s_bld.builder, stencil_vals, stencil_shift, ""); /* Finally, merge/store the z/stencil values */ @@ -707,13 +740,13 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, (stencil[0].enabled && stencil[0].writemask)) { if (z_dst && stencil_vals) - zs_dst = LLVMBuildOr(bld.builder, z_dst, stencil_vals, ""); + zs_dst = LLVMBuildOr(z_bld.builder, z_dst, stencil_vals, ""); else if (z_dst) zs_dst = z_dst; else zs_dst = stencil_vals; - LLVMBuildStore(builder, zs_dst, zs_dst_ptr); + *zs_value = zs_dst; } if (s_pass_mask) @@ -722,6 +755,47 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, if (depth->enabled && stencil[0].enabled) lp_build_mask_update(mask, z_pass); - if (counter) - lp_build_occlusion_count(builder, type, mask->value, counter); + if (do_branch) + lp_build_mask_check(mask); + +} + + +void +lp_build_depth_write(LLVMBuilderRef builder, + const struct util_format_description *format_desc, + LLVMValueRef zs_dst_ptr, + LLVMValueRef zs_value) +{ + zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, + LLVMPointerType(LLVMTypeOf(zs_value), 0), ""); + + LLVMBuildStore(builder, zs_value, zs_dst_ptr); +} + + +void +lp_build_deferred_depth_write(LLVMBuilderRef builder, + struct lp_type z_src_type, + const struct util_format_description *format_desc, + struct lp_build_mask_context *mask, + LLVMValueRef zs_dst_ptr, + LLVMValueRef zs_value) +{ + struct lp_type z_type; + struct lp_build_context z_bld; + LLVMValueRef z_dst; + + /* XXX: pointlessly redo type logic: + */ + z_type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length); + lp_build_context_init(&z_bld, builder, z_type); + + zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, + LLVMPointerType(z_bld.vec_type, 0), ""); + + z_dst = LLVMBuildLoad(builder, zs_dst_ptr, "zsbufval"); + z_dst = lp_build_select(&z_bld, lp_build_mask_value(mask), zs_value, z_dst); + + LLVMBuildStore(builder, z_dst, zs_dst_ptr); } diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h index e257a5bd7d0..a54ef3a711e 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h +++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h @@ -61,7 +61,27 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, LLVMValueRef zs_src, LLVMValueRef zs_dst_ptr, LLVMValueRef facing, - LLVMValueRef counter); + LLVMValueRef *zs_value, + boolean do_branch); +void +lp_build_depth_write(LLVMBuilderRef builder, + const struct util_format_description *format_desc, + LLVMValueRef zs_dst_ptr, + LLVMValueRef zs_value); + +void +lp_build_deferred_depth_write(LLVMBuilderRef builder, + struct lp_type z_src_type, + const struct util_format_description *format_desc, + struct lp_build_mask_context *mask, + LLVMValueRef zs_dst_ptr, + LLVMValueRef zs_value); + +void +lp_build_occlusion_count(LLVMBuilderRef builder, + struct lp_type type, + LLVMValueRef maskvalue, + LLVMValueRef counter); #endif /* !LP_BLD_DEPTH_H */ diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c index 2a374f8c390..c9da8900d0c 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c @@ -206,7 +206,7 @@ coeffs_init(struct lp_build_interp_soa_context *bld, dadq2 = LLVMBuildFAdd(builder, dadq, dadq, ""); /* - * a = a0 + x * dadx + y * dady + * a = a0 + (x * dadx + y * dady) */ if (attrib == 0 && chan == 0) { @@ -219,11 +219,11 @@ coeffs_init(struct lp_build_interp_soa_context *bld, a = a0; if (interp != LP_INTERP_CONSTANT && interp != LP_INTERP_FACING) { - LLVMValueRef tmp; - tmp = LLVMBuildFMul(builder, bld->x, dadx, ""); - a = LLVMBuildFAdd(builder, a, tmp, ""); - tmp = LLVMBuildFMul(builder, bld->y, dady, ""); - a = LLVMBuildFAdd(builder, a, tmp, ""); + LLVMValueRef ax, ay, axy; + ax = LLVMBuildFMul(builder, bld->x, dadx, ""); + ay = LLVMBuildFMul(builder, bld->y, dady, ""); + axy = LLVMBuildFAdd(builder, ax, ay, ""); + a = LLVMBuildFAdd(builder, a, axy, ""); } } @@ -272,7 +272,10 @@ coeffs_init(struct lp_build_interp_soa_context *bld, * This is called when we move from one quad to the next. */ static void -attribs_update(struct lp_build_interp_soa_context *bld, int quad_index) +attribs_update(struct lp_build_interp_soa_context *bld, + int quad_index, + int start, + int end) { struct lp_build_context *coeff_bld = &bld->coeff_bld; LLVMValueRef shuffle = lp_build_const_int_vec(coeff_bld->type, quad_index); @@ -282,7 +285,7 @@ attribs_update(struct lp_build_interp_soa_context *bld, int quad_index) assert(quad_index < 4); - for(attrib = 0; attrib < bld->num_attribs; ++attrib) { + for(attrib = start; attrib < end; ++attrib) { const unsigned mask = bld->mask[attrib]; const unsigned interp = bld->interp[attrib]; for(chan = 0; chan < NUM_CHANNELS; ++chan) { @@ -350,6 +353,14 @@ attribs_update(struct lp_build_interp_soa_context *bld, int quad_index) } #endif + if (attrib == 0 && chan == 2) { + /* FIXME: Depth values can exceed 1.0, due to the fact that + * setup interpolation coefficients refer to (0,0) which causes + * precision loss. So we must clamp to 1.0 here to avoid artifacts + */ + a = lp_build_min(coeff_bld, a, coeff_bld->one); + } + attrib_name(a, attrib, chan, ""); } bld->attribs[attrib][chan] = a; @@ -434,8 +445,6 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, pos_init(bld, x0, y0); coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr); - - attribs_update(bld, 0); } @@ -443,10 +452,20 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, * Advance the position and inputs to the given quad within the block. */ void -lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld, - int quad_index) +lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld, + int quad_index) +{ + assert(quad_index < 4); + + attribs_update(bld, quad_index, 1, bld->num_attribs); +} + +void +lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld, + int quad_index) { assert(quad_index < 4); - attribs_update(bld, quad_index); + attribs_update(bld, quad_index, 0, 1); } + diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h index 3054030f739..a7ebdd1bfa2 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h +++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h @@ -46,7 +46,31 @@ #include "tgsi/tgsi_exec.h" -#include "lp_setup.h" +/** + * Describes how to compute the interpolation coefficients (a0, dadx, dady) + * from the vertices passed into our triangle/line/point functions by the + * draw module. + * + * Vertices are treated as an array of float[4] values, indexed by + * src_index. + * + * LP_INTERP_COLOR is translated to either LP_INTERP_CONSTANT or + * LINEAR depending on flatshade state. + */ +enum lp_interp { + LP_INTERP_CONSTANT, + LP_INTERP_COLOR, + LP_INTERP_LINEAR, + LP_INTERP_PERSPECTIVE, + LP_INTERP_POSITION, + LP_INTERP_FACING +}; + +struct lp_shader_input { + ushort interp:4; /* enum lp_interp */ + ushort usage_mask:4; /* bitmask of TGSI_WRITEMASK_x flags */ + ushort src_index:8; /* where to find values in incoming vertices */ +}; struct lp_build_interp_soa_context @@ -89,7 +113,11 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, LLVMValueRef y); void -lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld, +lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld, + int quad_index); + +void +lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld, int quad_index); diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c index 39f2c6085ef..763432ed712 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.c +++ b/src/gallium/drivers/llvmpipe/lp_context.c @@ -82,6 +82,8 @@ static void llvmpipe_destroy( struct pipe_context *pipe ) } } + lp_delete_setup_variants(llvmpipe); + align_free( llvmpipe ); } @@ -108,6 +110,7 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv ) memset(llvmpipe, 0, sizeof *llvmpipe); make_empty_list(&llvmpipe->fs_variants_list); + make_empty_list(&llvmpipe->setup_variants_list); llvmpipe->pipe.winsys = screen->winsys; llvmpipe->pipe.screen = screen; diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h index 34fa20e204a..db09c95b272 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.h +++ b/src/gallium/drivers/llvmpipe/lp_context.h @@ -39,6 +39,7 @@ #include "lp_jit.h" #include "lp_setup.h" #include "lp_state_fs.h" +#include "lp_state_setup.h" struct llvmpipe_vbuf_render; @@ -48,6 +49,7 @@ struct lp_fragment_shader; struct lp_vertex_shader; struct lp_blend_state; struct lp_setup_context; +struct lp_setup_variant; struct lp_velems_state; struct llvmpipe_context { @@ -105,12 +107,9 @@ struct llvmpipe_context { /** Which vertex shader output slot contains point size */ int psize_slot; - /** Fragment shader input interpolation info */ - unsigned num_inputs; - struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; - /** The tiling engine */ struct lp_setup_context *setup; + struct lp_setup_variant setup_variant; /** The primitive drawing context */ struct draw_context *draw; @@ -120,6 +119,9 @@ struct llvmpipe_context { struct lp_fs_variant_list_item fs_variants_list; unsigned nr_fs_variants; + + struct lp_setup_variant_list_item setup_variants_list; + unsigned nr_setup_variants; }; diff --git a/src/gallium/drivers/llvmpipe/lp_flush.h b/src/gallium/drivers/llvmpipe/lp_flush.h index bb538b2bd83..3626ce4a86c 100644 --- a/src/gallium/drivers/llvmpipe/lp_flush.h +++ b/src/gallium/drivers/llvmpipe/lp_flush.h @@ -32,6 +32,7 @@ struct pipe_context; struct pipe_fence_handle; +struct pipe_resource; void llvmpipe_flush(struct pipe_context *pipe, diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c index 04b12dedccf..c540f9b3628 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.c +++ b/src/gallium/drivers/llvmpipe/lp_jit.c @@ -36,7 +36,6 @@ #include <llvm-c/Transforms/Scalar.h> #include "util/u_memory.h" -#include "util/u_cpu_detect.h" #include "gallivm/lp_bld_init.h" #include "gallivm/lp_bld_debug.h" #include "lp_screen.h" @@ -162,9 +161,6 @@ lp_jit_init_globals(struct llvmpipe_screen *screen) void lp_jit_screen_cleanup(struct llvmpipe_screen *screen) { - if(screen->engine) - LLVMDisposeExecutionEngine(screen->engine); - if(screen->pass) LLVMDisposePassManager(screen->pass); } @@ -190,13 +186,7 @@ lp_jit_screen_init(struct llvmpipe_screen *screen) LLVMAddCFGSimplificationPass(screen->pass); LLVMAddPromoteMemoryToRegisterPass(screen->pass); LLVMAddConstantPropagationPass(screen->pass); - if(util_cpu_caps.has_sse4_1) { - /* FIXME: There is a bug in this pass, whereby the combination of fptosi - * and sitofp (necessary for trunc/floor/ceil/round implementation) - * somehow becomes invalid code. - */ - LLVMAddInstructionCombiningPass(screen->pass); - } + LLVMAddInstructionCombiningPass(screen->pass); LLVMAddGVNPass(screen->pass); } else { /* We need at least this pass to prevent the backends to fail in diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h index 16e04fce0cd..114f21f2d16 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.h +++ b/src/gallium/drivers/llvmpipe/lp_jit.h @@ -144,7 +144,7 @@ typedef void (*lp_jit_frag_func)(const struct lp_jit_context *context, uint32_t x, uint32_t y, - float facing, + uint32_t facing, const void *a0, const void *dadx, const void *dady, diff --git a/src/gallium/drivers/llvmpipe/lp_limits.h b/src/gallium/drivers/llvmpipe/lp_limits.h index d1c431475d8..2538164ffaa 100644 --- a/src/gallium/drivers/llvmpipe/lp_limits.h +++ b/src/gallium/drivers/llvmpipe/lp_limits.h @@ -72,4 +72,14 @@ */ #define LP_MAX_SHADER_VARIANTS 1024 +/** + * Max number of setup variants that will be kept around. + * + * These are determined by the combination of the fragment shader + * input signature and a small amount of rasterization state (eg + * flatshading). It is likely that many active fragment shaders will + * share the same setup variant. + */ +#define LP_MAX_SETUP_VARIANTS 64 + #endif /* LP_LIMITS_H */ diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index d7e6415e139..d358a983943 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -211,8 +211,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { const struct lp_scene *scene = task->scene; - unsigned clear_value = arg.clear_zstencil.value; - unsigned clear_mask = arg.clear_zstencil.mask; + uint32_t clear_value = arg.clear_zstencil.value; + uint32_t clear_mask = arg.clear_zstencil.mask; const unsigned height = TILE_SIZE / TILE_VECTOR_HEIGHT; const unsigned width = TILE_SIZE * TILE_VECTOR_HEIGHT; const unsigned block_size = scene->zsbuf.blocksize; @@ -220,7 +220,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task, uint8_t *dst; unsigned i, j; - LP_DBG(DEBUG_RAST, "%s 0x%x%x\n", __FUNCTION__, clear_value, clear_mask); + LP_DBG(DEBUG_RAST, "%s: value=0x%08x, mask=0x%08x\n", + __FUNCTION__, clear_value, clear_mask); /* * Clear the aera of the swizzled depth/depth buffer matching this tile, in @@ -232,16 +233,31 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task, dst = task->depth_tile; + clear_value &= clear_mask; + switch (block_size) { case 1: + assert(clear_mask == 0xff); memset(dst, (uint8_t) clear_value, height * width); break; case 2: - for (i = 0; i < height; i++) { - uint16_t *row = (uint16_t *)dst; - for (j = 0; j < width; j++) - *row++ = (uint16_t) clear_value; - dst += dst_stride; + if (clear_mask == 0xffff) { + for (i = 0; i < height; i++) { + uint16_t *row = (uint16_t *)dst; + for (j = 0; j < width; j++) + *row++ = (uint16_t) clear_value; + dst += dst_stride; + } + } + else { + for (i = 0; i < height; i++) { + uint16_t *row = (uint16_t *)dst; + for (j = 0; j < width; j++) { + uint16_t tmp = ~clear_mask & *row; + *row++ = clear_value | tmp; + } + dst += dst_stride; + } } break; case 4: @@ -258,7 +274,7 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task, uint32_t *row = (uint32_t *)dst; for (j = 0; j < width; j++) { uint32_t tmp = ~clear_mask & *row; - *row++ = (clear_value & clear_mask) | tmp; + *row++ = clear_value | tmp; } dst += dst_stride; } @@ -318,7 +334,7 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task, { const struct lp_scene *scene = task->scene; const struct lp_rast_shader_inputs *inputs = arg.shade_tile; - const struct lp_rast_state *state = inputs->state; + const struct lp_rast_state *state = task->state; struct lp_fragment_shader_variant *variant = state->variant; const unsigned tile_x = task->x, tile_y = task->y; unsigned x, y; @@ -349,10 +365,10 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task, BEGIN_JIT_CALL(state); variant->jit_function[RAST_WHOLE]( &state->jit_context, tile_x + x, tile_y + y, - inputs->facing, - inputs->a0, - inputs->dadx, - inputs->dady, + inputs->frontfacing, + GET_A0(inputs), + GET_DADX(inputs), + GET_DADY(inputs), color, depth, 0xffff, @@ -398,7 +414,7 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, unsigned x, unsigned y, unsigned mask) { - const struct lp_rast_state *state = inputs->state; + const struct lp_rast_state *state = task->state; struct lp_fragment_shader_variant *variant = state->variant; const struct lp_scene *scene = task->scene; uint8_t *color[PIPE_MAX_COLOR_BUFS]; @@ -430,10 +446,10 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, BEGIN_JIT_CALL(state); variant->jit_function[RAST_EDGE_TEST](&state->jit_context, x, y, - inputs->facing, - inputs->a0, - inputs->dadx, - inputs->dady, + inputs->frontfacing, + GET_A0(inputs), + GET_DADX(inputs), + GET_DADY(inputs), color, depth, mask, @@ -474,6 +490,14 @@ lp_rast_end_query(struct lp_rasterizer_task *task, } +void +lp_rast_set_state(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + task->state = arg.state; +} + + /** * Set top row and left column of the tile's pixels to white. For debugging. @@ -581,10 +605,12 @@ static lp_rast_cmd_func dispatch[LP_RAST_OP_MAX] = lp_rast_triangle_8, lp_rast_triangle_3_4, lp_rast_triangle_3_16, + lp_rast_triangle_4_16, lp_rast_shade_tile, lp_rast_shade_tile_opaque, lp_rast_begin_query, lp_rast_end_query, + lp_rast_set_state, }; diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index c55b97a9d13..a64c152cf83 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -78,30 +78,28 @@ struct lp_rast_state { * These pointers point into the bin data buffer. */ struct lp_rast_shader_inputs { - float facing; /** Positive for front-facing, negative for back-facing */ - unsigned disable:1; /** Partially binned, disable this command */ - unsigned opaque:1; /** Is opaque */ - - float (*a0)[4]; - float (*dadx)[4]; - float (*dady)[4]; - - const struct lp_rast_state *state; + unsigned frontfacing:1; /** True for front-facing */ + unsigned disable:1; /** Partially binned, disable this command */ + unsigned opaque:1; /** Is opaque */ + unsigned pad0:29; /* wasted space */ + unsigned stride; /* how much to advance data between a0, dadx, dady */ + unsigned pad2; /* wasted space */ + unsigned pad3; /* wasted space */ + /* followed by a0, dadx, dady and planes[] */ }; - +/* Note: the order of these values is important as they are loaded by + * sse code in rasterization: + */ struct lp_rast_plane { - /* one-pixel sized trivial accept offsets for each plane */ - int ei; - - /* one-pixel sized trivial reject offsets for each plane */ - int eo; - /* edge function values at minx,miny ?? */ int c; int dcdx; int dcdy; + + /* one-pixel sized trivial reject offsets for each plane */ + int eo; }; /** @@ -111,17 +109,24 @@ struct lp_rast_plane { * Objects of this type are put into the lp_setup_context::data buffer. */ struct lp_rast_triangle { - /* inputs for the shader */ - struct lp_rast_shader_inputs inputs; - #ifdef DEBUG float v[3][2]; + float pad0; + float pad1; #endif - struct lp_rast_plane plane[8]; /* NOTE: may allocate fewer planes */ + /* inputs for the shader */ + struct lp_rast_shader_inputs inputs; + /* planes are also allocated here */ }; +#define GET_A0(inputs) ((float (*)[4])((inputs)+1)) +#define GET_DADX(inputs) ((float (*)[4])((char *)((inputs) + 1) + (inputs)->stride)) +#define GET_DADY(inputs) ((float (*)[4])((char *)((inputs) + 1) + 2 * (inputs)->stride)) +#define GET_PLANES(tri) ((struct lp_rast_plane *)((char *)(&(tri)->inputs + 1) + 3 * (tri)->inputs.stride)) + + struct lp_rasterizer * lp_rast_create( unsigned num_threads ); @@ -149,9 +154,10 @@ union lp_rast_cmd_arg { const struct lp_rast_state *set_state; uint8_t clear_color[4]; struct { - unsigned value; - unsigned mask; + uint32_t value; + uint32_t mask; } clear_zstencil; + const struct lp_rast_state *state; struct lp_fence *fence; struct llvmpipe_query *query_obj; }; @@ -238,12 +244,14 @@ lp_rast_arg_null( void ) #define LP_RAST_OP_TRIANGLE_8 0x9 #define LP_RAST_OP_TRIANGLE_3_4 0xa #define LP_RAST_OP_TRIANGLE_3_16 0xb -#define LP_RAST_OP_SHADE_TILE 0xc -#define LP_RAST_OP_SHADE_TILE_OPAQUE 0xd -#define LP_RAST_OP_BEGIN_QUERY 0xe -#define LP_RAST_OP_END_QUERY 0xf - -#define LP_RAST_OP_MAX 0x10 +#define LP_RAST_OP_TRIANGLE_4_16 0xc +#define LP_RAST_OP_SHADE_TILE 0xd +#define LP_RAST_OP_SHADE_TILE_OPAQUE 0xe +#define LP_RAST_OP_BEGIN_QUERY 0xf +#define LP_RAST_OP_END_QUERY 0x10 +#define LP_RAST_OP_SET_STATE 0x11 + +#define LP_RAST_OP_MAX 0x12 #define LP_RAST_OP_MASK 0xff void diff --git a/src/gallium/drivers/llvmpipe/lp_rast_debug.c b/src/gallium/drivers/llvmpipe/lp_rast_debug.c index 9fc78645a3a..64ac616f629 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_debug.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_debug.c @@ -12,6 +12,7 @@ static INLINE int u_bit_scan(unsigned *mask) struct tile { int coverage; int overdraw; + const struct lp_rast_state *state; char data[TILE_SIZE][TILE_SIZE]; }; @@ -42,10 +43,12 @@ static const char *cmd_names[LP_RAST_OP_MAX] = "triangle_8", "triangle_3_4", "triangle_3_16", + "triangle_4_16", "shade_tile", "shade_tile_opaque", "begin_query", "end_query", + "set_state", }; static const char *cmd_name(unsigned cmd) @@ -55,31 +58,31 @@ static const char *cmd_name(unsigned cmd) } static const struct lp_fragment_shader_variant * -get_variant( const struct cmd_block *block, - int k ) +get_variant( const struct lp_rast_state *state, + const struct cmd_block *block, + int k ) { if (block->cmd[k] == LP_RAST_OP_SHADE_TILE || - block->cmd[k] == LP_RAST_OP_SHADE_TILE_OPAQUE) - return block->arg[k].shade_tile->state->variant; - - if (block->cmd[k] == LP_RAST_OP_TRIANGLE_1 || + block->cmd[k] == LP_RAST_OP_SHADE_TILE_OPAQUE || + block->cmd[k] == LP_RAST_OP_TRIANGLE_1 || block->cmd[k] == LP_RAST_OP_TRIANGLE_2 || block->cmd[k] == LP_RAST_OP_TRIANGLE_3 || block->cmd[k] == LP_RAST_OP_TRIANGLE_4 || block->cmd[k] == LP_RAST_OP_TRIANGLE_5 || block->cmd[k] == LP_RAST_OP_TRIANGLE_6 || block->cmd[k] == LP_RAST_OP_TRIANGLE_7) - return block->arg[k].triangle.tri->inputs.state->variant; + return state->variant; return NULL; } static boolean -is_blend( const struct cmd_block *block, +is_blend( const struct lp_rast_state *state, + const struct cmd_block *block, int k ) { - const struct lp_fragment_shader_variant *variant = get_variant(block, k); + const struct lp_fragment_shader_variant *variant = get_variant(state, block, k); if (variant) return variant->key.blend.rt[0].blend_enable; @@ -92,6 +95,7 @@ is_blend( const struct cmd_block *block, static void debug_bin( const struct cmd_bin *bin ) { + const struct lp_rast_state *state = NULL; const struct cmd_block *head = bin->head; int i, j = 0; @@ -99,9 +103,12 @@ debug_bin( const struct cmd_bin *bin ) while (head) { for (i = 0; i < head->count; i++, j++) { + if (head->cmd[i] == LP_RAST_OP_SET_STATE) + state = head->arg[i].state; + debug_printf("%d: %s %s\n", j, cmd_name(head->cmd[i]), - is_blend(head, i) ? "blended" : ""); + is_blend(state, head, i) ? "blended" : ""); } head = head->next; } @@ -133,7 +140,7 @@ debug_shade_tile(int x, int y, char val) { const struct lp_rast_shader_inputs *inputs = arg.shade_tile; - boolean blend = inputs->state->variant->key.blend.rt[0].blend_enable; + boolean blend = tile->state->variant->key.blend.rt[0].blend_enable; unsigned i,j; if (inputs->disable) @@ -171,11 +178,12 @@ debug_triangle(int tilex, int tiley, { const struct lp_rast_triangle *tri = arg.triangle.tri; unsigned plane_mask = arg.triangle.plane_mask; + const struct lp_rast_plane *tri_plane = GET_PLANES(tri); struct lp_rast_plane plane[8]; int x, y; int count = 0; unsigned i, nr_planes = 0; - boolean blend = tri->inputs.state->variant->key.blend.rt[0].blend_enable; + boolean blend = tile->state->variant->key.blend.rt[0].blend_enable; if (tri->inputs.disable) { /* This triangle was partially binned and has been disabled */ @@ -183,7 +191,7 @@ debug_triangle(int tilex, int tiley, } while (plane_mask) { - plane[nr_planes] = tri->plane[u_bit_scan(&plane_mask)]; + plane[nr_planes] = tri_plane[u_bit_scan(&plane_mask)]; plane[nr_planes].c = (plane[nr_planes].c + plane[nr_planes].dcdy * tiley - plane[nr_planes].dcdx * tilex); @@ -232,15 +240,19 @@ do_debug_bin( struct tile *tile, memset(tile->data, ' ', sizeof tile->data); tile->coverage = 0; tile->overdraw = 0; + tile->state = NULL; for (block = bin->head; block; block = block->next) { for (k = 0; k < block->count; k++, j++) { - boolean blend = is_blend(block, k); + boolean blend = is_blend(tile->state, block, k); char val = get_label(j); int count = 0; if (print_cmds) debug_printf("%c: %15s", val, cmd_name(block->cmd[k])); + + if (block->cmd[k] == LP_RAST_OP_SET_STATE) + tile->state = block->arg[k].state; if (block->cmd[k] == LP_RAST_OP_CLEAR_COLOR || block->cmd[k] == LP_RAST_OP_CLEAR_ZSTENCIL) diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h index 7370119e966..b30408f097b 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h @@ -77,6 +77,7 @@ struct cmd_bin; struct lp_rasterizer_task { const struct cmd_bin *bin; + const struct lp_rast_state *state; struct lp_scene *scene; unsigned x, y; /**< Pos of this tile in framebuffer, in pixels */ @@ -244,7 +245,7 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task, unsigned x, unsigned y ) { const struct lp_scene *scene = task->scene; - const struct lp_rast_state *state = inputs->state; + const struct lp_rast_state *state = task->state; struct lp_fragment_shader_variant *variant = state->variant; uint8_t *color[PIPE_MAX_COLOR_BUFS]; void *depth; @@ -260,10 +261,10 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task, BEGIN_JIT_CALL(state); variant->jit_function[RAST_WHOLE]( &state->jit_context, x, y, - inputs->facing, - inputs->a0, - inputs->dadx, - inputs->dady, + inputs->frontfacing, + GET_A0(inputs), + GET_DADX(inputs), + GET_DADY(inputs), color, depth, 0xffff, @@ -293,6 +294,14 @@ void lp_rast_triangle_3_4(struct lp_rasterizer_task *, void lp_rast_triangle_3_16( struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); + +void lp_rast_triangle_4_16( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + +void +lp_rast_set_state(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg); + void lp_debug_bin( const struct cmd_bin *bin ); diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index a1f309d4b01..042c315635e 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -123,6 +123,16 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task, } void +lp_rast_triangle_4_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + union lp_rast_cmd_arg arg2; + arg2.triangle.tri = arg.triangle.tri; + arg2.triangle.plane_mask = (1<<4)-1; + lp_rast_triangle_3(task, arg2); +} + +void lp_rast_triangle_3_4(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { @@ -230,144 +240,207 @@ sign_bits4(const __m128i *cstep, int cdiff) } -/* Special case for 3 plane triangle which is contained entirely - * within a 16x16 block. - */ +#define NR_PLANES 3 + + + + + + + void lp_rast_triangle_3_16(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { const struct lp_rast_triangle *tri = arg.triangle.tri; - const struct lp_rast_plane *plane = tri->plane; - unsigned mask = arg.triangle.plane_mask; - const int x = task->x + (mask & 0xff); - const int y = task->y + (mask >> 8); - unsigned outmask, inmask, partmask, partial_mask; - unsigned j; - __m128i cstep4[3][4]; - - outmask = 0; /* outside one or more trivial reject planes */ - partmask = 0; /* outside one or more trivial accept planes */ - - for (j = 0; j < 3; j++) { - const int dcdx = -plane[j].dcdx * 4; - const int dcdy = plane[j].dcdy * 4; - __m128i xdcdy = _mm_set1_epi32(dcdy); - - cstep4[j][0] = _mm_setr_epi32(0, dcdx, dcdx*2, dcdx*3); - cstep4[j][1] = _mm_add_epi32(cstep4[j][0], xdcdy); - cstep4[j][2] = _mm_add_epi32(cstep4[j][1], xdcdy); - cstep4[j][3] = _mm_add_epi32(cstep4[j][2], xdcdy); - - { - const int c = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x; - const int cox = plane[j].eo * 4; - const int cio = plane[j].ei * 4 - 1; - - outmask |= sign_bits4(cstep4[j], c + cox); - partmask |= sign_bits4(cstep4[j], c + cio); - } - } + const struct lp_rast_plane *plane = GET_PLANES(tri); + int x = (arg.triangle.plane_mask & 0xff) + task->x; + int y = (arg.triangle.plane_mask >> 8) + task->y; + unsigned i, j; + + struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; + unsigned nr = 0; + + __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */ + __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */ + __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */ + __m128i zero = _mm_setzero_si128(); + + __m128i c; + __m128i dcdx; + __m128i dcdy; + __m128i rej4; + + __m128i dcdx2; + __m128i dcdx3; + + __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ + __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ + __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ + __m128i unused; + + transpose4_epi32(&p0, &p1, &p2, &zero, + &c, &dcdx, &dcdy, &rej4); + + /* Adjust dcdx; + */ + dcdx = _mm_sub_epi32(zero, dcdx); - if (outmask == 0xffff) - return; + c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); + c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); + rej4 = _mm_slli_epi32(rej4, 2); - /* Mask of sub-blocks which are inside all trivial accept planes: - */ - inmask = ~partmask & 0xffff; + dcdx2 = _mm_add_epi32(dcdx, dcdx); + dcdx3 = _mm_add_epi32(dcdx2, dcdx); - /* Mask of sub-blocks which are inside all trivial reject planes, - * but outside at least one trivial accept plane: - */ - partial_mask = partmask & ~outmask; + transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, + &span_0, &span_1, &span_2, &unused); - assert((partial_mask & inmask) == 0); + for (i = 0; i < 4; i++) { + __m128i cx = c; - /* Iterate over partials: - */ - while (partial_mask) { - int i = ffs(partial_mask) - 1; - int ix = (i & 3) * 4; - int iy = (i >> 2) * 4; - int px = x + ix; - int py = y + iy; - unsigned mask = 0xffff; - - partial_mask &= ~(1 << i); - - for (j = 0; j < 3; j++) { - const int cx = (plane[j].c - - plane[j].dcdx * px - + plane[j].dcdy * py) * 4; - - mask &= ~sign_bits4(cstep4[j], cx); - } + for (j = 0; j < 4; j++) { + __m128i c4rej = _mm_add_epi32(cx, rej4); + __m128i rej_masks = _mm_srai_epi32(c4rej, 31); - if (mask) - lp_rast_shade_quads_mask(task, &tri->inputs, px, py, mask); - } + /* if (is_zero(rej_masks)) */ + if (_mm_movemask_epi8(rej_masks) == 0) { + __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0); + __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1); + __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2); - /* Iterate over fulls: - */ - while (inmask) { - int i = ffs(inmask) - 1; - int ix = (i & 3) * 4; - int iy = (i >> 2) * 4; - int px = x + ix; - int py = y + iy; + __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0); + + __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0)); + __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1)); + __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2)); + + __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1); + __m128i c_01 = _mm_packs_epi32(c_0, c_1); + + __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0)); + __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1)); + __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2)); + + __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2); - inmask &= ~(1 << i); + __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0)); + __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1)); + __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2)); - block_full_4(task, tri, px, py); + __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3); + __m128i c_23 = _mm_packs_epi32(c_2, c_3); + __m128i c_0123 = _mm_packs_epi16(c_01, c_23); + + unsigned mask = _mm_movemask_epi8(c_0123); + + out[nr].i = i; + out[nr].j = j; + out[nr].mask = mask; + if (mask != 0xffff) + nr++; + } + cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2)); + } + + c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2)); } + + for (i = 0; i < nr; i++) + lp_rast_shade_quads_mask(task, + &tri->inputs, + x + 4 * out[i].j, + y + 4 * out[i].i, + 0xffff & ~out[i].mask); } + + + void lp_rast_triangle_3_4(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) + const union lp_rast_cmd_arg arg) { const struct lp_rast_triangle *tri = arg.triangle.tri; - const struct lp_rast_plane *plane = tri->plane; - unsigned mask = arg.triangle.plane_mask; - const int x = task->x + (mask & 0xff); - const int y = task->y + (mask >> 8); - unsigned j; - - /* Iterate over partials: + const struct lp_rast_plane *plane = GET_PLANES(tri); + int x = (arg.triangle.plane_mask & 0xff) + task->x; + int y = (arg.triangle.plane_mask >> 8) + task->y; + + __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */ + __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */ + __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */ + __m128i zero = _mm_setzero_si128(); + + __m128i c; + __m128i dcdx; + __m128i dcdy; + + __m128i dcdx2; + __m128i dcdx3; + + __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ + __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ + __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ + __m128i unused; + + transpose4_epi32(&p0, &p1, &p2, &zero, + &c, &dcdx, &dcdy, &unused); + + /* Adjust dcdx; */ + dcdx = _mm_sub_epi32(zero, dcdx); + + c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); + c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); + + dcdx2 = _mm_add_epi32(dcdx, dcdx); + dcdx3 = _mm_add_epi32(dcdx2, dcdx); + + transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, + &span_0, &span_1, &span_2, &unused); + + { - unsigned mask = 0xffff; + __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0); + __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1); + __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2); + + __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0); - for (j = 0; j < 3; j++) { - const int cx = (plane[j].c - - plane[j].dcdx * x - + plane[j].dcdy * y); + __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0)); + __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1)); + __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2)); - const int dcdx = -plane[j].dcdx; - const int dcdy = plane[j].dcdy; - __m128i xdcdy = _mm_set1_epi32(dcdy); + __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1); + __m128i c_01 = _mm_packs_epi32(c_0, c_1); - __m128i cstep0 = _mm_setr_epi32(cx, cx + dcdx, cx + dcdx*2, cx + dcdx*3); - __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy); - __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy); - __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy); + __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0)); + __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1)); + __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2)); - __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1); - __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3); - __m128i result = _mm_packs_epi16(cstep01, cstep23); + __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2); - /* Extract the sign bits - */ - mask &= ~_mm_movemask_epi8(result); - } + __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0)); + __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1)); + __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2)); - if (mask) - lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask); + __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3); + __m128i c_23 = _mm_packs_epi32(c_2, c_3); + __m128i c_0123 = _mm_packs_epi16(c_01, c_23); + + unsigned mask = _mm_movemask_epi8(c_0123); + + if (mask != 0xffff) + lp_rast_shade_quads_mask(task, + &tri->inputs, + x, + y, + 0xffff & ~mask); } } - +#undef NR_PLANES #endif @@ -383,10 +456,13 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task, #define TAG(x) x##_3 #define NR_PLANES 3 +/*#define TRI_4 lp_rast_triangle_3_4*/ +/*#define TRI_16 lp_rast_triangle_3_16*/ #include "lp_rast_tri_tmp.h" #define TAG(x) x##_4 #define NR_PLANES 4 +#define TRI_16 lp_rast_triangle_4_16 #include "lp_rast_tri_tmp.h" #define TAG(x) x##_5 diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h index 9830a43ba55..4825d651c04 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h @@ -82,7 +82,8 @@ TAG(do_block_16)(struct lp_rasterizer_task *task, const int dcdx = -plane[j].dcdx * 4; const int dcdy = plane[j].dcdy * 4; const int cox = plane[j].eo * 4; - const int cio = plane[j].ei * 4 - 1; + const int ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo; + const int cio = ei * 4 - 1; build_masks(c[j] + cox, cio - cox, @@ -156,6 +157,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, { const struct lp_rast_triangle *tri = arg.triangle.tri; unsigned plane_mask = arg.triangle.plane_mask; + const struct lp_rast_plane *tri_plane = GET_PLANES(tri); const int x = task->x, y = task->y; struct lp_rast_plane plane[NR_PLANES]; int c[NR_PLANES]; @@ -172,7 +174,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, while (plane_mask) { int i = ffs(plane_mask) - 1; - plane[j] = tri->plane[i]; + plane[j] = tri_plane[i]; plane_mask &= ~(1 << i); c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x; @@ -180,7 +182,8 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, const int dcdx = -plane[j].dcdx * 16; const int dcdy = plane[j].dcdy * 16; const int cox = plane[j].eo * 16; - const int cio = plane[j].ei * 16 - 1; + const int ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo; + const int cio = ei * 16 - 1; build_masks(c[j] + cox, cio - cox, @@ -245,6 +248,133 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, } } +#if defined(PIPE_ARCH_SSE) && defined(TRI_16) +/* XXX: special case this when intersection is not required. + * - tile completely within bbox, + * - bbox completely within tile. + */ +void +TRI_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_rast_triangle *tri = arg.triangle.tri; + const struct lp_rast_plane *plane = GET_PLANES(tri); + unsigned mask = arg.triangle.plane_mask; + unsigned outmask, partial_mask; + unsigned j; + __m128i cstep4[NR_PLANES][4]; + + int x = (mask & 0xff); + int y = (mask >> 8); + + outmask = 0; /* outside one or more trivial reject planes */ + + x += task->x; + y += task->y; + + for (j = 0; j < NR_PLANES; j++) { + const int dcdx = -plane[j].dcdx * 4; + const int dcdy = plane[j].dcdy * 4; + __m128i xdcdy = _mm_set1_epi32(dcdy); + + cstep4[j][0] = _mm_setr_epi32(0, dcdx, dcdx*2, dcdx*3); + cstep4[j][1] = _mm_add_epi32(cstep4[j][0], xdcdy); + cstep4[j][2] = _mm_add_epi32(cstep4[j][1], xdcdy); + cstep4[j][3] = _mm_add_epi32(cstep4[j][2], xdcdy); + + { + const int c = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x; + const int cox = plane[j].eo * 4; + + outmask |= sign_bits4(cstep4[j], c + cox); + } + } + + if (outmask == 0xffff) + return; + + + /* Mask of sub-blocks which are inside all trivial reject planes, + * but outside at least one trivial accept plane: + */ + partial_mask = 0xffff & ~outmask; + + /* Iterate over partials: + */ + while (partial_mask) { + int i = ffs(partial_mask) - 1; + int ix = (i & 3) * 4; + int iy = (i >> 2) * 4; + int px = x + ix; + int py = y + iy; + unsigned mask = 0xffff; + + partial_mask &= ~(1 << i); + + for (j = 0; j < NR_PLANES; j++) { + const int cx = (plane[j].c - 1 + - plane[j].dcdx * px + + plane[j].dcdy * py) * 4; + + mask &= ~sign_bits4(cstep4[j], cx); + } + + if (mask) + lp_rast_shade_quads_mask(task, &tri->inputs, px, py, mask); + } +} +#endif + +#if defined(PIPE_ARCH_SSE) && defined(TRI_4) +void +TRI_4(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_rast_triangle *tri = arg.triangle.tri; + const struct lp_rast_plane *plane = GET_PLANES(tri); + unsigned mask = arg.triangle.plane_mask; + const int x = task->x + (mask & 0xff); + const int y = task->y + (mask >> 8); + unsigned j; + + /* Iterate over partials: + */ + { + unsigned mask = 0xffff; + + for (j = 0; j < NR_PLANES; j++) { + const int cx = (plane[j].c + - plane[j].dcdx * x + + plane[j].dcdy * y); + + const int dcdx = -plane[j].dcdx; + const int dcdy = plane[j].dcdy; + __m128i xdcdy = _mm_set1_epi32(dcdy); + + __m128i cstep0 = _mm_setr_epi32(cx, cx + dcdx, cx + dcdx*2, cx + dcdx*3); + __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy); + __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy); + __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy); + + __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1); + __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3); + __m128i result = _mm_packs_epi16(cstep01, cstep23); + + /* Extract the sign bits + */ + mask &= ~_mm_movemask_epi8(result); + } + + if (mask) + lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask); + } +} +#endif + + + #undef TAG +#undef TRI_4 +#undef TRI_16 #undef NR_PLANES diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c index 8b504f23a33..a4fdf7cff36 100644 --- a/src/gallium/drivers/llvmpipe/lp_scene.c +++ b/src/gallium/drivers/llvmpipe/lp_scene.c @@ -203,7 +203,9 @@ lp_scene_end_rasterization(struct lp_scene *scene ) for (i = 0; i < scene->tiles_x; i++) { for (j = 0; j < scene->tiles_y; j++) { struct cmd_bin *bin = lp_scene_get_bin(scene, i, j); - bin->head = bin->tail = NULL; + bin->head = NULL; + bin->tail = NULL; + bin->last_state = NULL; } } diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h index dbef7692e42..622c522f11a 100644 --- a/src/gallium/drivers/llvmpipe/lp_scene.h +++ b/src/gallium/drivers/llvmpipe/lp_scene.h @@ -41,6 +41,7 @@ #include "lp_debug.h" struct lp_scene_queue; +struct lp_rast_state; /* We're limited to 2K by 2K for 32bit fixed point rasterization. * Will need a 64-bit version for larger framebuffers. @@ -94,6 +95,7 @@ struct data_block { struct cmd_bin { ushort x; ushort y; + const struct lp_rast_state *last_state; /* most recent state set in bin */ struct cmd_block *head; struct cmd_block *tail; }; @@ -297,7 +299,7 @@ lp_scene_bin_command( struct lp_scene *scene, assert(x < scene->tiles_x); assert(y < scene->tiles_y); - assert(cmd <= LP_RAST_OP_END_QUERY); + assert(cmd < LP_RAST_OP_MAX); if (tail == NULL || tail->count == CMD_BLOCK_MAX) { tail = lp_scene_new_cmd_block( scene, bin ); @@ -318,6 +320,30 @@ lp_scene_bin_command( struct lp_scene *scene, } +static INLINE boolean +lp_scene_bin_cmd_with_state( struct lp_scene *scene, + unsigned x, unsigned y, + const struct lp_rast_state *state, + unsigned cmd, + union lp_rast_cmd_arg arg ) +{ + struct cmd_bin *bin = lp_scene_get_bin(scene, x, y); + + if (state != bin->last_state) { + bin->last_state = state; + if (!lp_scene_bin_command(scene, x, y, + LP_RAST_OP_SET_STATE, + lp_rast_arg_state(state))) + return FALSE; + } + + if (!lp_scene_bin_command( scene, x, y, cmd, arg )) + return FALSE; + + return TRUE; +} + + /* Add a command to all active bins. */ static INLINE boolean diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index 96633d93654..ad0ea75b3a3 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -158,6 +158,8 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: return 0; + case PIPE_CAP_PRIMITIVE_RESTART: + return 1; case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE: return 1; case PIPE_CAP_DEPTH_CLAMP: diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index 5ff11a33632..6118434d3d3 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -56,7 +56,7 @@ #include "draw/draw_vbuf.h" -static void set_scene_state( struct lp_setup_context *, enum setup_state, +static boolean set_scene_state( struct lp_setup_context *, enum setup_state, const char *reason); static boolean try_update_scene_state( struct lp_setup_context *setup ); @@ -167,7 +167,7 @@ lp_setup_rasterize_scene( struct lp_setup_context *setup ) -static void +static boolean begin_binning( struct lp_setup_context *setup ) { struct lp_scene *scene = setup->scene; @@ -181,6 +181,8 @@ begin_binning( struct lp_setup_context *setup ) /* Always create a fence: */ scene->fence = lp_fence_create(MAX2(1, setup->num_threads)); + if (!scene->fence) + return FALSE; /* Initialize the bin flags and x/y coords: */ @@ -192,7 +194,8 @@ begin_binning( struct lp_setup_context *setup ) } ok = try_update_scene_state(setup); - assert(ok); + if (!ok) + return FALSE; if (setup->fb.zsbuf && ((setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL) && @@ -208,7 +211,8 @@ begin_binning( struct lp_setup_context *setup ) ok = lp_scene_bin_everywhere( scene, LP_RAST_OP_CLEAR_COLOR, setup->clear.color ); - assert(ok); + if (!ok) + return FALSE; } } @@ -216,12 +220,14 @@ begin_binning( struct lp_setup_context *setup ) if (setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) { if (!need_zsload) scene->has_depthstencil_clear = TRUE; + ok = lp_scene_bin_everywhere( scene, LP_RAST_OP_CLEAR_ZSTENCIL, lp_rast_arg_clearzs( setup->clear.zsvalue, setup->clear.zsmask)); - assert(ok); + if (!ok) + return FALSE; } } @@ -229,15 +235,16 @@ begin_binning( struct lp_setup_context *setup ) ok = lp_scene_bin_everywhere( scene, LP_RAST_OP_BEGIN_QUERY, lp_rast_arg_query(setup->active_query) ); - assert(ok); + if (!ok) + return FALSE; } - setup->clear.flags = 0; setup->clear.zsmask = 0; setup->clear.zsvalue = 0; LP_DBG(DEBUG_SETUP, "%s done\n", __FUNCTION__); + return TRUE; } @@ -246,12 +253,12 @@ begin_binning( struct lp_setup_context *setup ) * * TODO: fast path for fullscreen clears and no triangles. */ -static void +static boolean execute_clears( struct lp_setup_context *setup ) { LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); - begin_binning( setup ); + return begin_binning( setup ); } const char *states[] = { @@ -262,7 +269,7 @@ const char *states[] = { }; -static void +static boolean set_scene_state( struct lp_setup_context *setup, enum setup_state new_state, const char *reason) @@ -270,7 +277,7 @@ set_scene_state( struct lp_setup_context *setup, unsigned old_state = setup->state; if (old_state == new_state) - return; + return TRUE; if (LP_DEBUG & DEBUG_SCENE) { debug_printf("%s old %s new %s%s%s\n", @@ -294,12 +301,14 @@ set_scene_state( struct lp_setup_context *setup, break; case SETUP_ACTIVE: - begin_binning( setup ); + if (!begin_binning( setup )) + goto fail; break; case SETUP_FLUSHED: if (old_state == SETUP_CLEARED) - execute_clears( setup ); + if (!execute_clears( setup )) + goto fail; lp_setup_rasterize_scene( setup ); assert(setup->scene == NULL); @@ -307,9 +316,21 @@ set_scene_state( struct lp_setup_context *setup, default: assert(0 && "invalid setup state mode"); + goto fail; } setup->state = new_state; + return TRUE; + +fail: + if (setup->scene) { + lp_scene_end_rasterization(setup->scene); + setup->scene = NULL; + } + + setup->state = SETUP_FLUSHED; + lp_setup_reset( setup ); + return FALSE; } @@ -377,16 +398,19 @@ lp_setup_try_clear( struct lp_setup_context *setup, } if (flags & PIPE_CLEAR_DEPTHSTENCIL) { - unsigned zmask = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0; - unsigned smask = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0; + uint32_t zmask = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0; + uint32_t smask = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0; zsvalue = util_pack_z_stencil(setup->fb.zsbuf->format, depth, stencil); - zsmask = util_pack_uint_z_stencil(setup->fb.zsbuf->format, + + zsmask = util_pack_mask_z_stencil(setup->fb.zsbuf->format, zmask, smask); + + zsvalue &= zsmask; } if (setup->state == SETUP_ACTIVE) { @@ -431,7 +455,7 @@ lp_setup_try_clear( struct lp_setup_context *setup, if (flags & PIPE_CLEAR_COLOR) { memcpy(setup->clear.color.clear_color, &color_arg, - sizeof color_arg); + sizeof setup->clear.color.clear_color); } } @@ -502,14 +526,12 @@ lp_setup_set_point_state( struct lp_setup_context *setup, } void -lp_setup_set_fs_inputs( struct lp_setup_context *setup, - const struct lp_shader_input *input, - unsigned nr ) +lp_setup_set_setup_variant( struct lp_setup_context *setup, + const struct lp_setup_variant *variant) { - LP_DBG(DEBUG_SETUP, "%s %p %u\n", __FUNCTION__, (void *) input, nr); - - memcpy( setup->fs.input, input, nr * sizeof input[0] ); - setup->fs.nr_inputs = nr; + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + setup->setup.variant = variant; } void @@ -875,7 +897,7 @@ try_update_scene_state( struct lp_setup_context *setup ) return TRUE; } -void +boolean lp_setup_update_state( struct lp_setup_context *setup, boolean update_scene ) { @@ -897,22 +919,47 @@ lp_setup_update_state( struct lp_setup_context *setup, setup->psize = lp->psize_slot; assert(lp->dirty == 0); + + assert(lp->setup_variant.key.size == + setup->setup.variant->key.size); + + assert(memcmp(&lp->setup_variant.key, + &setup->setup.variant->key, + setup->setup.variant->key.size) == 0); } - if (update_scene) - set_scene_state( setup, SETUP_ACTIVE, __FUNCTION__ ); + if (update_scene) { + if (!set_scene_state( setup, SETUP_ACTIVE, __FUNCTION__ )) + return FALSE; + } /* Only call into update_scene_state() if we already have a * scene: */ if (update_scene && setup->scene) { assert(setup->state == SETUP_ACTIVE); - if (!try_update_scene_state(setup)) { - lp_setup_flush_and_restart(setup); - if (!try_update_scene_state(setup)) - assert(0); - } + + if (try_update_scene_state(setup)) + return TRUE; + + /* Update failed, try to restart the scene. + * + * Cannot call lp_setup_flush_and_restart() directly here + * because of potential recursion. + */ + if (!set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__)) + return FALSE; + + if (!set_scene_state(setup, SETUP_ACTIVE, __FUNCTION__)) + return FALSE; + + if (!setup->scene) + return FALSE; + + return try_update_scene_state(setup); } + + return TRUE; } @@ -1016,12 +1063,12 @@ lp_setup_begin_query(struct lp_setup_context *setup, LP_RAST_OP_BEGIN_QUERY, lp_rast_arg_query(pq))) { - lp_setup_flush_and_restart(setup); + if (!lp_setup_flush_and_restart(setup)) + return; if (!lp_scene_bin_everywhere(setup->scene, LP_RAST_OP_BEGIN_QUERY, lp_rast_arg_query(pq))) { - assert(0); return; } } @@ -1065,14 +1112,20 @@ lp_setup_end_query(struct lp_setup_context *setup, struct llvmpipe_query *pq) } -void +boolean lp_setup_flush_and_restart(struct lp_setup_context *setup) { if (0) debug_printf("%s\n", __FUNCTION__); assert(setup->state == SETUP_ACTIVE); - set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__); - lp_setup_update_state(setup, TRUE); + + if (!set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__)) + return FALSE; + + if (!lp_setup_update_state(setup, TRUE)) + return FALSE; + + return TRUE; } diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h index 25dab78f64f..ebb18f81344 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.h +++ b/src/gallium/drivers/llvmpipe/lp_setup.h @@ -33,28 +33,6 @@ struct draw_context; struct vertex_info; -enum lp_interp { - LP_INTERP_CONSTANT, - LP_INTERP_LINEAR, - LP_INTERP_PERSPECTIVE, - LP_INTERP_POSITION, - LP_INTERP_FACING -}; - - -/** - * Describes how to compute the interpolation coefficients (a0, dadx, dady) - * from the vertices passed into our triangle/line/point functions by the - * draw module. - * - * Vertices are treated as an array of float[4] values, indexed by - * src_index. - */ -struct lp_shader_input { - enum lp_interp interp; /* how to interpolate values */ - unsigned src_index; /* where to find values in incoming vertices */ - unsigned usage_mask; /* bitmask of TGSI_WRITEMASK_x flags */ -}; struct pipe_resource; struct pipe_query; @@ -66,7 +44,7 @@ struct lp_fragment_shader_variant; struct lp_jit_context; struct llvmpipe_query; struct pipe_fence_handle; - +struct lp_setup_variant; struct lp_setup_context * lp_setup_create( struct pipe_context *pipe, @@ -111,9 +89,8 @@ lp_setup_set_point_state( struct lp_setup_context *setup, uint sprite_coord_origin); void -lp_setup_set_fs_inputs( struct lp_setup_context *setup, - const struct lp_shader_input *interp, - unsigned nr ); +lp_setup_set_setup_variant( struct lp_setup_context *setup, + const struct lp_setup_variant *variant ); void lp_setup_set_fs_variant( struct lp_setup_context *setup, diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.c b/src/gallium/drivers/llvmpipe/lp_setup_coef.c deleted file mode 100644 index 8dc2688ddb6..00000000000 --- a/src/gallium/drivers/llvmpipe/lp_setup_coef.c +++ /dev/null @@ -1,279 +0,0 @@ -/************************************************************************** - * - * Copyright 2010, VMware. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/* - * Binning code for triangles - */ - -#include "util/u_math.h" -#include "util/u_memory.h" -#include "lp_perf.h" -#include "lp_setup_context.h" -#include "lp_setup_coef.h" -#include "lp_rast.h" -#include "lp_state_fs.h" - -#if !defined(PIPE_ARCH_SSE) - -/** - * Compute a0 for a constant-valued coefficient (GL_FLAT shading). - */ -static void constant_coef( struct lp_rast_shader_inputs *inputs, - unsigned slot, - const float value, - unsigned i ) -{ - inputs->a0[slot][i] = value; - inputs->dadx[slot][i] = 0.0f; - inputs->dady[slot][i] = 0.0f; -} - - - -static void linear_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned vert_attr, - unsigned i) -{ - float a0 = info->v0[vert_attr][i]; - float a1 = info->v1[vert_attr][i]; - float a2 = info->v2[vert_attr][i]; - - float da01 = a0 - a1; - float da20 = a2 - a0; - float dadx = (da01 * info->dy20_ooa - info->dy01_ooa * da20); - float dady = (da20 * info->dx01_ooa - info->dx20_ooa * da01); - - inputs->dadx[slot][i] = dadx; - inputs->dady[slot][i] = dady; - - /* calculate a0 as the value which would be sampled for the - * fragment at (0,0), taking into account that we want to sample at - * pixel centers, in other words (0.5, 0.5). - * - * this is neat but unfortunately not a good way to do things for - * triangles with very large values of dadx or dady as it will - * result in the subtraction and re-addition from a0 of a very - * large number, which means we'll end up loosing a lot of the - * fractional bits and precision from a0. the way to fix this is - * to define a0 as the sample at a pixel center somewhere near vmin - * instead - i'll switch to this later. - */ - inputs->a0[slot][i] = a0 - (dadx * info->x0_center + - dady * info->y0_center); -} - - -/** - * Compute a0, dadx and dady for a perspective-corrected interpolant, - * for a triangle. - * We basically multiply the vertex value by 1/w before computing - * the plane coefficients (a0, dadx, dady). - * Later, when we compute the value at a particular fragment position we'll - * divide the interpolated value by the interpolated W at that fragment. - */ -static void perspective_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned vert_attr, - unsigned i) -{ - /* premultiply by 1/w (v[0][3] is always 1/w): - */ - float a0 = info->v0[vert_attr][i] * info->v0[0][3]; - float a1 = info->v1[vert_attr][i] * info->v1[0][3]; - float a2 = info->v2[vert_attr][i] * info->v2[0][3]; - float da01 = a0 - a1; - float da20 = a2 - a0; - float dadx = da01 * info->dy20_ooa - info->dy01_ooa * da20; - float dady = da20 * info->dx01_ooa - info->dx20_ooa * da01; - - inputs->dadx[slot][i] = dadx; - inputs->dady[slot][i] = dady; - inputs->a0[slot][i] = a0 - (dadx * info->x0_center + - dady * info->y0_center); -} - - -/** - * Special coefficient setup for gl_FragCoord. - * X and Y are trivial - * Z and W are copied from position_coef which should have already been computed. - * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask. - */ -static void -setup_fragcoord_coef(struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned usage_mask) -{ - /*X*/ - if (usage_mask & TGSI_WRITEMASK_X) { - inputs->a0[slot][0] = 0.0; - inputs->dadx[slot][0] = 1.0; - inputs->dady[slot][0] = 0.0; - } - - /*Y*/ - if (usage_mask & TGSI_WRITEMASK_Y) { - inputs->a0[slot][1] = 0.0; - inputs->dadx[slot][1] = 0.0; - inputs->dady[slot][1] = 1.0; - } - - /*Z*/ - if (usage_mask & TGSI_WRITEMASK_Z) { - linear_coef(inputs, info, slot, 0, 2); - } - - /*W*/ - if (usage_mask & TGSI_WRITEMASK_W) { - linear_coef(inputs, info, slot, 0, 3); - } -} - - -/** - * Setup the fragment input attribute with the front-facing value. - * \param frontface is the triangle front facing? - */ -static void setup_facing_coef( struct lp_rast_shader_inputs *inputs, - unsigned slot, - boolean frontface, - unsigned usage_mask) -{ - /* convert TRUE to 1.0 and FALSE to -1.0 */ - if (usage_mask & TGSI_WRITEMASK_X) - constant_coef( inputs, slot, 2.0f * frontface - 1.0f, 0 ); - - if (usage_mask & TGSI_WRITEMASK_Y) - constant_coef( inputs, slot, 0.0f, 1 ); /* wasted */ - - if (usage_mask & TGSI_WRITEMASK_Z) - constant_coef( inputs, slot, 0.0f, 2 ); /* wasted */ - - if (usage_mask & TGSI_WRITEMASK_W) - constant_coef( inputs, slot, 0.0f, 3 ); /* wasted */ -} - - -/** - * Compute the tri->coef[] array dadx, dady, a0 values. - */ -void lp_setup_tri_coef( struct lp_setup_context *setup, - struct lp_rast_shader_inputs *inputs, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4], - boolean frontfacing) -{ - unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; - unsigned slot; - unsigned i; - struct lp_tri_info info; - float dx01 = v0[0][0] - v1[0][0]; - float dy01 = v0[0][1] - v1[0][1]; - float dx20 = v2[0][0] - v0[0][0]; - float dy20 = v2[0][1] - v0[0][1]; - float oneoverarea = 1.0f / (dx01 * dy20 - dx20 * dy01); - - info.v0 = v0; - info.v1 = v1; - info.v2 = v2; - info.frontfacing = frontfacing; - info.x0_center = v0[0][0] - setup->pixel_offset; - info.y0_center = v0[0][1] - setup->pixel_offset; - info.dx01_ooa = dx01 * oneoverarea; - info.dx20_ooa = dx20 * oneoverarea; - info.dy01_ooa = dy01 * oneoverarea; - info.dy20_ooa = dy20 * oneoverarea; - - - /* setup interpolation for all the remaining attributes: - */ - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned vert_attr = setup->fs.input[slot].src_index; - unsigned usage_mask = setup->fs.input[slot].usage_mask; - - switch (setup->fs.input[slot].interp) { - case LP_INTERP_CONSTANT: - if (setup->flatshade_first) { - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - constant_coef(inputs, slot+1, info.v0[vert_attr][i], i); - } - else { - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - constant_coef(inputs, slot+1, info.v2[vert_attr][i], i); - } - break; - - case LP_INTERP_LINEAR: - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - linear_coef(inputs, &info, slot+1, vert_attr, i); - break; - - case LP_INTERP_PERSPECTIVE: - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - perspective_coef(inputs, &info, slot+1, vert_attr, i); - fragcoord_usage_mask |= TGSI_WRITEMASK_W; - break; - - case LP_INTERP_POSITION: - /* - * The generated pixel interpolators will pick up the coeffs from - * slot 0, so all need to ensure that the usage mask is covers all - * usages. - */ - fragcoord_usage_mask |= usage_mask; - break; - - case LP_INTERP_FACING: - setup_facing_coef(inputs, slot+1, info.frontfacing, usage_mask); - break; - - default: - assert(0); - } - } - - /* The internal position input is in slot zero: - */ - setup_fragcoord_coef(inputs, &info, 0, fragcoord_usage_mask); -} - -#else -extern void lp_setup_coef_dummy(void); -void lp_setup_coef_dummy(void) -{ -} - -#endif diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.h b/src/gallium/drivers/llvmpipe/lp_setup_coef.h deleted file mode 100644 index 87a3255ccc6..00000000000 --- a/src/gallium/drivers/llvmpipe/lp_setup_coef.h +++ /dev/null @@ -1,64 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -/** - * The setup code is concerned with point/line/triangle setup and - * putting commands/data into the bins. - */ - - -#ifndef LP_SETUP_COEF_H -#define LP_SETUP_COEF_H - - -struct lp_tri_info { - - float x0_center; - float y0_center; - - /* turn these into an aligned float[4] */ - float dy01_ooa; - float dy20_ooa; - float dx01_ooa; - float dx20_ooa; - - const float (*v0)[4]; - const float (*v1)[4]; - const float (*v2)[4]; - - boolean frontfacing; /* remove eventually */ -}; - -void lp_setup_tri_coef( struct lp_setup_context *setup, - struct lp_rast_shader_inputs *inputs, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4], - boolean frontfacing); - -#endif diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c b/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c deleted file mode 100644 index 3742fd672b2..00000000000 --- a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c +++ /dev/null @@ -1,228 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/* - * Binning code for triangles - */ - -#include "util/u_math.h" -#include "util/u_memory.h" -#include "lp_perf.h" -#include "lp_setup_context.h" -#include "lp_setup_coef.h" -#include "lp_rast.h" - -#if defined(PIPE_ARCH_SSE) -#include <emmintrin.h> - - -static void constant_coef4( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - const float *attr) -{ - *(__m128 *)inputs->a0[slot] = *(__m128 *)attr; - *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0); - *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0); -} - - - -/** - * Setup the fragment input attribute with the front-facing value. - * \param frontface is the triangle front facing? - */ -static void setup_facing_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot ) -{ - /* XXX: just pass frontface directly to the shader, don't bother - * treating it as an input. - */ - __m128 a0 = _mm_setr_ps(info->frontfacing ? 1.0 : -1.0, - 0, 0, 0); - - *(__m128 *)inputs->a0[slot] = a0; - *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0); - *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0); -} - - - -static void calc_coef4( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - __m128 a0, - __m128 a1, - __m128 a2) -{ - __m128 da01 = _mm_sub_ps(a0, a1); - __m128 da20 = _mm_sub_ps(a2, a0); - - __m128 da01_dy20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dy20_ooa)); - __m128 da20_dy01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dy01_ooa)); - __m128 dadx = _mm_sub_ps(da01_dy20_ooa, da20_dy01_ooa); - - __m128 da01_dx20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dx20_ooa)); - __m128 da20_dx01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dx01_ooa)); - __m128 dady = _mm_sub_ps(da20_dx01_ooa, da01_dx20_ooa); - - __m128 dadx_x0 = _mm_mul_ps(dadx, _mm_set1_ps(info->x0_center)); - __m128 dady_y0 = _mm_mul_ps(dady, _mm_set1_ps(info->y0_center)); - __m128 attr_v0 = _mm_add_ps(dadx_x0, dady_y0); - __m128 attr_0 = _mm_sub_ps(a0, attr_v0); - - *(__m128 *)inputs->a0[slot] = attr_0; - *(__m128 *)inputs->dadx[slot] = dadx; - *(__m128 *)inputs->dady[slot] = dady; -} - - -static void linear_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned vert_attr) -{ - __m128 a0 = *(const __m128 *)info->v0[vert_attr]; - __m128 a1 = *(const __m128 *)info->v1[vert_attr]; - __m128 a2 = *(const __m128 *)info->v2[vert_attr]; - - calc_coef4(inputs, info, slot, a0, a1, a2); -} - - - -/** - * Compute a0, dadx and dady for a perspective-corrected interpolant, - * for a triangle. - * We basically multiply the vertex value by 1/w before computing - * the plane coefficients (a0, dadx, dady). - * Later, when we compute the value at a particular fragment position we'll - * divide the interpolated value by the interpolated W at that fragment. - */ -static void perspective_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned vert_attr) -{ - /* premultiply by 1/w (v[0][3] is always 1/w): - */ - __m128 a0 = *(const __m128 *)info->v0[vert_attr]; - __m128 a1 = *(const __m128 *)info->v1[vert_attr]; - __m128 a2 = *(const __m128 *)info->v2[vert_attr]; - - __m128 a0_oow = _mm_mul_ps(a0, _mm_set1_ps(info->v0[0][3])); - __m128 a1_oow = _mm_mul_ps(a1, _mm_set1_ps(info->v1[0][3])); - __m128 a2_oow = _mm_mul_ps(a2, _mm_set1_ps(info->v2[0][3])); - - calc_coef4(inputs, info, slot, a0_oow, a1_oow, a2_oow); -} - - - - - -/** - * Compute the inputs-> dadx, dady, a0 values. - */ -void lp_setup_tri_coef( struct lp_setup_context *setup, - struct lp_rast_shader_inputs *inputs, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4], - boolean frontfacing) -{ - unsigned slot; - struct lp_tri_info info; - float dx01 = v0[0][0] - v1[0][0]; - float dy01 = v0[0][1] - v1[0][1]; - float dx20 = v2[0][0] - v0[0][0]; - float dy20 = v2[0][1] - v0[0][1]; - float oneoverarea = 1.0f / (dx01 * dy20 - dx20 * dy01); - - info.v0 = v0; - info.v1 = v1; - info.v2 = v2; - info.frontfacing = frontfacing; - info.x0_center = v0[0][0] - setup->pixel_offset; - info.y0_center = v0[0][1] - setup->pixel_offset; - info.dx01_ooa = dx01 * oneoverarea; - info.dx20_ooa = dx20 * oneoverarea; - info.dy01_ooa = dy01 * oneoverarea; - info.dy20_ooa = dy20 * oneoverarea; - - - /* The internal position input is in slot zero: - */ - linear_coef(inputs, &info, 0, 0); - - /* setup interpolation for all the remaining attributes: - */ - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned vert_attr = setup->fs.input[slot].src_index; - - switch (setup->fs.input[slot].interp) { - case LP_INTERP_CONSTANT: - if (setup->flatshade_first) { - constant_coef4(inputs, &info, slot+1, info.v0[vert_attr]); - } - else { - constant_coef4(inputs, &info, slot+1, info.v2[vert_attr]); - } - break; - - case LP_INTERP_LINEAR: - linear_coef(inputs, &info, slot+1, vert_attr); - break; - - case LP_INTERP_PERSPECTIVE: - perspective_coef(inputs, &info, slot+1, vert_attr); - break; - - case LP_INTERP_POSITION: - /* - * The generated pixel interpolators will pick up the coeffs from - * slot 0. - */ - break; - - case LP_INTERP_FACING: - setup_facing_coef(inputs, &info, slot+1); - break; - - default: - assert(0); - } - } -} - -#else -extern void lp_setup_coef_dummy(void); -void lp_setup_coef_dummy(void) -{ -} -#endif diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h index 8506ed2dc9e..dc2533bedc5 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_context.h +++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h @@ -39,6 +39,7 @@ #include "lp_rast.h" #include "lp_tile_soa.h" /* for TILE_SIZE */ #include "lp_scene.h" +#include "lp_bld_interp.h" /* for struct lp_shader_input */ #include "draw/draw_vbuf.h" #include "util/u_rect.h" @@ -49,6 +50,8 @@ #define LP_SETUP_NEW_SCISSOR 0x08 +struct lp_setup_variant; + /** Max number of scenes */ #define MAX_SCENES 2 @@ -118,9 +121,6 @@ struct lp_setup_context } state; struct { - struct lp_shader_input input[PIPE_MAX_ATTRIBS]; - unsigned nr_inputs; - const struct lp_rast_state *stored; /**< what's in the scene */ struct lp_rast_state current; /**< currently set state */ struct pipe_resource *current_tex[PIPE_MAX_SAMPLERS]; @@ -139,6 +139,10 @@ struct lp_setup_context } blend_color; + struct { + const struct lp_setup_variant *variant; + } setup; + unsigned dirty; /**< bitmask of LP_SETUP_NEW_x bits */ void (*point)( struct lp_setup_context *, @@ -160,12 +164,12 @@ void lp_setup_choose_point( struct lp_setup_context *setup ); void lp_setup_init_vbuf(struct lp_setup_context *setup); -void lp_setup_update_state( struct lp_setup_context *setup, +boolean lp_setup_update_state( struct lp_setup_context *setup, boolean update_scene); void lp_setup_destroy( struct lp_setup_context *setup ); -void lp_setup_flush_and_restart(struct lp_setup_context *setup); +boolean lp_setup_flush_and_restart(struct lp_setup_context *setup); void lp_setup_print_triangle(struct lp_setup_context *setup, @@ -181,7 +185,7 @@ lp_setup_print_vertex(struct lp_setup_context *setup, struct lp_rast_triangle * lp_setup_alloc_triangle(struct lp_scene *scene, - unsigned nr_inputs, + unsigned num_inputs, unsigned nr_planes, unsigned *tri_size); @@ -191,6 +195,4 @@ lp_setup_bin_triangle( struct lp_setup_context *setup, const struct u_rect *bbox, int nr_planes ); -void lp_setup_flush_and_restart(struct lp_setup_context *setup); - #endif diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c index 156bd633754..827413bb33e 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c @@ -35,6 +35,7 @@ #include "lp_setup_context.h" #include "lp_rast.h" #include "lp_state_fs.h" +#include "lp_state_setup.h" #define NUM_CHANNELS 4 @@ -46,6 +47,10 @@ struct lp_line_info { const float (*v1)[4]; const float (*v2)[4]; + + float (*a0)[4]; + float (*dadx)[4]; + float (*dady)[4]; }; @@ -53,14 +58,14 @@ struct lp_line_info { * Compute a0 for a constant-valued coefficient (GL_FLAT shading). */ static void constant_coef( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, + struct lp_line_info *info, unsigned slot, const float value, unsigned i ) { - tri->inputs.a0[slot][i] = value; - tri->inputs.dadx[slot][i] = 0.0f; - tri->inputs.dady[slot][i] = 0.0f; + info->a0[slot][i] = value; + info->dadx[slot][i] = 0.0f; + info->dady[slot][i] = 0.0f; } @@ -69,7 +74,6 @@ static void constant_coef( struct lp_setup_context *setup, * for a triangle. */ static void linear_coef( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, struct lp_line_info *info, unsigned slot, unsigned vert_attr, @@ -82,10 +86,10 @@ static void linear_coef( struct lp_setup_context *setup, float dadx = da21 * info->dx * info->oneoverarea; float dady = da21 * info->dy * info->oneoverarea; - tri->inputs.dadx[slot][i] = dadx; - tri->inputs.dady[slot][i] = dady; + info->dadx[slot][i] = dadx; + info->dady[slot][i] = dady; - tri->inputs.a0[slot][i] = (a1 - + info->a0[slot][i] = (a1 - (dadx * (info->v1[0][0] - setup->pixel_offset) + dady * (info->v1[0][1] - setup->pixel_offset))); } @@ -100,7 +104,6 @@ static void linear_coef( struct lp_setup_context *setup, * divide the interpolated value by the interpolated W at that fragment. */ static void perspective_coef( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, struct lp_line_info *info, unsigned slot, unsigned vert_attr, @@ -115,43 +118,42 @@ static void perspective_coef( struct lp_setup_context *setup, float dadx = da21 * info->dx * info->oneoverarea; float dady = da21 * info->dy * info->oneoverarea; - tri->inputs.dadx[slot][i] = dadx; - tri->inputs.dady[slot][i] = dady; + info->dadx[slot][i] = dadx; + info->dady[slot][i] = dady; - tri->inputs.a0[slot][i] = (a1 - - (dadx * (info->v1[0][0] - setup->pixel_offset) + - dady * (info->v1[0][1] - setup->pixel_offset))); + info->a0[slot][i] = (a1 - + (dadx * (info->v1[0][0] - setup->pixel_offset) + + dady * (info->v1[0][1] - setup->pixel_offset))); } static void setup_fragcoord_coef( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, struct lp_line_info *info, unsigned slot, unsigned usage_mask) { /*X*/ if (usage_mask & TGSI_WRITEMASK_X) { - tri->inputs.a0[slot][0] = 0.0; - tri->inputs.dadx[slot][0] = 1.0; - tri->inputs.dady[slot][0] = 0.0; + info->a0[slot][0] = 0.0; + info->dadx[slot][0] = 1.0; + info->dady[slot][0] = 0.0; } /*Y*/ if (usage_mask & TGSI_WRITEMASK_Y) { - tri->inputs.a0[slot][1] = 0.0; - tri->inputs.dadx[slot][1] = 0.0; - tri->inputs.dady[slot][1] = 1.0; + info->a0[slot][1] = 0.0; + info->dadx[slot][1] = 0.0; + info->dady[slot][1] = 1.0; } /*Z*/ if (usage_mask & TGSI_WRITEMASK_Z) { - linear_coef(setup, tri, info, slot, 0, 2); + linear_coef(setup, info, slot, 0, 2); } /*W*/ if (usage_mask & TGSI_WRITEMASK_W) { - linear_coef(setup, tri, info, slot, 0, 3); + linear_coef(setup, info, slot, 0, 3); } } @@ -159,43 +161,43 @@ setup_fragcoord_coef( struct lp_setup_context *setup, * Compute the tri->coef[] array dadx, dady, a0 values. */ static void setup_line_coefficients( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, struct lp_line_info *info) { + const struct lp_setup_variant_key *key = &setup->setup.variant->key; unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; unsigned slot; /* setup interpolation for all the remaining attributes: */ - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned vert_attr = setup->fs.input[slot].src_index; - unsigned usage_mask = setup->fs.input[slot].usage_mask; + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned vert_attr = key->inputs[slot].src_index; + unsigned usage_mask = key->inputs[slot].usage_mask; unsigned i; - switch (setup->fs.input[slot].interp) { + switch (key->inputs[slot].interp) { case LP_INTERP_CONSTANT: - if (setup->flatshade_first) { + if (key->flatshade_first) { for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) - constant_coef(setup, tri, slot+1, info->v1[vert_attr][i], i); + constant_coef(setup, info, slot+1, info->v1[vert_attr][i], i); } else { for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) - constant_coef(setup, tri, slot+1, info->v2[vert_attr][i], i); + constant_coef(setup, info, slot+1, info->v2[vert_attr][i], i); } break; case LP_INTERP_LINEAR: for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) - linear_coef(setup, tri, info, slot+1, vert_attr, i); + linear_coef(setup, info, slot+1, vert_attr, i); break; case LP_INTERP_PERSPECTIVE: for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) - perspective_coef(setup, tri, info, slot+1, vert_attr, i); + perspective_coef(setup, info, slot+1, vert_attr, i); fragcoord_usage_mask |= TGSI_WRITEMASK_W; break; @@ -211,7 +213,7 @@ static void setup_line_coefficients( struct lp_setup_context *setup, case LP_INTERP_FACING: for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) - constant_coef(setup, tri, slot+1, 1.0, i); + constant_coef(setup, info, slot+1, 1.0, i); break; default: @@ -221,7 +223,7 @@ static void setup_line_coefficients( struct lp_setup_context *setup, /* The internal position input is in slot zero: */ - setup_fragcoord_coef(setup, tri, info, 0, + setup_fragcoord_coef(setup, info, 0, fragcoord_usage_mask); } @@ -241,14 +243,15 @@ print_line(struct lp_setup_context *setup, const float (*v1)[4], const float (*v2)[4]) { + const struct lp_setup_variant_key *key = &setup->setup.variant->key; uint i; debug_printf("llvmpipe line\n"); - for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { + for (i = 0; i < 1 + key->num_inputs; i++) { debug_printf(" v1[%d]: %f %f %f %f\n", i, v1[i][0], v1[i][1], v1[i][2], v1[i][3]); } - for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { + for (i = 0; i < 1 + key->num_inputs; i++) { debug_printf(" v2[%d]: %f %f %f %f\n", i, v2[i][0], v2[i][1], v2[i][2], v2[i][3]); } @@ -275,7 +278,9 @@ try_setup_line( struct lp_setup_context *setup, const float (*v2)[4]) { struct lp_scene *scene = setup->scene; + const struct lp_setup_variant_key *key = &setup->setup.variant->key; struct lp_rast_triangle *line; + struct lp_rast_plane *plane; struct lp_line_info info; float width = MAX2(1.0, setup->line_width); struct u_rect bbox; @@ -475,7 +480,7 @@ try_setup_line( struct lp_setup_context *setup, else { /* do intersection test */ float xintersect = fracf(v2[0][0]) + y2diff * dxdy; - draw_end = (xintersect < 1.0 && xintersect > 0.0); + draw_end = (xintersect < 1.0 && xintersect >= 0.0); } /* Are we already drawing start/end? @@ -513,7 +518,7 @@ try_setup_line( struct lp_setup_context *setup, x_offset_end = y_offset_end * dxdy; } } - + /* x/y positions in fixed point */ x[0] = subpixel_snap(v1[0][0] + x_offset - setup->pixel_offset) - fixed_width/2; x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) - fixed_width/2; @@ -567,7 +572,7 @@ try_setup_line( struct lp_setup_context *setup, u_rect_find_intersection(&setup->draw_region, &bbox); line = lp_setup_alloc_triangle(scene, - setup->fs.nr_inputs, + key->num_inputs, nr_planes, &tri_bytes); if (!line) @@ -581,33 +586,35 @@ try_setup_line( struct lp_setup_context *setup, #endif /* calculate the deltas */ - line->plane[0].dcdy = x[0] - x[1]; - line->plane[1].dcdy = x[1] - x[2]; - line->plane[2].dcdy = x[2] - x[3]; - line->plane[3].dcdy = x[3] - x[0]; + plane = GET_PLANES(line); + plane[0].dcdy = x[0] - x[1]; + plane[1].dcdy = x[1] - x[2]; + plane[2].dcdy = x[2] - x[3]; + plane[3].dcdy = x[3] - x[0]; - line->plane[0].dcdx = y[0] - y[1]; - line->plane[1].dcdx = y[1] - y[2]; - line->plane[2].dcdx = y[2] - y[3]; - line->plane[3].dcdx = y[3] - y[0]; + plane[0].dcdx = y[0] - y[1]; + plane[1].dcdx = y[1] - y[2]; + plane[2].dcdx = y[2] - y[3]; + plane[3].dcdx = y[3] - y[0]; /* Setup parameter interpolants: */ - setup_line_coefficients( setup, line, &info); + info.a0 = GET_A0(&line->inputs); + info.dadx = GET_DADX(&line->inputs); + info.dady = GET_DADY(&line->inputs); + setup_line_coefficients(setup, &info); - line->inputs.facing = 1.0F; - line->inputs.state = setup->fs.stored; + line->inputs.frontfacing = TRUE; line->inputs.disable = FALSE; line->inputs.opaque = FALSE; for (i = 0; i < 4; i++) { - struct lp_rast_plane *plane = &line->plane[i]; /* half-edge constants, will be interated over the whole render * target. */ - plane->c = plane->dcdx * x[i] - plane->dcdy * y[i]; + plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i]; /* correct for top-left vs. bottom-left fill convention. @@ -623,38 +630,34 @@ try_setup_line( struct lp_setup_context *setup, * to its usual method, in which case it will probably want * to use the opposite, top-left convention. */ - if (plane->dcdx < 0) { + if (plane[i].dcdx < 0) { /* both fill conventions want this - adjust for left edges */ - plane->c++; + plane[i].c++; } - else if (plane->dcdx == 0) { + else if (plane[i].dcdx == 0) { if (setup->pixel_offset == 0) { /* correct for top-left fill convention: */ - if (plane->dcdy > 0) plane->c++; + if (plane[i].dcdy > 0) plane[i].c++; } else { /* correct for bottom-left fill convention: */ - if (plane->dcdy < 0) plane->c++; + if (plane[i].dcdy < 0) plane[i].c++; } } - plane->dcdx *= FIXED_ONE; - plane->dcdy *= FIXED_ONE; + plane[i].dcdx *= FIXED_ONE; + plane[i].dcdy *= FIXED_ONE; /* find trivial reject offsets for each edge for a single-pixel * sized block. These will be scaled up at each recursive level to * match the active blocksize. Scaling in this way works best if * the blocks are square. */ - plane->eo = 0; - if (plane->dcdx < 0) plane->eo -= plane->dcdx; - if (plane->dcdy > 0) plane->eo += plane->dcdy; - - /* Calculate trivial accept offsets from the above. - */ - plane->ei = plane->dcdy - plane->dcdx - plane->eo; + plane[i].eo = 0; + if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx; + if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy; } @@ -677,29 +680,25 @@ try_setup_line( struct lp_setup_context *setup, * these planes elsewhere. */ if (nr_planes == 8) { - line->plane[4].dcdx = -1; - line->plane[4].dcdy = 0; - line->plane[4].c = 1-bbox.x0; - line->plane[4].ei = 0; - line->plane[4].eo = 1; - - line->plane[5].dcdx = 1; - line->plane[5].dcdy = 0; - line->plane[5].c = bbox.x1+1; - line->plane[5].ei = -1; - line->plane[5].eo = 0; - - line->plane[6].dcdx = 0; - line->plane[6].dcdy = 1; - line->plane[6].c = 1-bbox.y0; - line->plane[6].ei = 0; - line->plane[6].eo = 1; - - line->plane[7].dcdx = 0; - line->plane[7].dcdy = -1; - line->plane[7].c = bbox.y1+1; - line->plane[7].ei = -1; - line->plane[7].eo = 0; + plane[4].dcdx = -1; + plane[4].dcdy = 0; + plane[4].c = 1-bbox.x0; + plane[4].eo = 1; + + plane[5].dcdx = 1; + plane[5].dcdy = 0; + plane[5].c = bbox.x1+1; + plane[5].eo = 0; + + plane[6].dcdx = 0; + plane[6].dcdy = 1; + plane[6].c = 1-bbox.y0; + plane[6].eo = 1; + + plane[7].dcdx = 0; + plane[7].dcdy = -1; + plane[7].c = bbox.y1+1; + plane[7].eo = 0; } return lp_setup_bin_triangle(setup, line, &bbox, nr_planes); @@ -712,10 +711,11 @@ static void lp_setup_line( struct lp_setup_context *setup, { if (!try_setup_line( setup, v0, v1 )) { - lp_setup_flush_and_restart(setup); + if (!lp_setup_flush_and_restart(setup)) + return; if (!try_setup_line( setup, v0, v1 )) - assert(0); + return; } } diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c index 3b217f95447..146f1bd07ca 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_point.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c @@ -35,6 +35,7 @@ #include "lp_perf.h" #include "lp_rast.h" #include "lp_state_fs.h" +#include "lp_state_setup.h" #include "tgsi/tgsi_scan.h" #define NUM_CHANNELS 4 @@ -45,6 +46,10 @@ struct point_info { int dx01, dx12; const float (*v0)[4]; + + float (*a0)[4]; + float (*dadx)[4]; + float (*dady)[4]; }; @@ -53,14 +58,37 @@ struct point_info { */ static void constant_coef(struct lp_setup_context *setup, - struct lp_rast_triangle *point, + struct point_info *info, unsigned slot, const float value, unsigned i) { - point->inputs.a0[slot][i] = value; - point->inputs.dadx[slot][i] = 0.0f; - point->inputs.dady[slot][i] = 0.0f; + info->a0[slot][i] = value; + info->dadx[slot][i] = 0.0f; + info->dady[slot][i] = 0.0f; +} + + +static void +point_persp_coeff(struct lp_setup_context *setup, + const struct point_info *info, + unsigned slot, + unsigned i) +{ + /* + * Fragment shader expects pre-multiplied w for LP_INTERP_PERSPECTIVE. A + * better stratergy would be to take the primitive in consideration when + * generating the fragment shader key, and therefore avoid the per-fragment + * perspective divide. + */ + + float w0 = info->v0[0][3]; + + assert(i < 4); + + info->a0[slot][i] = info->v0[slot][i]*w0; + info->dadx[slot][i] = 0.0f; + info->dady[slot][i] = 0.0f; } @@ -69,17 +97,19 @@ constant_coef(struct lp_setup_context *setup, * \param slot the vertex attribute slot to setup * \param i the attribute channel in [0,3] * \param sprite_coord_origin one of PIPE_SPRITE_COORD_x - * \param perspective_proj will the TEX instruction do a divide by Q? + * \param perspective does the shader expects pre-multiplied w, i.e., + * LP_INTERP_PERSPECTIVE is specified in the shader key */ static void texcoord_coef(struct lp_setup_context *setup, - struct lp_rast_triangle *point, const struct point_info *info, unsigned slot, unsigned i, unsigned sprite_coord_origin, - boolean perspective_proj) + boolean perspective) { + float w0 = info->v0[0][3]; + assert(i < 4); if (i == 0) { @@ -88,21 +118,14 @@ texcoord_coef(struct lp_setup_context *setup, float x0 = info->v0[0][0] - setup->pixel_offset; float y0 = info->v0[0][1] - setup->pixel_offset; - point->inputs.dadx[slot][0] = dadx; - point->inputs.dady[slot][0] = dady; - point->inputs.a0[slot][0] = 0.5 - (dadx * x0 + dady * y0); + info->dadx[slot][0] = dadx; + info->dady[slot][0] = dady; + info->a0[slot][0] = 0.5 - (dadx * x0 + dady * y0); - if (!perspective_proj) { - /* Divide coefficients by vertex.w here. - * - * It would be clearer to always multiply by w0 above and - * then divide it out for perspective projection here, but - * doing it this way involves less algebra. - */ - float w0 = info->v0[0][3]; - point->inputs.dadx[slot][0] *= w0; - point->inputs.dady[slot][0] *= w0; - point->inputs.a0[slot][0] *= w0; + if (perspective) { + info->dadx[slot][0] *= w0; + info->dady[slot][0] *= w0; + info->a0[slot][0] *= w0; } } else if (i == 1) { @@ -115,26 +138,25 @@ texcoord_coef(struct lp_setup_context *setup, dady = -dady; } - point->inputs.dadx[slot][1] = dadx; - point->inputs.dady[slot][1] = dady; - point->inputs.a0[slot][1] = 0.5 - (dadx * x0 + dady * y0); + info->dadx[slot][1] = dadx; + info->dady[slot][1] = dady; + info->a0[slot][1] = 0.5 - (dadx * x0 + dady * y0); - if (!perspective_proj) { - float w0 = info->v0[0][3]; - point->inputs.dadx[slot][1] *= w0; - point->inputs.dady[slot][1] *= w0; - point->inputs.a0[slot][1] *= w0; + if (perspective) { + info->dadx[slot][1] *= w0; + info->dady[slot][1] *= w0; + info->a0[slot][1] *= w0; } } else if (i == 2) { - point->inputs.a0[slot][2] = 0.0f; - point->inputs.dadx[slot][2] = 0.0f; - point->inputs.dady[slot][2] = 0.0f; + info->a0[slot][2] = 0.0f; + info->dadx[slot][2] = 0.0f; + info->dady[slot][2] = 0.0f; } else { - point->inputs.a0[slot][3] = 1.0f; - point->inputs.dadx[slot][3] = 0.0f; - point->inputs.dady[slot][3] = 0.0f; + info->a0[slot][3] = perspective ? w0 : 1.0f; + info->dadx[slot][3] = 0.0f; + info->dady[slot][3] = 0.0f; } } @@ -147,33 +169,32 @@ texcoord_coef(struct lp_setup_context *setup, */ static void setup_point_fragcoord_coef(struct lp_setup_context *setup, - struct lp_rast_triangle *point, - const struct point_info *info, + struct point_info *info, unsigned slot, unsigned usage_mask) { /*X*/ if (usage_mask & TGSI_WRITEMASK_X) { - point->inputs.a0[slot][0] = 0.0; - point->inputs.dadx[slot][0] = 1.0; - point->inputs.dady[slot][0] = 0.0; + info->a0[slot][0] = 0.0; + info->dadx[slot][0] = 1.0; + info->dady[slot][0] = 0.0; } /*Y*/ if (usage_mask & TGSI_WRITEMASK_Y) { - point->inputs.a0[slot][1] = 0.0; - point->inputs.dadx[slot][1] = 0.0; - point->inputs.dady[slot][1] = 1.0; + info->a0[slot][1] = 0.0; + info->dadx[slot][1] = 0.0; + info->dady[slot][1] = 1.0; } /*Z*/ if (usage_mask & TGSI_WRITEMASK_Z) { - constant_coef(setup, point, slot, info->v0[0][2], 2); + constant_coef(setup, info, slot, info->v0[0][2], 2); } /*W*/ if (usage_mask & TGSI_WRITEMASK_W) { - constant_coef(setup, point, slot, info->v0[0][3], 3); + constant_coef(setup, info, slot, info->v0[0][3], 3); } } @@ -183,21 +204,27 @@ setup_point_fragcoord_coef(struct lp_setup_context *setup, */ static void setup_point_coefficients( struct lp_setup_context *setup, - struct lp_rast_triangle *point, - const struct point_info *info) + struct point_info *info) { + const struct lp_setup_variant_key *key = &setup->setup.variant->key; const struct lp_fragment_shader *shader = setup->fs.current.variant->shader; unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; unsigned slot; /* setup interpolation for all the remaining attributes: */ - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned vert_attr = setup->fs.input[slot].src_index; - unsigned usage_mask = setup->fs.input[slot].usage_mask; + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned vert_attr = key->inputs[slot].src_index; + unsigned usage_mask = key->inputs[slot].usage_mask; + enum lp_interp interp = key->inputs[slot].interp; + boolean perspective = !!(interp == LP_INTERP_PERSPECTIVE); unsigned i; + + if (perspective & usage_mask) { + fragcoord_usage_mask |= TGSI_WRITEMASK_W; + } - switch (setup->fs.input[slot].interp) { + switch (interp) { case LP_INTERP_POSITION: /* * The generated pixel interpolators will pick up the coeffs from @@ -210,39 +237,45 @@ setup_point_coefficients( struct lp_setup_context *setup, case LP_INTERP_LINEAR: /* Sprite tex coords may use linear interpolation someday */ /* fall-through */ - case LP_INTERP_PERSPECTIVE: /* check if the sprite coord flag is set for this attribute. * If so, set it up so it up so x and y vary from 0 to 1. */ - if (shader->info.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) { - const int index = shader->info.input_semantic_index[slot]; + if (shader->info.base.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) { + unsigned semantic_index = shader->info.base.input_semantic_index[slot]; /* Note that sprite_coord enable is a bitfield of * PIPE_MAX_SHADER_OUTPUTS bits. */ - if (index < PIPE_MAX_SHADER_OUTPUTS && - (setup->sprite_coord_enable & (1 << index))) { - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - texcoord_coef(setup, point, info, slot + 1, i, + if (semantic_index < PIPE_MAX_SHADER_OUTPUTS && + (setup->sprite_coord_enable & (1 << semantic_index))) { + for (i = 0; i < NUM_CHANNELS; i++) { + if (usage_mask & (1 << i)) { + texcoord_coef(setup, info, slot + 1, i, setup->sprite_coord_origin, - (usage_mask & TGSI_WRITEMASK_W)); - fragcoord_usage_mask |= TGSI_WRITEMASK_W; - break; + perspective); + } + } + break; } } - /* FALLTHROUGH */ + /* fall-through */ case LP_INTERP_CONSTANT: for (i = 0; i < NUM_CHANNELS; i++) { - if (usage_mask & (1 << i)) - constant_coef(setup, point, slot+1, info->v0[vert_attr][i], i); + if (usage_mask & (1 << i)) { + if (perspective) { + point_persp_coeff(setup, info, slot+1, i); + } + else { + constant_coef(setup, info, slot+1, info->v0[vert_attr][i], i); + } + } } break; case LP_INTERP_FACING: for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) - constant_coef(setup, point, slot+1, 1.0, i); + constant_coef(setup, info, slot+1, 1.0, i); break; default: @@ -253,7 +286,7 @@ setup_point_coefficients( struct lp_setup_context *setup, /* The internal position input is in slot zero: */ - setup_point_fragcoord_coef(setup, point, info, 0, + setup_point_fragcoord_coef(setup, info, 0, fragcoord_usage_mask); } @@ -270,6 +303,7 @@ try_setup_point( struct lp_setup_context *setup, const float (*v0)[4] ) { /* x/y positions in fixed point */ + const struct lp_setup_variant_key *key = &setup->setup.variant->key; const int sizeAttr = setup->psize; const float size = (setup->point_size_per_vertex && sizeAttr > 0) ? v0[sizeAttr][0] @@ -321,7 +355,7 @@ try_setup_point( struct lp_setup_context *setup, u_rect_find_intersection(&setup->draw_region, &bbox); point = lp_setup_alloc_triangle(scene, - setup->fs.nr_inputs, + key->num_inputs, nr_planes, &bytes); if (!point) @@ -337,40 +371,40 @@ try_setup_point( struct lp_setup_context *setup, info.dx12 = fixed_width; info.dy01 = fixed_width; info.dy12 = 0; + info.a0 = GET_A0(&point->inputs); + info.dadx = GET_DADX(&point->inputs); + info.dady = GET_DADY(&point->inputs); /* Setup parameter interpolants: */ - setup_point_coefficients(setup, point, &info); + setup_point_coefficients(setup, &info); - point->inputs.facing = 1.0F; - point->inputs.state = setup->fs.stored; + point->inputs.frontfacing = TRUE; point->inputs.disable = FALSE; point->inputs.opaque = FALSE; { - point->plane[0].dcdx = -1; - point->plane[0].dcdy = 0; - point->plane[0].c = 1-bbox.x0; - point->plane[0].ei = 0; - point->plane[0].eo = 1; - - point->plane[1].dcdx = 1; - point->plane[1].dcdy = 0; - point->plane[1].c = bbox.x1+1; - point->plane[1].ei = -1; - point->plane[1].eo = 0; - - point->plane[2].dcdx = 0; - point->plane[2].dcdy = 1; - point->plane[2].c = 1-bbox.y0; - point->plane[2].ei = 0; - point->plane[2].eo = 1; - - point->plane[3].dcdx = 0; - point->plane[3].dcdy = -1; - point->plane[3].c = bbox.y1+1; - point->plane[3].ei = -1; - point->plane[3].eo = 0; + struct lp_rast_plane *plane = GET_PLANES(point); + + plane[0].dcdx = -1; + plane[0].dcdy = 0; + plane[0].c = 1-bbox.x0; + plane[0].eo = 1; + + plane[1].dcdx = 1; + plane[1].dcdy = 0; + plane[1].c = bbox.x1+1; + plane[1].eo = 0; + + plane[2].dcdx = 0; + plane[2].dcdy = 1; + plane[2].c = 1-bbox.y0; + plane[2].eo = 1; + + plane[3].dcdx = 0; + plane[3].dcdy = -1; + plane[3].c = bbox.y1+1; + plane[3].eo = 0; } return lp_setup_bin_triangle(setup, point, &bbox, nr_planes); @@ -383,10 +417,11 @@ lp_setup_point(struct lp_setup_context *setup, { if (!try_setup_point( setup, v0 )) { - lp_setup_flush_and_restart(setup); + if (!lp_setup_flush_and_restart(setup)) + return; if (!try_setup_point( setup, v0 )) - assert(0); + return; } } diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index 9016bb8e249..4ab0b72a574 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -32,15 +32,18 @@ #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_rect.h" +#include "util/u_sse.h" #include "lp_perf.h" #include "lp_setup_context.h" -#include "lp_setup_coef.h" #include "lp_rast.h" #include "lp_state_fs.h" +#include "lp_state_setup.h" #define NUM_CHANNELS 4 - +#if defined(PIPE_ARCH_SSE) +#include <emmintrin.h> +#endif static INLINE int subpixel_snap(float a) @@ -65,7 +68,7 @@ fixed_to_float(int a) * immediately after it. * The memory is allocated from the per-scene pool, not per-tile. * \param tri_size returns number of bytes allocated - * \param nr_inputs number of fragment shader inputs + * \param num_inputs number of fragment shader inputs * \return pointer to triangle space */ struct lp_rast_triangle * @@ -75,22 +78,23 @@ lp_setup_alloc_triangle(struct lp_scene *scene, unsigned *tri_size) { unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float); + unsigned plane_sz = nr_planes * sizeof(struct lp_rast_plane); struct lp_rast_triangle *tri; - unsigned tri_bytes, bytes; - char *inputs; - tri_bytes = align(Offset(struct lp_rast_triangle, plane[nr_planes]), 16); - bytes = tri_bytes + (3 * input_array_sz); + *tri_size = (sizeof(struct lp_rast_triangle) + + 3 * input_array_sz + + plane_sz); - tri = lp_scene_alloc_aligned( scene, bytes, 16 ); + tri = lp_scene_alloc_aligned( scene, *tri_size, 16 ); + if (tri == NULL) + return NULL; - if (tri) { - inputs = ((char *)tri) + tri_bytes; - tri->inputs.a0 = (float (*)[4]) inputs; - tri->inputs.dadx = (float (*)[4]) (inputs + input_array_sz); - tri->inputs.dady = (float (*)[4]) (inputs + 2 * input_array_sz); + tri->inputs.stride = input_array_sz; - *tri_size = bytes; + { + char *a = (char *)tri; + char *b = (char *)&GET_PLANES(tri)[nr_planes]; + assert(b - a == *tri_size); } return tri; @@ -101,25 +105,26 @@ lp_setup_print_vertex(struct lp_setup_context *setup, const char *name, const float (*v)[4]) { + const struct lp_setup_variant_key *key = &setup->setup.variant->key; int i, j; debug_printf(" wpos (%s[0]) xyzw %f %f %f %f\n", name, v[0][0], v[0][1], v[0][2], v[0][3]); - for (i = 0; i < setup->fs.nr_inputs; i++) { - const float *in = v[setup->fs.input[i].src_index]; + for (i = 0; i < key->num_inputs; i++) { + const float *in = v[key->inputs[i].src_index]; debug_printf(" in[%d] (%s[%d]) %s%s%s%s ", i, - name, setup->fs.input[i].src_index, - (setup->fs.input[i].usage_mask & 0x1) ? "x" : " ", - (setup->fs.input[i].usage_mask & 0x2) ? "y" : " ", - (setup->fs.input[i].usage_mask & 0x4) ? "z" : " ", - (setup->fs.input[i].usage_mask & 0x8) ? "w" : " "); + name, key->inputs[i].src_index, + (key->inputs[i].usage_mask & 0x1) ? "x" : " ", + (key->inputs[i].usage_mask & 0x2) ? "y" : " ", + (key->inputs[i].usage_mask & 0x4) ? "z" : " ", + (key->inputs[i].usage_mask & 0x8) ? "w" : " "); for (j = 0; j < 4; j++) - if (setup->fs.input[i].usage_mask & (1<<j)) + if (key->inputs[i].usage_mask & (1<<j)) debug_printf("%.5f ", in[j]); debug_printf("\n"); @@ -200,14 +205,16 @@ lp_setup_whole_tile(struct lp_setup_context *setup, } LP_COUNT(nr_shade_opaque_64); - return lp_scene_bin_command( scene, tx, ty, - LP_RAST_OP_SHADE_TILE_OPAQUE, - lp_rast_arg_inputs(inputs) ); + return lp_scene_bin_cmd_with_state( scene, tx, ty, + setup->fs.stored, + LP_RAST_OP_SHADE_TILE_OPAQUE, + lp_rast_arg_inputs(inputs) ); } else { LP_COUNT(nr_shade_64); - return lp_scene_bin_command( scene, tx, ty, - LP_RAST_OP_SHADE_TILE, - lp_rast_arg_inputs(inputs) ); + return lp_scene_bin_cmd_with_state( scene, tx, ty, + setup->fs.stored, + LP_RAST_OP_SHADE_TILE, + lp_rast_arg_inputs(inputs) ); } } @@ -225,13 +232,13 @@ do_triangle_ccw(struct lp_setup_context *setup, boolean frontfacing ) { struct lp_scene *scene = setup->scene; + const struct lp_setup_variant_key *key = &setup->setup.variant->key; struct lp_rast_triangle *tri; - int x[3]; - int y[3]; - int area; + struct lp_rast_plane *plane; + int x[4]; + int y[4]; struct u_rect bbox; unsigned tri_bytes; - int i; int nr_planes = 3; if (0) @@ -248,10 +255,12 @@ do_triangle_ccw(struct lp_setup_context *setup, x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset); x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset); x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset); + x[3] = 0; y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset); y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset); y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset); - + y[3] = 0; + /* Bounding rectangle (in pixels) */ { @@ -289,13 +298,13 @@ do_triangle_ccw(struct lp_setup_context *setup, u_rect_find_intersection(&setup->draw_region, &bbox); tri = lp_setup_alloc_triangle(scene, - setup->fs.nr_inputs, + key->num_inputs, nr_planes, &tri_bytes); if (!tri) return FALSE; -#ifdef DEBUG +#if 0 tri->v[0][0] = v0[0][0]; tri->v[1][0] = v1[0][0]; tri->v[2][0] = v2[0][0]; @@ -304,92 +313,172 @@ do_triangle_ccw(struct lp_setup_context *setup, tri->v[2][1] = v2[0][1]; #endif - tri->plane[0].dcdy = x[0] - x[1]; - tri->plane[1].dcdy = x[1] - x[2]; - tri->plane[2].dcdy = x[2] - x[0]; - - tri->plane[0].dcdx = y[0] - y[1]; - tri->plane[1].dcdx = y[1] - y[2]; - tri->plane[2].dcdx = y[2] - y[0]; - - area = (tri->plane[0].dcdy * tri->plane[2].dcdx - - tri->plane[2].dcdy * tri->plane[0].dcdx); - LP_COUNT(nr_tris); - /* Cull non-ccw and zero-sized triangles. - * - * XXX: subject to overflow?? - */ - if (area <= 0) { - lp_scene_putback_data( scene, tri_bytes ); - LP_COUNT(nr_culled_tris); - return TRUE; - } - /* Setup parameter interpolants: */ - lp_setup_tri_coef( setup, &tri->inputs, v0, v1, v2, frontfacing ); - - tri->inputs.facing = frontfacing ? 1.0F : -1.0F; + setup->setup.variant->jit_function( v0, + v1, + v2, + frontfacing, + GET_A0(&tri->inputs), + GET_DADX(&tri->inputs), + GET_DADY(&tri->inputs) ); + + tri->inputs.frontfacing = frontfacing; tri->inputs.disable = FALSE; tri->inputs.opaque = setup->fs.current.variant->opaque; - tri->inputs.state = setup->fs.stored; - - for (i = 0; i < 3; i++) { - struct lp_rast_plane *plane = &tri->plane[i]; + if (0) + lp_dump_setup_coef(&setup->setup.variant->key, + (const float (*)[4])GET_A0(&tri->inputs), + (const float (*)[4])GET_DADX(&tri->inputs), + (const float (*)[4])GET_DADY(&tri->inputs)); + + plane = GET_PLANES(tri); + +#if defined(PIPE_ARCH_SSE) + { + __m128i vertx, verty; + __m128i shufx, shufy; + __m128i dcdx, dcdy, c; + __m128i unused; + __m128i dcdx_neg_mask; + __m128i dcdy_neg_mask; + __m128i dcdx_zero_mask; + __m128i top_left_flag; + __m128i c_inc_mask, c_inc; + __m128i eo, p0, p1, p2; + __m128i zero = _mm_setzero_si128(); + + vertx = _mm_loadu_si128((__m128i *)x); /* vertex x coords */ + verty = _mm_loadu_si128((__m128i *)y); /* vertex y coords */ + + shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1)); + shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1)); + + dcdx = _mm_sub_epi32(verty, shufy); + dcdy = _mm_sub_epi32(vertx, shufx); + + dcdx_neg_mask = _mm_srai_epi32(dcdx, 31); + dcdx_zero_mask = _mm_cmpeq_epi32(dcdx, zero); + dcdy_neg_mask = _mm_srai_epi32(dcdy, 31); + + top_left_flag = _mm_set1_epi32((setup->pixel_offset == 0) ? ~0 : 0); - /* half-edge constants, will be interated over the whole render - * target. + c_inc_mask = _mm_or_si128(dcdx_neg_mask, + _mm_and_si128(dcdx_zero_mask, + _mm_xor_si128(dcdy_neg_mask, + top_left_flag))); + + c_inc = _mm_srli_epi32(c_inc_mask, 31); + + c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx), + mm_mullo_epi32(dcdy, verty)); + + c = _mm_add_epi32(c, c_inc); + + /* Scale up to match c: */ - plane->c = plane->dcdx * x[i] - plane->dcdy * y[i]; - - /* correct for top-left vs. bottom-left fill convention. - * - * note that we're overloading gl_rasterization_rules to mean - * both (0.5,0.5) pixel centers *and* bottom-left filling - * convention. - * - * GL actually has a top-left filling convention, but GL's - * notion of "top" differs from gallium's... - * - * Also, sometimes (in FBO cases) GL will render upside down - * to its usual method, in which case it will probably want - * to use the opposite, top-left convention. - */ - if (plane->dcdx < 0) { - /* both fill conventions want this - adjust for left edges */ - plane->c++; - } - else if (plane->dcdx == 0) { - if (setup->pixel_offset == 0) { - /* correct for top-left fill convention: - */ - if (plane->dcdy > 0) plane->c++; + dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER); + dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER); + + /* Calculate trivial reject values: + */ + eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy), + _mm_and_si128(dcdx_neg_mask, dcdx)); + + /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */ + + /* Pointless transpose which gets undone immediately in + * rasterization: + */ + transpose4_epi32(&c, &dcdx, &dcdy, &eo, + &p0, &p1, &p2, &unused); + + _mm_store_si128((__m128i *)&plane[0], p0); + _mm_store_si128((__m128i *)&plane[1], p1); + _mm_store_si128((__m128i *)&plane[2], p2); + } +#else + { + int i; + plane[0].dcdy = x[0] - x[1]; + plane[1].dcdy = x[1] - x[2]; + plane[2].dcdy = x[2] - x[0]; + plane[0].dcdx = y[0] - y[1]; + plane[1].dcdx = y[1] - y[2]; + plane[2].dcdx = y[2] - y[0]; + + for (i = 0; i < 3; i++) { + /* half-edge constants, will be interated over the whole render + * target. + */ + plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i]; + + /* correct for top-left vs. bottom-left fill convention. + * + * note that we're overloading gl_rasterization_rules to mean + * both (0.5,0.5) pixel centers *and* bottom-left filling + * convention. + * + * GL actually has a top-left filling convention, but GL's + * notion of "top" differs from gallium's... + * + * Also, sometimes (in FBO cases) GL will render upside down + * to its usual method, in which case it will probably want + * to use the opposite, top-left convention. + */ + if (plane[i].dcdx < 0) { + /* both fill conventions want this - adjust for left edges */ + plane[i].c++; } - else { - /* correct for bottom-left fill convention: - */ - if (plane->dcdy < 0) plane->c++; + else if (plane[i].dcdx == 0) { + if (setup->pixel_offset == 0) { + /* correct for top-left fill convention: + */ + if (plane[i].dcdy > 0) plane[i].c++; + } + else { + /* correct for bottom-left fill convention: + */ + if (plane[i].dcdy < 0) plane[i].c++; + } } - } - plane->dcdx *= FIXED_ONE; - plane->dcdy *= FIXED_ONE; + plane[i].dcdx *= FIXED_ONE; + plane[i].dcdy *= FIXED_ONE; - /* find trivial reject offsets for each edge for a single-pixel - * sized block. These will be scaled up at each recursive level to - * match the active blocksize. Scaling in this way works best if - * the blocks are square. - */ - plane->eo = 0; - if (plane->dcdx < 0) plane->eo -= plane->dcdx; - if (plane->dcdy > 0) plane->eo += plane->dcdy; + /* find trivial reject offsets for each edge for a single-pixel + * sized block. These will be scaled up at each recursive level to + * match the active blocksize. Scaling in this way works best if + * the blocks are square. + */ + plane[i].eo = 0; + if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx; + if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy; + } + } +#endif - /* Calculate trivial accept offsets from the above. - */ - plane->ei = plane->dcdy - plane->dcdx - plane->eo; + if (0) { + debug_printf("p0: %08x/%08x/%08x/%08x\n", + plane[0].c, + plane[0].dcdx, + plane[0].dcdy, + plane[0].eo); + + debug_printf("p1: %08x/%08x/%08x/%08x\n", + plane[1].c, + plane[1].dcdx, + plane[1].dcdy, + plane[1].eo); + + debug_printf("p0: %08x/%08x/%08x/%08x\n", + plane[2].c, + plane[2].dcdx, + plane[2].dcdy, + plane[2].eo); } @@ -412,29 +501,25 @@ do_triangle_ccw(struct lp_setup_context *setup, * these planes elsewhere. */ if (nr_planes == 7) { - tri->plane[3].dcdx = -1; - tri->plane[3].dcdy = 0; - tri->plane[3].c = 1-bbox.x0; - tri->plane[3].ei = 0; - tri->plane[3].eo = 1; - - tri->plane[4].dcdx = 1; - tri->plane[4].dcdy = 0; - tri->plane[4].c = bbox.x1+1; - tri->plane[4].ei = -1; - tri->plane[4].eo = 0; - - tri->plane[5].dcdx = 0; - tri->plane[5].dcdy = 1; - tri->plane[5].c = 1-bbox.y0; - tri->plane[5].ei = 0; - tri->plane[5].eo = 1; - - tri->plane[6].dcdx = 0; - tri->plane[6].dcdy = -1; - tri->plane[6].c = bbox.y1+1; - tri->plane[6].ei = -1; - tri->plane[6].eo = 0; + plane[3].dcdx = -1; + plane[3].dcdy = 0; + plane[3].c = 1-bbox.x0; + plane[3].eo = 1; + + plane[4].dcdx = 1; + plane[4].dcdy = 0; + plane[4].c = bbox.x1+1; + plane[4].eo = 0; + + plane[5].dcdx = 0; + plane[5].dcdy = 1; + plane[5].c = 1-bbox.y0; + plane[5].eo = 1; + + plane[6].dcdx = 0; + plane[6].dcdy = -1; + plane[6].c = bbox.y1+1; + plane[6].eo = 0; } return lp_setup_bin_triangle( setup, tri, &bbox, nr_planes ); @@ -487,51 +572,58 @@ lp_setup_bin_triangle( struct lp_setup_context *setup, int sz = floor_pot((bbox->x1 - (bbox->x0 & ~3)) | (bbox->y1 - (bbox->y0 & ~3))); - if (nr_planes == 3) { - if (sz < 4 && dx < 64) - { - /* Triangle is contained in a single 4x4 stamp: - */ - int mask = (bbox->x0 & 63 & ~3) | ((bbox->y0 & 63 & ~3) << 8); - - return lp_scene_bin_command( scene, - bbox->x0/64, bbox->y0/64, - LP_RAST_OP_TRIANGLE_3_4, - lp_rast_arg_triangle(tri, mask) ); - } - - if (sz < 16 && dx < 64) - { - int mask = (bbox->x0 & 63 & ~3) | ((bbox->y0 & 63 & ~3) << 8); - - /* Triangle is contained in a single 16x16 block: - */ - return lp_scene_bin_command( scene, - bbox->x0/64, bbox->y0/64, - LP_RAST_OP_TRIANGLE_3_16, - lp_rast_arg_triangle(tri, mask) ); - } - } - - /* Determine which tile(s) intersect the triangle's bounding box */ if (dx < TILE_SIZE) { int ix0 = bbox->x0 / TILE_SIZE; int iy0 = bbox->y0 / TILE_SIZE; + int px = bbox->x0 & 63 & ~3; + int py = bbox->y0 & 63 & ~3; + int mask = px | (py << 8); assert(iy0 == bbox->y1 / TILE_SIZE && ix0 == bbox->x1 / TILE_SIZE); + if (nr_planes == 3) { + if (sz < 4) + { + /* Triangle is contained in a single 4x4 stamp: + */ + return lp_scene_bin_cmd_with_state( scene, ix0, iy0, + setup->fs.stored, + LP_RAST_OP_TRIANGLE_3_4, + lp_rast_arg_triangle(tri, mask) ); + } + + if (sz < 16) + { + /* Triangle is contained in a single 16x16 block: + */ + return lp_scene_bin_cmd_with_state( scene, ix0, iy0, + setup->fs.stored, + LP_RAST_OP_TRIANGLE_3_16, + lp_rast_arg_triangle(tri, mask) ); + } + } + else if (nr_planes == 4 && sz < 16) + { + return lp_scene_bin_cmd_with_state(scene, ix0, iy0, + setup->fs.stored, + LP_RAST_OP_TRIANGLE_4_16, + lp_rast_arg_triangle(tri, mask) ); + } + + /* Triangle is contained in a single tile: */ - return lp_scene_bin_command( scene, ix0, iy0, - lp_rast_tri_tab[nr_planes], - lp_rast_arg_triangle(tri, (1<<nr_planes)-1) ); + return lp_scene_bin_cmd_with_state( scene, ix0, iy0, setup->fs.stored, + lp_rast_tri_tab[nr_planes], + lp_rast_arg_triangle(tri, (1<<nr_planes)-1) ); } else { + struct lp_rast_plane *plane = GET_PLANES(tri); int c[MAX_PLANES]; int ei[MAX_PLANES]; int eo[MAX_PLANES]; @@ -545,14 +637,17 @@ lp_setup_bin_triangle( struct lp_setup_context *setup, int iy1 = bbox->y1 / TILE_SIZE; for (i = 0; i < nr_planes; i++) { - c[i] = (tri->plane[i].c + - tri->plane[i].dcdy * iy0 * TILE_SIZE - - tri->plane[i].dcdx * ix0 * TILE_SIZE); - - ei[i] = tri->plane[i].ei << TILE_ORDER; - eo[i] = tri->plane[i].eo << TILE_ORDER; - xstep[i] = -(tri->plane[i].dcdx << TILE_ORDER); - ystep[i] = tri->plane[i].dcdy << TILE_ORDER; + c[i] = (plane[i].c + + plane[i].dcdy * iy0 * TILE_SIZE - + plane[i].dcdx * ix0 * TILE_SIZE); + + ei[i] = (plane[i].dcdy - + plane[i].dcdx - + plane[i].eo) << TILE_ORDER; + + eo[i] = plane[i].eo << TILE_ORDER; + xstep[i] = -(plane[i].dcdx << TILE_ORDER); + ystep[i] = plane[i].dcdy << TILE_ORDER; } @@ -594,9 +689,11 @@ lp_setup_bin_triangle( struct lp_setup_context *setup, */ int count = util_bitcount(partial); in = TRUE; - if (!lp_scene_bin_command( scene, x, y, - lp_rast_tri_tab[count], - lp_rast_arg_triangle(tri, partial) )) + + if (!lp_scene_bin_cmd_with_state( scene, x, y, + setup->fs.stored, + lp_rast_tri_tab[count], + lp_rast_arg_triangle(tri, partial) )) goto fail; LP_COUNT(nr_partially_covered_64); @@ -635,40 +732,62 @@ fail: /** - * Draw triangle if it's CW, cull otherwise. + * Try to draw the triangle, restart the scene on failure. */ -static void triangle_cw( struct lp_setup_context *setup, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4] ) +static void retry_triangle_ccw( struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4], + boolean front) { - if (!do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface )) + if (!do_triangle_ccw( setup, v0, v1, v2, front )) { - lp_setup_flush_and_restart(setup); + if (!lp_setup_flush_and_restart(setup)) + return; - if (!do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface )) - assert(0); + if (!do_triangle_ccw( setup, v0, v1, v2, front )) + return; } } +static INLINE float +calc_area(const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]) +{ + float dx01 = v0[0][0] - v1[0][0]; + float dy01 = v0[0][1] - v1[0][1]; + float dx20 = v2[0][0] - v0[0][0]; + float dy20 = v2[0][1] - v0[0][1]; + return dx01 * dy20 - dx20 * dy01; +} + /** - * Draw triangle if it's CCW, cull otherwise. + * Draw triangle if it's CW, cull otherwise. */ -static void triangle_ccw( struct lp_setup_context *setup, +static void triangle_cw( struct lp_setup_context *setup, const float (*v0)[4], const float (*v1)[4], const float (*v2)[4] ) { - if (!do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface )) - { - lp_setup_flush_and_restart(setup); - if (!do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface )) - assert(0); - } + float area = calc_area(v0, v1, v2); + + if (area < 0.0f) + retry_triangle_ccw(setup, v0, v2, v1, !setup->ccw_is_frontface); } +static void triangle_ccw( struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]) +{ + float area = calc_area(v0, v1, v2); + + if (area > 0.0f) + retry_triangle_ccw(setup, v0, v1, v2, setup->ccw_is_frontface); +} /** * Draw triangle whether it's CW or CCW. @@ -678,18 +797,12 @@ static void triangle_both( struct lp_setup_context *setup, const float (*v1)[4], const float (*v2)[4] ) { - /* edge vectors e = v0 - v2, f = v1 - v2 */ - const float ex = v0[0][0] - v2[0][0]; - const float ey = v0[0][1] - v2[0][1]; - const float fx = v1[0][0] - v2[0][0]; - const float fy = v1[0][1] - v2[0][1]; - - /* det = cross(e,f).z */ - const float det = ex * fy - ey * fx; - if (det < 0.0f) - triangle_ccw( setup, v0, v1, v2 ); - else if (det > 0.0f) - triangle_cw( setup, v0, v1, v2 ); + float area = calc_area(v0, v1, v2); + + if (area > 0.0f) + retry_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface ); + else if (area < 0.0f) + retry_triangle_ccw( setup, v0, v2, v1, !setup->ccw_is_frontface ); } diff --git a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c index 6308561f242..9c1f0fe7939 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c @@ -141,7 +141,8 @@ lp_setup_draw_elements(struct vbuf_render *vbr, const ushort *indices, uint nr) const boolean flatshade_first = setup->flatshade_first; unsigned i; - lp_setup_update_state(setup, TRUE); + if (!lp_setup_update_state(setup, TRUE)) + return; switch (setup->prim) { case PIPE_PRIM_POINTS: @@ -338,7 +339,8 @@ lp_setup_draw_arrays(struct vbuf_render *vbr, uint start, uint nr) const boolean flatshade_first = setup->flatshade_first; unsigned i; - lp_setup_update_state(setup, TRUE); + if (!lp_setup_update_state(setup, TRUE)) + return; switch (setup->prim) { case PIPE_PRIM_POINTS: diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h index 86313e1c484..7893e9cdc0c 100644 --- a/src/gallium/drivers/llvmpipe/lp_state.h +++ b/src/gallium/drivers/llvmpipe/lp_state.h @@ -97,6 +97,9 @@ llvmpipe_set_framebuffer_state(struct pipe_context *, void llvmpipe_update_fs(struct llvmpipe_context *lp); +void +llvmpipe_update_setup(struct llvmpipe_context *lp); + void llvmpipe_update_derived(struct llvmpipe_context *llvmpipe); diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c index bb059d04599..0f5f7369e04 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_derived.c +++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c @@ -50,12 +50,13 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) { const struct lp_fragment_shader *lpfs = llvmpipe->fs; struct vertex_info *vinfo = &llvmpipe->vertex_info; - struct lp_shader_input *inputs = llvmpipe->inputs; unsigned vs_index; uint i; /* - * Match FS inputs against VS outputs, emitting the necessary attributes. + * Match FS inputs against VS outputs, emitting the necessary + * attributes. Could cache these structs and look them up with a + * combination of fragment shader, vertex shader ids. */ vinfo->num_attribs = 0; @@ -66,72 +67,18 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index); - for (i = 0; i < lpfs->info.num_inputs; i++) { + for (i = 0; i < lpfs->info.base.num_inputs; i++) { /* * Search for each input in current vs output: */ vs_index = draw_find_shader_output(llvmpipe->draw, - lpfs->info.input_semantic_name[i], - lpfs->info.input_semantic_index[i]); - if (vs_index < 0) { - /* - * This can happen with sprite coordinates - the vertex - * shader doesn't need to provide an output as we generate - * them internally. However, lets keep pretending that there - * is something there to not confuse other code. - */ - vs_index = 0; - } - - /* This can be pre-computed, except for flatshade: - */ - inputs[i].usage_mask = lpfs->info.input_usage_mask[i]; - - switch (lpfs->info.input_interpolate[i]) { - case TGSI_INTERPOLATE_CONSTANT: - inputs[i].interp = LP_INTERP_CONSTANT; - break; - case TGSI_INTERPOLATE_LINEAR: - inputs[i].interp = LP_INTERP_LINEAR; - break; - case TGSI_INTERPOLATE_PERSPECTIVE: - inputs[i].interp = LP_INTERP_PERSPECTIVE; - break; - default: - assert(0); - break; - } - - switch (lpfs->info.input_semantic_name[i]) { - case TGSI_SEMANTIC_FACE: - inputs[i].interp = LP_INTERP_FACING; - break; - case TGSI_SEMANTIC_POSITION: - /* Position was already emitted above - */ - inputs[i].interp = LP_INTERP_POSITION; - inputs[i].src_index = 0; - continue; - case TGSI_SEMANTIC_COLOR: - /* Colors are linearly inputs[i].interpolated in the fragment shader - * even when flatshading is active. This just tells the - * setup module to use coefficients with ddx==0 and - * ddy==0. - */ - if (llvmpipe->rasterizer->flatshade) - inputs[i].interp = LP_INTERP_CONSTANT; - break; - - default: - break; - } + lpfs->info.base.input_semantic_name[i], + lpfs->info.base.input_semantic_index[i]); /* * Emit the requested fs attribute for all but position. */ - - inputs[i].src_index = vinfo->num_attribs; draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index); } @@ -145,15 +92,8 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); } - llvmpipe->num_inputs = lpfs->info.num_inputs; - draw_compute_vertex_size(vinfo); - lp_setup_set_vertex_info(llvmpipe->setup, vinfo); - - lp_setup_set_fs_inputs(llvmpipe->setup, - inputs, - lpfs->info.num_inputs); } @@ -190,6 +130,10 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe ) LP_NEW_QUERY)) llvmpipe_update_fs( llvmpipe ); + if (llvmpipe->dirty & (LP_NEW_FS | + LP_NEW_RASTERIZER)) + llvmpipe_update_setup( llvmpipe ); + if (llvmpipe->dirty & LP_NEW_BLEND_COLOR) lp_setup_set_blend_color(llvmpipe->setup, &llvmpipe->blend_color); diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index f0a15e11b9b..9fbedac165f 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -99,74 +99,12 @@ #include <llvm-c/Analysis.h> +#include <llvm-c/BitWriter.h> static unsigned fs_no = 0; -/** - * Generate the depth /stencil test code. - */ -static void -generate_depth_stencil(LLVMBuilderRef builder, - const struct lp_fragment_shader_variant_key *key, - struct lp_type src_type, - struct lp_build_mask_context *mask, - LLVMValueRef stencil_refs[2], - LLVMValueRef src, - LLVMValueRef dst_ptr, - LLVMValueRef facing, - LLVMValueRef counter) -{ - const struct util_format_description *format_desc; - struct lp_type dst_type; - - if (!key->depth.enabled && !key->stencil[0].enabled && !key->stencil[1].enabled) - return; - - format_desc = util_format_description(key->zsbuf_format); - assert(format_desc); - - /* - * Depths are expected to be between 0 and 1, even if they are stored in - * floats. Setting these bits here will ensure that the lp_build_conv() call - * below won't try to unnecessarily clamp the incoming values. - */ - if(src_type.floating) { - src_type.sign = FALSE; - src_type.norm = TRUE; - } - else { - assert(!src_type.sign); - assert(src_type.norm); - } - - /* Pick the depth type. */ - dst_type = lp_depth_type(format_desc, src_type.width*src_type.length); - - /* FIXME: Cope with a depth test type with a different bit width. */ - assert(dst_type.width == src_type.width); - assert(dst_type.length == src_type.length); - - /* Convert fragment Z from float to integer */ - lp_build_conv(builder, src_type, dst_type, &src, 1, &src, 1); - - dst_ptr = LLVMBuildBitCast(builder, - dst_ptr, - LLVMPointerType(lp_build_vec_type(dst_type), 0), ""); - lp_build_depth_stencil_test(builder, - &key->depth, - key->stencil, - dst_type, - format_desc, - mask, - stencil_refs, - src, - dst_ptr, - facing, - counter); -} - /** * Expand the relevent bits of mask_input to a 4-dword mask for the @@ -248,6 +186,26 @@ generate_quad_mask(LLVMBuilderRef builder, } +#define EARLY_DEPTH_TEST 0x1 +#define LATE_DEPTH_TEST 0x2 +#define EARLY_DEPTH_WRITE 0x4 +#define LATE_DEPTH_WRITE 0x8 + +static int +find_output_by_semantic( const struct tgsi_shader_info *info, + unsigned semantic, + unsigned index ) +{ + int i; + + for (i = 0; i < info->num_outputs; i++) + if (info->output_semantic_name[i] == semantic && + info->output_semantic_index[i] == index) + return i; + + return -1; +} + /** * Generate the fragment shader, depth/stencil test, and alpha tests. @@ -255,14 +213,13 @@ generate_quad_mask(LLVMBuilderRef builder, * \param partial_mask if 1, do mask_input testing */ static void -generate_fs(struct llvmpipe_context *lp, - struct lp_fragment_shader *shader, +generate_fs(struct lp_fragment_shader *shader, const struct lp_fragment_shader_variant_key *key, LLVMBuilderRef builder, struct lp_type type, LLVMValueRef context_ptr, unsigned i, - const struct lp_build_interp_soa_context *interp, + struct lp_build_interp_soa_context *interp, struct lp_build_sampler_soa *sampler, LLVMValueRef *pmask, LLVMValueRef (*color)[4], @@ -272,18 +229,52 @@ generate_fs(struct llvmpipe_context *lp, LLVMValueRef mask_input, LLVMValueRef counter) { + const struct util_format_description *zs_format_desc = NULL; const struct tgsi_token *tokens = shader->base.tokens; LLVMTypeRef vec_type; LLVMValueRef consts_ptr; LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS]; - LLVMValueRef z = interp->pos[2]; + LLVMValueRef z; + LLVMValueRef zs_value = NULL; LLVMValueRef stencil_refs[2]; - struct lp_build_flow_context *flow; struct lp_build_mask_context mask; - boolean early_depth_stencil_test; + boolean simple_shader = (shader->info.base.file_count[TGSI_FILE_SAMPLER] == 0 && + shader->info.base.num_inputs < 3 && + shader->info.base.num_instructions < 8); unsigned attrib; unsigned chan; unsigned cbuf; + unsigned depth_mode; + + if (key->depth.enabled || + key->stencil[0].enabled || + key->stencil[1].enabled) { + + zs_format_desc = util_format_description(key->zsbuf_format); + assert(zs_format_desc); + + if (!shader->info.base.writes_z) { + if (key->alpha.enabled || shader->info.base.uses_kill) + /* With alpha test and kill, can do the depth test early + * and hopefully eliminate some quads. But need to do a + * special deferred depth write once the final mask value + * is known. + */ + depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE; + else + depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE; + } + else { + depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE; + } + + if (!(key->depth.enabled && key->depth.writemask) && + !(key->stencil[0].enabled && key->stencil[0].writemask)) + depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE); + } + else { + depth_mode = 0; + } assert(i < 4); @@ -294,20 +285,14 @@ generate_fs(struct llvmpipe_context *lp, consts_ptr = lp_jit_context_constants(builder, context_ptr); - flow = lp_build_flow_create(builder); - memset(outputs, 0, sizeof outputs); - lp_build_flow_scope_begin(flow); - /* Declare the color and z variables */ for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) { for(chan = 0; chan < NUM_CHANNELS; ++chan) { - color[cbuf][chan] = LLVMGetUndef(vec_type); - lp_build_flow_scope_declare(flow, &color[cbuf][chan]); + color[cbuf][chan] = lp_build_alloca(builder, vec_type, "color"); } } - lp_build_flow_scope_declare(flow, &z); /* do triangle edge testing */ if (partial_mask) { @@ -319,74 +304,126 @@ generate_fs(struct llvmpipe_context *lp, } /* 'mask' will control execution based on quad's pixel alive/killed state */ - lp_build_mask_begin(&mask, flow, type, *pmask); - - early_depth_stencil_test = - (key->depth.enabled || key->stencil[0].enabled) && - !key->alpha.enabled && - !shader->info.uses_kill && - !shader->info.writes_z; - - if (early_depth_stencil_test) - generate_depth_stencil(builder, key, - type, &mask, - stencil_refs, z, depth_ptr, facing, counter); + lp_build_mask_begin(&mask, builder, type, *pmask); + + if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader) + lp_build_mask_check(&mask); + + lp_build_interp_soa_update_pos(interp, i); + z = interp->pos[2]; + + if (depth_mode & EARLY_DEPTH_TEST) { + lp_build_depth_stencil_test(builder, + &key->depth, + key->stencil, + type, + zs_format_desc, + &mask, + stencil_refs, + z, + depth_ptr, facing, + &zs_value, + !simple_shader); + + if (depth_mode & EARLY_DEPTH_WRITE) { + lp_build_depth_write(builder, zs_format_desc, depth_ptr, zs_value); + } + } + lp_build_interp_soa_update_inputs(interp, i); + + /* Build the actual shader */ lp_build_tgsi_soa(builder, tokens, type, &mask, consts_ptr, interp->pos, interp->inputs, - outputs, sampler, &shader->info); + outputs, sampler, &shader->info.base); - /* loop over fragment shader outputs/results */ - for (attrib = 0; attrib < shader->info.num_outputs; ++attrib) { - for(chan = 0; chan < NUM_CHANNELS; ++chan) { - if(outputs[attrib][chan]) { - LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], ""); - lp_build_name(out, "output%u.%u.%c", i, attrib, "xyzw"[chan]); - - switch (shader->info.output_semantic_name[attrib]) { - case TGSI_SEMANTIC_COLOR: - { - unsigned cbuf = shader->info.output_semantic_index[attrib]; - - lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]); - - /* Alpha test */ - /* XXX: should only test the final assignment to alpha */ - if (cbuf == 0 && chan == 3 && key->alpha.enabled) { - LLVMValueRef alpha = out; - LLVMValueRef alpha_ref_value; - alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr); - alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value); - lp_build_alpha_test(builder, key->alpha.func, type, - &mask, alpha, alpha_ref_value); - } - - color[cbuf][chan] = out; - break; - } - - case TGSI_SEMANTIC_POSITION: - if(chan == 2) - z = out; - break; - } - } + + /* Alpha test */ + if (key->alpha.enabled) { + int color0 = find_output_by_semantic(&shader->info.base, + TGSI_SEMANTIC_COLOR, + 0); + + if (color0 != -1 && outputs[color0][3]) { + LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha"); + LLVMValueRef alpha_ref_value; + + alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr); + alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value); + + lp_build_alpha_test(builder, key->alpha.func, type, + &mask, alpha, alpha_ref_value, + (depth_mode & LATE_DEPTH_TEST) != 0); } } - if (!early_depth_stencil_test) - generate_depth_stencil(builder, key, - type, &mask, - stencil_refs, z, depth_ptr, facing, counter); + /* Late Z test */ + if (depth_mode & LATE_DEPTH_TEST) { + int pos0 = find_output_by_semantic(&shader->info.base, + TGSI_SEMANTIC_POSITION, + 0); + + if (pos0 != -1 && outputs[pos0][2]) { + z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z"); + } - lp_build_mask_end(&mask); + lp_build_depth_stencil_test(builder, + &key->depth, + key->stencil, + type, + zs_format_desc, + &mask, + stencil_refs, + z, + depth_ptr, facing, + &zs_value, + !simple_shader); + /* Late Z write */ + if (depth_mode & LATE_DEPTH_WRITE) { + lp_build_depth_write(builder, zs_format_desc, depth_ptr, zs_value); + } + } + else if ((depth_mode & EARLY_DEPTH_TEST) && + (depth_mode & LATE_DEPTH_WRITE)) + { + /* Need to apply a reduced mask to the depth write. Reload the + * depth value, update from zs_value with the new mask value and + * write that out. + */ + lp_build_deferred_depth_write(builder, + type, + zs_format_desc, + &mask, + depth_ptr, + zs_value); + } - lp_build_flow_scope_end(flow); - lp_build_flow_destroy(flow); + /* Color write */ + for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib) + { + if (shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR && + shader->info.base.output_semantic_index[attrib] < key->nr_cbufs) + { + unsigned cbuf = shader->info.base.output_semantic_index[attrib]; + for(chan = 0; chan < NUM_CHANNELS; ++chan) { + if(outputs[attrib][chan]) { + /* XXX: just initialize outputs to point at colors[] and + * skip this. + */ + LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], ""); + lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]); + LLVMBuildStore(builder, out, color[cbuf][chan]); + } + } + } + } - *pmask = mask.value; + if (counter) + lp_build_occlusion_count(builder, type, + lp_build_mask_value(&mask), counter); + *pmask = lp_build_mask_end(&mask); } @@ -407,10 +444,10 @@ generate_blend(const struct pipe_blend_state *blend, LLVMValueRef context_ptr, LLVMValueRef mask, LLVMValueRef *src, - LLVMValueRef dst_ptr) + LLVMValueRef dst_ptr, + boolean do_branch) { struct lp_build_context bld; - struct lp_build_flow_context *flow; struct lp_build_mask_context mask_ctx; LLVMTypeRef vec_type; LLVMValueRef const_ptr; @@ -421,10 +458,9 @@ generate_blend(const struct pipe_blend_state *blend, lp_build_context_init(&bld, builder, type); - flow = lp_build_flow_create(builder); - - /* we'll use this mask context to skip blending if all pixels are dead */ - lp_build_mask_begin(&mask_ctx, flow, type, mask); + lp_build_mask_begin(&mask_ctx, builder, type, mask); + if (do_branch) + lp_build_mask_check(&mask_ctx); vec_type = lp_build_vec_type(type); @@ -457,7 +493,6 @@ generate_blend(const struct pipe_blend_state *blend, } lp_build_mask_end(&mask_ctx); - lp_build_flow_destroy(flow); } @@ -468,13 +503,13 @@ generate_blend(const struct pipe_blend_state *blend, * 2x2 pixels. */ static void -generate_fragment(struct llvmpipe_context *lp, +generate_fragment(struct llvmpipe_screen *screen, struct lp_fragment_shader *shader, struct lp_fragment_shader_variant *variant, unsigned partial_mask) { - struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen); const struct lp_fragment_shader_variant_key *key = &variant->key; + struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; char func_name[256]; struct lp_type fs_type; struct lp_type blend_type; @@ -502,11 +537,24 @@ generate_fragment(struct llvmpipe_context *lp, LLVMValueRef blend_mask; LLVMValueRef function; LLVMValueRef facing; + const struct util_format_description *zs_format_desc; unsigned num_fs; unsigned i; unsigned chan; unsigned cbuf; + /* Adjust color input interpolation according to flatshade state: + */ + memcpy(inputs, shader->inputs, shader->info.base.num_inputs * sizeof inputs[0]); + for (i = 0; i < shader->info.base.num_inputs; i++) { + if (inputs[i].interp == LP_INTERP_COLOR) { + if (key->flatshade) + inputs[i].interp = LP_INTERP_CONSTANT; + else + inputs[i].interp = LP_INTERP_LINEAR; + } + } + /* TODO: actually pick these based on the fs and color buffer * characteristics. */ @@ -542,12 +590,12 @@ generate_fragment(struct llvmpipe_context *lp, arg_types[0] = screen->context_ptr_type; /* context */ arg_types[1] = LLVMInt32Type(); /* x */ arg_types[2] = LLVMInt32Type(); /* y */ - arg_types[3] = LLVMFloatType(); /* facing */ + arg_types[3] = LLVMInt32Type(); /* facing */ arg_types[4] = LLVMPointerType(fs_elem_type, 0); /* a0 */ arg_types[5] = LLVMPointerType(fs_elem_type, 0); /* dadx */ arg_types[6] = LLVMPointerType(fs_elem_type, 0); /* dady */ arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0); /* color */ - arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */ + arg_types[8] = LLVMPointerType(LLVMInt8Type(), 0); /* depth */ arg_types[9] = LLVMInt32Type(); /* mask_input */ arg_types[10] = LLVMPointerType(LLVMInt32Type(), 0);/* counter */ @@ -558,7 +606,6 @@ generate_fragment(struct llvmpipe_context *lp, variant->function[partial_mask] = function; - /* XXX: need to propagate noalias down into color param now we are * passing a pointer-to-pointer? */ @@ -606,8 +653,8 @@ generate_fragment(struct llvmpipe_context *lp, * already included in the shader key. */ lp_build_interp_soa_init(&interp, - lp->num_inputs, - lp->inputs, + shader->info.base.num_inputs, + inputs, builder, fs_type, a0_ptr, dadx_ptr, dady_ptr, x, y); @@ -616,17 +663,18 @@ generate_fragment(struct llvmpipe_context *lp, sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr); /* loop over quads in the block */ + zs_format_desc = util_format_description(key->zsbuf_format); + for(i = 0; i < num_fs; ++i) { - LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); + LLVMValueRef depth_offset = LLVMConstInt(LLVMInt32Type(), + i*fs_type.length*zs_format_desc->block.bits/8, + 0); LLVMValueRef out_color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS]; LLVMValueRef depth_ptr_i; - if(i != 0) - lp_build_interp_soa_update(&interp, i); + depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &depth_offset, 1, ""); - depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, ""); - - generate_fs(lp, shader, key, + generate_fs(shader, key, builder, fs_type, context_ptr, @@ -660,9 +708,18 @@ generate_fragment(struct llvmpipe_context *lp, * Convert the fs's output color and mask to fit to the blending type. */ for(chan = 0; chan < NUM_CHANNELS; ++chan) { + LLVMValueRef fs_color_vals[LP_MAX_VECTOR_LENGTH]; + + for (i = 0; i < num_fs; i++) { + fs_color_vals[i] = + LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals"); + } + lp_build_conv(builder, fs_type, blend_type, - fs_out_color[cbuf][chan], num_fs, + fs_color_vals, + num_fs, &blend_in_color[chan], 1); + lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]); } @@ -685,14 +742,23 @@ generate_fragment(struct llvmpipe_context *lp, /* * Blending. */ - generate_blend(&key->blend, - rt, - builder, - blend_type, - context_ptr, - blend_mask, - blend_in_color, - color_ptr); + { + /* Could the 4x4 have been killed? + */ + boolean do_branch = ((key->depth.enabled || key->stencil[0].enabled) && + !key->alpha.enabled && + !shader->info.base.uses_kill); + + generate_blend(&key->blend, + rt, + builder, + blend_type, + context_ptr, + blend_mask, + blend_in_color, + color_ptr, + do_branch); + } } #ifdef PIPE_ARCH_X86 @@ -717,12 +783,17 @@ generate_fragment(struct llvmpipe_context *lp, /* Apply optimizations to LLVM IR */ LLVMRunFunctionPassManager(screen->pass, function); - if (gallivm_debug & GALLIVM_DEBUG_IR) { + if ((gallivm_debug & GALLIVM_DEBUG_IR) || (LP_DEBUG & DEBUG_FS)) { /* Print the LLVM IR to stderr */ lp_debug_dump_value(function); debug_printf("\n"); } + /* Dump byte code to a file */ + if (0) { + LLVMWriteBitcodeToFile(lp_build_module, "llvmpipe.bc"); + } + /* * Translate the LLVM IR into machine code. */ @@ -731,7 +802,7 @@ generate_fragment(struct llvmpipe_context *lp, variant->jit_function[partial_mask] = (lp_jit_frag_func)pointer_to_func(f); - if (gallivm_debug & GALLIVM_DEBUG_ASM) { + if ((gallivm_debug & GALLIVM_DEBUG_ASM) || (LP_DEBUG & DEBUG_FS)) { lp_disassemble(f); } lp_func_delete_body(function); @@ -746,6 +817,9 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) debug_printf("fs variant %p:\n", (void *) key); + if (key->flatshade) { + debug_printf("flatshade = 1\n"); + } for (i = 0; i < key->nr_cbufs; ++i) { debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i])); } @@ -770,6 +844,10 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE)); } + if (key->occlusion_count) { + debug_printf("occlusion_count = 1\n"); + } + if (key->blend.logicop_enable) { debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE)); } @@ -782,31 +860,33 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE)); } debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask); - for (i = 0; i < PIPE_MAX_SAMPLERS; ++i) { - if (key->sampler[i].format) { - debug_printf("sampler[%u] = \n", i); - debug_printf(" .format = %s\n", - util_format_name(key->sampler[i].format)); - debug_printf(" .target = %s\n", - util_dump_tex_target(key->sampler[i].target, TRUE)); - debug_printf(" .pot = %u %u %u\n", - key->sampler[i].pot_width, - key->sampler[i].pot_height, - key->sampler[i].pot_depth); - debug_printf(" .wrap = %s %s %s\n", - util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE), - util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE), - util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE)); - debug_printf(" .min_img_filter = %s\n", - util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE)); - debug_printf(" .min_mip_filter = %s\n", - util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE)); - debug_printf(" .mag_img_filter = %s\n", - util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE)); - if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE) - debug_printf(" .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE)); - debug_printf(" .normalized_coords = %u\n", key->sampler[i].normalized_coords); - } + for (i = 0; i < key->nr_samplers; ++i) { + debug_printf("sampler[%u] = \n", i); + debug_printf(" .format = %s\n", + util_format_name(key->sampler[i].format)); + debug_printf(" .target = %s\n", + util_dump_tex_target(key->sampler[i].target, TRUE)); + debug_printf(" .pot = %u %u %u\n", + key->sampler[i].pot_width, + key->sampler[i].pot_height, + key->sampler[i].pot_depth); + debug_printf(" .wrap = %s %s %s\n", + util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE), + util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE), + util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE)); + debug_printf(" .min_img_filter = %s\n", + util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE)); + debug_printf(" .min_mip_filter = %s\n", + util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE)); + debug_printf(" .mag_img_filter = %s\n", + util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE)); + if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE) + debug_printf(" .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE)); + debug_printf(" .normalized_coords = %u\n", key->sampler[i].normalized_coords); + debug_printf(" .min_max_lod_equal = %u\n", key->sampler[i].min_max_lod_equal); + debug_printf(" .lod_bias_non_zero = %u\n", key->sampler[i].lod_bias_non_zero); + debug_printf(" .apply_min_lod = %u\n", key->sampler[i].apply_min_lod); + debug_printf(" .apply_max_lod = %u\n", key->sampler[i].apply_max_lod); } } @@ -823,7 +903,7 @@ lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant) } static struct lp_fragment_shader_variant * -generate_variant(struct llvmpipe_context *lp, +generate_variant(struct llvmpipe_screen *screen, struct lp_fragment_shader *shader, const struct lp_fragment_shader_variant_key *key) { @@ -861,7 +941,7 @@ generate_variant(struct llvmpipe_context *lp, !key->stencil[0].enabled && !key->alpha.enabled && !key->depth.enabled && - !shader->info.uses_kill + !shader->info.base.uses_kill ? TRUE : FALSE; @@ -869,11 +949,11 @@ generate_variant(struct llvmpipe_context *lp, lp_debug_fs_variant(variant); } - generate_fragment(lp, shader, variant, RAST_EDGE_TEST); + generate_fragment(screen, shader, variant, RAST_EDGE_TEST); if (variant->opaque) { /* Specialized shader, which doesn't need to read the color buffer. */ - generate_fragment(lp, shader, variant, RAST_WHOLE); + generate_fragment(screen, shader, variant, RAST_WHOLE); } else { variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST]; } @@ -889,6 +969,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe, struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); struct lp_fragment_shader *shader; int nr_samplers; + int i; shader = CALLOC_STRUCT(lp_fragment_shader); if (!shader) @@ -898,7 +979,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe, make_empty_list(&shader->variants); /* get/save the summary info for this shader */ - tgsi_scan_shader(templ->tokens, &shader->info); + lp_build_tgsi_info(templ->tokens, &shader->info); /* we need to keep a local copy of the tokens */ shader->base.tokens = tgsi_dup_tokens(templ->tokens); @@ -910,18 +991,58 @@ llvmpipe_create_fs_state(struct pipe_context *pipe, return NULL; } - nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1; + nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1; shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key, sampler[nr_samplers]); + for (i = 0; i < shader->info.base.num_inputs; i++) { + shader->inputs[i].usage_mask = shader->info.base.input_usage_mask[i]; + + switch (shader->info.base.input_interpolate[i]) { + case TGSI_INTERPOLATE_CONSTANT: + shader->inputs[i].interp = LP_INTERP_CONSTANT; + break; + case TGSI_INTERPOLATE_LINEAR: + shader->inputs[i].interp = LP_INTERP_LINEAR; + break; + case TGSI_INTERPOLATE_PERSPECTIVE: + shader->inputs[i].interp = LP_INTERP_PERSPECTIVE; + break; + default: + assert(0); + break; + } + + switch (shader->info.base.input_semantic_name[i]) { + case TGSI_SEMANTIC_COLOR: + /* Colors may be either linearly or constant interpolated in + * the fragment shader, but that information isn't available + * here. Mark color inputs and fix them up later. + */ + shader->inputs[i].interp = LP_INTERP_COLOR; + break; + case TGSI_SEMANTIC_FACE: + shader->inputs[i].interp = LP_INTERP_FACING; + break; + case TGSI_SEMANTIC_POSITION: + /* Position was already emitted above + */ + shader->inputs[i].interp = LP_INTERP_POSITION; + shader->inputs[i].src_index = 0; + continue; + } + + shader->inputs[i].src_index = i+1; + } + if (LP_DEBUG & DEBUG_TGSI) { unsigned attrib; debug_printf("llvmpipe: Create fragment shader #%u %p:\n", shader->no, (void *) shader); tgsi_dump(templ->tokens, 0); debug_printf("usage masks:\n"); - for (attrib = 0; attrib < shader->info.num_inputs; ++attrib) { - unsigned usage_mask = shader->info.input_usage_mask[attrib]; + for (attrib = 0; attrib < shader->info.base.num_inputs; ++attrib) { + unsigned usage_mask = shader->info.base.input_usage_mask[attrib]; debug_printf(" IN[%u].%s%s%s%s\n", attrib, usage_mask & TGSI_WRITEMASK_X ? "x" : "", @@ -1150,10 +1271,10 @@ make_variant_key(struct llvmpipe_context *lp, /* This value will be the same for all the variants of a given shader: */ - key->nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1; + key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1; for(i = 0; i < key->nr_samplers; ++i) { - if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { + if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { lp_sampler_static_state(&key->sampler[i], lp->fragment_sampler_views[i], lp->sampler[i]); @@ -1168,6 +1289,7 @@ make_variant_key(struct llvmpipe_context *lp, void llvmpipe_update_fs(struct llvmpipe_context *lp) { + struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen); struct lp_fragment_shader *shader = lp->fs; struct lp_fragment_shader_variant_key key; struct lp_fragment_shader_variant *variant = NULL; @@ -1208,7 +1330,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp) } t0 = os_time_get(); - variant = generate_variant(lp, shader, &key); + variant = generate_variant(screen, shader, &key); t1 = os_time_get(); dt = t1 - t0; @@ -1228,6 +1350,10 @@ llvmpipe_update_fs(struct llvmpipe_context *lp) + + + + void llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe) { diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h index 4999b8dca1a..7d58c4936c7 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.h +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h @@ -34,6 +34,8 @@ #include "pipe/p_state.h" #include "tgsi/tgsi_scan.h" /* for tgsi_shader_info */ #include "gallivm/lp_bld_sample.h" /* for struct lp_sampler_static_state */ +#include "gallivm/lp_bld_tgsi.h" /* for lp_tgsi_info */ +#include "lp_bld_interp.h" /* for struct lp_shader_input */ struct tgsi_token; @@ -96,7 +98,7 @@ struct lp_fragment_shader { struct pipe_shader_state base; - struct tgsi_shader_info info; + struct lp_tgsi_info info; struct lp_fs_variant_list_item variants; @@ -107,6 +109,9 @@ struct lp_fragment_shader unsigned no; unsigned variants_created; unsigned variants_cached; + + /** Fragment shader input interpolation info */ + struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; }; diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c index 17a4a0ed02d..1dd866195d3 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c +++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c @@ -246,9 +246,9 @@ llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *lp, struct pipe_sampler_view **views) { unsigned i; - uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS]; - uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS]; - const void *data[DRAW_MAX_TEXTURE_LEVELS]; + uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS]; + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS]; + const void *data[PIPE_MAX_TEXTURE_LEVELS]; assert(num <= PIPE_MAX_VERTEX_SAMPLERS); if (!num) diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c new file mode 100644 index 00000000000..2c8b8b9a928 --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c @@ -0,0 +1,759 @@ +/************************************************************************** + * + * Copyright 2010 VMware. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_simple_list.h" +#include "os/os_time.h" +#include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_intr.h" +#include <llvm-c/Analysis.h> /* for LLVMVerifyFunction */ + +#include "lp_perf.h" +#include "lp_debug.h" +#include "lp_flush.h" +#include "lp_screen.h" +#include "lp_context.h" +#include "lp_setup_context.h" +#include "lp_rast.h" +#include "lp_state.h" +#include "lp_state_fs.h" +#include "lp_state_setup.h" + + + +/* currently organized to interpolate full float[4] attributes even + * when some elements are unused. Later, can pack vertex data more + * closely. + */ + + +struct lp_setup_args +{ + /* Function arguments: + */ + LLVMValueRef v0; + LLVMValueRef v1; + LLVMValueRef v2; + LLVMValueRef facing; /* boolean */ + LLVMValueRef a0; + LLVMValueRef dadx; + LLVMValueRef dady; + + /* Derived: + */ + LLVMValueRef x0_center; + LLVMValueRef y0_center; + LLVMValueRef dy20_ooa; + LLVMValueRef dy01_ooa; + LLVMValueRef dx20_ooa; + LLVMValueRef dx01_ooa; +}; + +static LLVMTypeRef type4f(void) +{ + return LLVMVectorType(LLVMFloatType(), 4); +} + + +/* Equivalent of _mm_setr_ps(a,b,c,d) + */ +static LLVMValueRef vec4f(LLVMBuilderRef bld, + LLVMValueRef a, LLVMValueRef b, LLVMValueRef c, LLVMValueRef d, + const char *name) +{ + LLVMValueRef i0 = LLVMConstInt(LLVMInt32Type(), 0, 0); + LLVMValueRef i1 = LLVMConstInt(LLVMInt32Type(), 1, 0); + LLVMValueRef i2 = LLVMConstInt(LLVMInt32Type(), 2, 0); + LLVMValueRef i3 = LLVMConstInt(LLVMInt32Type(), 3, 0); + + LLVMValueRef res = LLVMGetUndef(type4f()); + + res = LLVMBuildInsertElement(bld, res, a, i0, ""); + res = LLVMBuildInsertElement(bld, res, b, i1, ""); + res = LLVMBuildInsertElement(bld, res, c, i2, ""); + res = LLVMBuildInsertElement(bld, res, d, i3, name); + + return res; +} + +/* Equivalent of _mm_set1_ps(a) + */ +static LLVMValueRef vec4f_from_scalar(LLVMBuilderRef bld, + LLVMValueRef a, + const char *name) +{ + LLVMValueRef res = LLVMGetUndef(type4f()); + int i; + + for(i = 0; i < 4; ++i) { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); + res = LLVMBuildInsertElement(bld, res, a, index, i == 3 ? name : ""); + } + + return res; +} + +static void +store_coef(LLVMBuilderRef builder, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef a0, + LLVMValueRef dadx, + LLVMValueRef dady) +{ + LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), slot, 0); + + LLVMBuildStore(builder, + a0, + LLVMBuildGEP(builder, args->a0, &idx, 1, "")); + + LLVMBuildStore(builder, + dadx, + LLVMBuildGEP(builder, args->dadx, &idx, 1, "")); + + LLVMBuildStore(builder, + dady, + LLVMBuildGEP(builder, args->dady, &idx, 1, "")); +} + + + +static void +emit_constant_coef4( LLVMBuilderRef builder, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef vert, + unsigned attr) +{ + LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0); + LLVMValueRef zerovec = vec4f_from_scalar(builder, zero, "zero"); + LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), attr, 0); + LLVMValueRef attr_ptr = LLVMBuildGEP(builder, vert, &idx, 1, "attr_ptr"); + LLVMValueRef vert_attr = LLVMBuildLoad(builder, attr_ptr, "vert_attr"); + + store_coef(builder, args, slot, vert_attr, zerovec, zerovec); +} + + + +/** + * Setup the fragment input attribute with the front-facing value. + * \param frontface is the triangle front facing? + */ +static void +emit_facing_coef( LLVMBuilderRef builder, + struct lp_setup_args *args, + unsigned slot ) +{ + LLVMValueRef a0_0 = args->facing; + LLVMValueRef a0_0f = LLVMBuildSIToFP(builder, a0_0, LLVMFloatType(), ""); + LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0); + LLVMValueRef a0 = vec4f(builder, a0_0f, zero, zero, zero, "facing"); + LLVMValueRef zerovec = vec4f_from_scalar(builder, zero, "zero"); + + store_coef(builder, args, slot, a0, zerovec, zerovec); +} + + +static LLVMValueRef +vert_attrib(LLVMBuilderRef b, + LLVMValueRef vert, + int attr, + int elem, + const char *name) +{ + LLVMValueRef idx[2]; + idx[0] = LLVMConstInt(LLVMInt32Type(), attr, 0); + idx[1] = LLVMConstInt(LLVMInt32Type(), elem, 0); + return LLVMBuildLoad(b, LLVMBuildGEP(b, vert, idx, 2, ""), name); +} + + + +static void +emit_coef4( LLVMBuilderRef b, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef a0, + LLVMValueRef a1, + LLVMValueRef a2) +{ + LLVMValueRef dy20_ooa = args->dy20_ooa; + LLVMValueRef dy01_ooa = args->dy01_ooa; + LLVMValueRef dx20_ooa = args->dx20_ooa; + LLVMValueRef dx01_ooa = args->dx01_ooa; + LLVMValueRef x0_center = args->x0_center; + LLVMValueRef y0_center = args->y0_center; + + /* XXX: using fsub, fmul on vector types -- does this work?? + */ + LLVMValueRef da01 = LLVMBuildFSub(b, a0, a1, "da01"); + LLVMValueRef da20 = LLVMBuildFSub(b, a2, a0, "da20"); + + /* Calculate dadx (vec4f) + */ + LLVMValueRef da01_dy20_ooa = LLVMBuildFMul(b, da01, dy20_ooa, "da01_dy20_ooa"); + LLVMValueRef da20_dy01_ooa = LLVMBuildFMul(b, da20, dy01_ooa, "da20_dy01_ooa"); + LLVMValueRef dadx = LLVMBuildFSub(b, da01_dy20_ooa, da20_dy01_ooa, "dadx"); + + /* Calculate dady (vec4f) + */ + LLVMValueRef da01_dx20_ooa = LLVMBuildFMul(b, da01, dx20_ooa, "da01_dx20_ooa"); + LLVMValueRef da20_dx01_ooa = LLVMBuildFMul(b, da20, dx01_ooa, "da20_dx01_ooa"); + LLVMValueRef dady = LLVMBuildFSub(b, da20_dx01_ooa, da01_dx20_ooa, "dady"); + + /* Calculate a0 - the attribute value at the origin + */ + LLVMValueRef dadx_x0 = LLVMBuildFMul(b, dadx, x0_center, "dadx_x0"); + LLVMValueRef dady_y0 = LLVMBuildFMul(b, dady, y0_center, "dady_y0"); + LLVMValueRef attr_v0 = LLVMBuildFAdd(b, dadx_x0, dady_y0, "attr_v0"); + LLVMValueRef attr_0 = LLVMBuildFSub(b, a0, attr_v0, "attr_0"); + + store_coef(b, args, slot, attr_0, dadx, dady); +} + + +static void +emit_linear_coef( LLVMBuilderRef b, + struct lp_setup_args *args, + unsigned slot, + unsigned vert_attr) +{ + LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), vert_attr, 0); + + LLVMValueRef a0 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a"); + LLVMValueRef a1 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a"); + LLVMValueRef a2 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a"); + + emit_coef4(b, args, slot, a0, a1, a2); +} + + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a triangle. + * We basically multiply the vertex value by 1/w before computing + * the plane coefficients (a0, dadx, dady). + * Later, when we compute the value at a particular fragment position we'll + * divide the interpolated value by the interpolated W at that fragment. + */ +static void +emit_perspective_coef( LLVMBuilderRef b, + struct lp_setup_args *args, + unsigned slot, + unsigned vert_attr) +{ + /* premultiply by 1/w (v[0][3] is always 1/w): + */ + LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), vert_attr, 0); + + LLVMValueRef v0a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a"); + LLVMValueRef v1a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a"); + LLVMValueRef v2a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a"); + + LLVMValueRef v0_oow = vec4f_from_scalar(b, vert_attrib(b, args->v0, 0, 3, ""), "v0_oow"); + LLVMValueRef v1_oow = vec4f_from_scalar(b, vert_attrib(b, args->v1, 0, 3, ""), "v1_oow"); + LLVMValueRef v2_oow = vec4f_from_scalar(b, vert_attrib(b, args->v2, 0, 3, ""), "v2_oow"); + + LLVMValueRef v0_oow_v0a = LLVMBuildFMul(b, v0a, v0_oow, "v0_oow_v0a"); + LLVMValueRef v1_oow_v1a = LLVMBuildFMul(b, v1a, v1_oow, "v1_oow_v1a"); + LLVMValueRef v2_oow_v2a = LLVMBuildFMul(b, v2a, v2_oow, "v2_oow_v2a"); + + emit_coef4(b, args, slot, v0_oow_v0a, v1_oow_v1a, v2_oow_v2a); +} + + +static void +emit_position_coef( LLVMBuilderRef builder, + struct lp_setup_args *args, + int slot, int attrib ) +{ + emit_linear_coef(builder, args, slot, attrib); +} + + + + +/** + * Compute the inputs-> dadx, dady, a0 values. + */ +static void +emit_tri_coef( LLVMBuilderRef builder, + const struct lp_setup_variant_key *key, + struct lp_setup_args *args ) +{ + unsigned slot; + + /* The internal position input is in slot zero: + */ + emit_position_coef(builder, args, 0, 0); + + /* setup interpolation for all the remaining attributes: + */ + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned vert_attr = key->inputs[slot].src_index; + + switch (key->inputs[slot].interp) { + case LP_INTERP_CONSTANT: + if (key->flatshade_first) { + emit_constant_coef4(builder, args, slot+1, args->v0, vert_attr); + } + else { + emit_constant_coef4(builder, args, slot+1, args->v2, vert_attr); + } + break; + + case LP_INTERP_LINEAR: + emit_linear_coef(builder, args, slot+1, vert_attr); + break; + + case LP_INTERP_PERSPECTIVE: + emit_perspective_coef(builder, args, slot+1, vert_attr); + break; + + case LP_INTERP_POSITION: + /* + * The generated pixel interpolators will pick up the coeffs from + * slot 0. + */ + break; + + case LP_INTERP_FACING: + emit_facing_coef(builder, args, slot+1); + break; + + default: + assert(0); + } + } +} + + +/* XXX: This is generic code, share with fs/vs codegen: + */ +static lp_jit_setup_triangle +finalize_function(struct llvmpipe_screen *screen, + LLVMBuilderRef builder, + LLVMValueRef function) +{ + void *f; + + /* Verify the LLVM IR. If invalid, dump and abort */ +#ifdef DEBUG + if (LLVMVerifyFunction(function, LLVMPrintMessageAction)) { + if (1) + lp_debug_dump_value(function); + abort(); + } +#endif + + /* Apply optimizations to LLVM IR */ + LLVMRunFunctionPassManager(screen->pass, function); + + if (gallivm_debug & GALLIVM_DEBUG_IR) + { + /* Print the LLVM IR to stderr */ + lp_debug_dump_value(function); + debug_printf("\n"); + } + + /* + * Translate the LLVM IR into machine code. + */ + f = LLVMGetPointerToGlobal(screen->engine, function); + + if (gallivm_debug & GALLIVM_DEBUG_ASM) + { + lp_disassemble(f); + } + + lp_func_delete_body(function); + + return f; +} + +/* XXX: Generic code: + */ +static void +lp_emit_emms(LLVMBuilderRef builder) +{ +#ifdef PIPE_ARCH_X86 + /* Avoid corrupting the FPU stack on 32bit OSes. */ + lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0); +#endif +} + + +/* XXX: generic code: + */ +static void +set_noalias(LLVMBuilderRef builder, + LLVMValueRef function, + const LLVMTypeRef *arg_types, + int nr_args) +{ + int i; + for(i = 0; i < Elements(arg_types); ++i) + if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) + LLVMAddAttribute(LLVMGetParam(function, i), + LLVMNoAliasAttribute); +} + +static void +init_args(LLVMBuilderRef b, + struct lp_setup_args *args, + const struct lp_setup_variant *variant) +{ + LLVMValueRef v0_x = vert_attrib(b, args->v0, 0, 0, "v0_x"); + LLVMValueRef v0_y = vert_attrib(b, args->v0, 0, 1, "v0_y"); + + LLVMValueRef v1_x = vert_attrib(b, args->v1, 0, 0, "v1_x"); + LLVMValueRef v1_y = vert_attrib(b, args->v1, 0, 1, "v1_y"); + + LLVMValueRef v2_x = vert_attrib(b, args->v2, 0, 0, "v2_x"); + LLVMValueRef v2_y = vert_attrib(b, args->v2, 0, 1, "v2_y"); + + LLVMValueRef pixel_center = LLVMConstReal(LLVMFloatType(), + variant->key.pixel_center_half ? 0.5 : 0); + + LLVMValueRef x0_center = LLVMBuildFSub(b, v0_x, pixel_center, "x0_center" ); + LLVMValueRef y0_center = LLVMBuildFSub(b, v0_y, pixel_center, "y0_center" ); + + LLVMValueRef dx01 = LLVMBuildFSub(b, v0_x, v1_x, "dx01"); + LLVMValueRef dy01 = LLVMBuildFSub(b, v0_y, v1_y, "dy01"); + LLVMValueRef dx20 = LLVMBuildFSub(b, v2_x, v0_x, "dx20"); + LLVMValueRef dy20 = LLVMBuildFSub(b, v2_y, v0_y, "dy20"); + + LLVMValueRef one = LLVMConstReal(LLVMFloatType(), 1.0); + LLVMValueRef e = LLVMBuildFMul(b, dx01, dy20, "e"); + LLVMValueRef f = LLVMBuildFMul(b, dx20, dy01, "f"); + LLVMValueRef ooa = LLVMBuildFDiv(b, one, LLVMBuildFSub(b, e, f, ""), "ooa"); + + LLVMValueRef dy20_ooa = LLVMBuildFMul(b, dy20, ooa, "dy20_ooa"); + LLVMValueRef dy01_ooa = LLVMBuildFMul(b, dy01, ooa, "dy01_ooa"); + LLVMValueRef dx20_ooa = LLVMBuildFMul(b, dx20, ooa, "dx20_ooa"); + LLVMValueRef dx01_ooa = LLVMBuildFMul(b, dx01, ooa, "dx01_ooa"); + + args->dy20_ooa = vec4f_from_scalar(b, dy20_ooa, "dy20_ooa_4f"); + args->dy01_ooa = vec4f_from_scalar(b, dy01_ooa, "dy01_ooa_4f"); + + args->dx20_ooa = vec4f_from_scalar(b, dx20_ooa, "dx20_ooa_4f"); + args->dx01_ooa = vec4f_from_scalar(b, dx01_ooa, "dx01_ooa_4f"); + + args->x0_center = vec4f_from_scalar(b, x0_center, "x0_center_4f"); + args->y0_center = vec4f_from_scalar(b, y0_center, "y0_center_4f"); +} + +/** + * Generate the runtime callable function for the coefficient calculation. + * + */ +static struct lp_setup_variant * +generate_setup_variant(struct llvmpipe_screen *screen, + struct lp_setup_variant_key *key) +{ + struct lp_setup_variant *variant = NULL; + struct lp_setup_args args; + char func_name[256]; + LLVMTypeRef vec4f_type; + LLVMTypeRef func_type; + LLVMTypeRef arg_types[7]; + LLVMBasicBlockRef block; + LLVMBuilderRef builder; + int64_t t0, t1; + + if (0) + goto fail; + + variant = CALLOC_STRUCT(lp_setup_variant); + if (variant == NULL) + goto fail; + + if (LP_DEBUG & DEBUG_COUNTERS) { + t0 = os_time_get(); + } + + memcpy(&variant->key, key, key->size); + variant->list_item_global.base = variant; + + util_snprintf(func_name, sizeof(func_name), "fs%u_setup%u", + 0, + variant->no); + + /* Currently always deal with full 4-wide vertex attributes from + * the vertices. + */ + + vec4f_type = LLVMVectorType(LLVMFloatType(), 4); + + arg_types[0] = LLVMPointerType(vec4f_type, 0); /* v0 */ + arg_types[1] = LLVMPointerType(vec4f_type, 0); /* v1 */ + arg_types[2] = LLVMPointerType(vec4f_type, 0); /* v2 */ + arg_types[3] = LLVMInt32Type(); /* facing */ + arg_types[4] = LLVMPointerType(vec4f_type, 0); /* a0, aligned */ + arg_types[5] = LLVMPointerType(vec4f_type, 0); /* dadx, aligned */ + arg_types[6] = LLVMPointerType(vec4f_type, 0); /* dady, aligned */ + + func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0); + + variant->function = LLVMAddFunction(screen->module, func_name, func_type); + if (!variant->function) + goto fail; + + LLVMSetFunctionCallConv(variant->function, LLVMCCallConv); + + args.v0 = LLVMGetParam(variant->function, 0); + args.v1 = LLVMGetParam(variant->function, 1); + args.v2 = LLVMGetParam(variant->function, 2); + args.facing = LLVMGetParam(variant->function, 3); + args.a0 = LLVMGetParam(variant->function, 4); + args.dadx = LLVMGetParam(variant->function, 5); + args.dady = LLVMGetParam(variant->function, 6); + + lp_build_name(args.v0, "in_v0"); + lp_build_name(args.v1, "in_v1"); + lp_build_name(args.v2, "in_v2"); + lp_build_name(args.facing, "in_facing"); + lp_build_name(args.a0, "out_a0"); + lp_build_name(args.dadx, "out_dadx"); + lp_build_name(args.dady, "out_dady"); + + /* + * Function body + */ + block = LLVMAppendBasicBlock(variant->function, "entry"); + builder = LLVMCreateBuilder(); + LLVMPositionBuilderAtEnd(builder, block); + + set_noalias(builder, variant->function, arg_types, Elements(arg_types)); + init_args(builder, &args, variant); + emit_tri_coef(builder, &variant->key, &args); + + lp_emit_emms(builder); + LLVMBuildRetVoid(builder); + LLVMDisposeBuilder(builder); + + variant->jit_function = finalize_function(screen, builder, + variant->function); + if (!variant->jit_function) + goto fail; + + /* + * Update timing information: + */ + if (LP_DEBUG & DEBUG_COUNTERS) { + t1 = os_time_get(); + LP_COUNT_ADD(llvm_compile_time, t1 - t0); + LP_COUNT_ADD(nr_llvm_compiles, 1); + } + + return variant; + +fail: + if (variant) { + if (variant->function) { + if (variant->jit_function) + LLVMFreeMachineCodeForFunction(screen->engine, + variant->function); + LLVMDeleteFunction(variant->function); + } + FREE(variant); + } + + return NULL; +} + + + +static void +lp_make_setup_variant_key(struct llvmpipe_context *lp, + struct lp_setup_variant_key *key) +{ + struct lp_fragment_shader *fs = lp->fs; + unsigned i; + + assert(sizeof key->inputs[0] == sizeof(ushort)); + + key->num_inputs = fs->info.base.num_inputs; + key->flatshade_first = lp->rasterizer->flatshade_first; + key->pixel_center_half = lp->rasterizer->gl_rasterization_rules; + key->size = Offset(struct lp_setup_variant_key, + inputs[key->num_inputs]); + key->pad = 0; + + memcpy(key->inputs, fs->inputs, key->num_inputs * sizeof key->inputs[0]); + for (i = 0; i < key->num_inputs; i++) { + if (key->inputs[i].interp == LP_INTERP_COLOR) { + if (lp->rasterizer->flatshade) + key->inputs[i].interp = LP_INTERP_CONSTANT; + else + key->inputs[i].interp = LP_INTERP_LINEAR; + } + } + +} + + +static void +remove_setup_variant(struct llvmpipe_context *lp, + struct lp_setup_variant *variant) +{ + struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen); + + if (gallivm_debug & GALLIVM_DEBUG_IR) { + debug_printf("llvmpipe: del setup_variant #%u total %u\n", + variant->no, lp->nr_setup_variants); + } + + if (variant->function) { + if (variant->jit_function) + LLVMFreeMachineCodeForFunction(screen->engine, + variant->function); + LLVMDeleteFunction(variant->function); + } + + remove_from_list(&variant->list_item_global); + lp->nr_setup_variants--; + FREE(variant); +} + + + +/* When the number of setup variants exceeds a threshold, cull a + * fraction (currently a quarter) of them. + */ +static void +cull_setup_variants(struct llvmpipe_context *lp) +{ + struct pipe_context *pipe = &lp->pipe; + int i; + + /* + * XXX: we need to flush the context until we have some sort of reference + * counting in fragment shaders as they may still be binned + * Flushing alone might not be sufficient we need to wait on it too. + */ + llvmpipe_finish(pipe, __FUNCTION__); + + for (i = 0; i < LP_MAX_SETUP_VARIANTS / 4; i++) { + struct lp_setup_variant_list_item *item = last_elem(&lp->setup_variants_list); + remove_setup_variant(lp, item->base); + } +} + + +/** + * Update fragment/vertex shader linkage state. This is called just + * prior to drawing something when some fragment-related state has + * changed. + */ +void +llvmpipe_update_setup(struct llvmpipe_context *lp) +{ + struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen); + + struct lp_setup_variant_key *key = &lp->setup_variant.key; + struct lp_setup_variant *variant = NULL; + struct lp_setup_variant_list_item *li; + + lp_make_setup_variant_key(lp, key); + + foreach(li, &lp->setup_variants_list) { + if(li->base->key.size == key->size && + memcmp(&li->base->key, key, key->size) == 0) { + variant = li->base; + break; + } + } + + if (variant) { + move_to_head(&lp->setup_variants_list, &variant->list_item_global); + } + else { + if (lp->nr_setup_variants >= LP_MAX_SETUP_VARIANTS) { + cull_setup_variants(lp); + } + + variant = generate_setup_variant(screen, key); + insert_at_head(&lp->setup_variants_list, &variant->list_item_global); + lp->nr_setup_variants++; + } + + lp_setup_set_setup_variant(lp->setup, + variant); +} + +void +lp_delete_setup_variants(struct llvmpipe_context *lp) +{ + struct lp_setup_variant_list_item *li; + li = first_elem(&lp->setup_variants_list); + while(!at_end(&lp->setup_variants_list, li)) { + struct lp_setup_variant_list_item *next = next_elem(li); + remove_setup_variant(lp, li->base); + li = next; + } +} + +void +lp_dump_setup_coef( const struct lp_setup_variant_key *key, + const float (*sa0)[4], + const float (*sdadx)[4], + const float (*sdady)[4]) +{ + int i, slot; + + for (i = 0; i < NUM_CHANNELS; i++) { + float a0 = sa0 [0][i]; + float dadx = sdadx[0][i]; + float dady = sdady[0][i]; + + debug_printf("POS.%c: a0 = %f, dadx = %f, dady = %f\n", + "xyzw"[i], + a0, dadx, dady); + } + + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned usage_mask = key->inputs[slot].usage_mask; + for (i = 0; i < NUM_CHANNELS; i++) { + if (usage_mask & (1 << i)) { + float a0 = sa0 [1 + slot][i]; + float dadx = sdadx[1 + slot][i]; + float dady = sdady[1 + slot][i]; + + debug_printf("IN[%u].%c: a0 = %f, dadx = %f, dady = %f\n", + slot, + "xyzw"[i], + a0, dadx, dady); + } + } + } +} diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h b/src/gallium/drivers/llvmpipe/lp_state_setup.h new file mode 100644 index 00000000000..b0c81baa75f --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h @@ -0,0 +1,80 @@ +#ifndef LP_STATE_SETUP_H +#define LP_STATE_SETUP_H + +#include "lp_bld_interp.h" + + +struct llvmpipe_context; +struct lp_setup_variant; + +struct lp_setup_variant_list_item +{ + struct lp_setup_variant *base; + struct lp_setup_variant_list_item *next, *prev; +}; + + +struct lp_setup_variant_key { + unsigned num_inputs:8; + unsigned flatshade_first:1; + unsigned pixel_center_half:1; + unsigned pad:7; + unsigned size:16; + struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; +}; + + +typedef void (*lp_jit_setup_triangle)( const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4], + boolean front_facing, + float (*a0)[4], + float (*dadx)[4], + float (*dady)[4] ); + + + + +/* At this stage, for a given variant key, we create a + * draw_vertex_info struct telling the draw module how to format the + * vertices, and an llvm-generated function which calculates the + * attribute interpolants (a0, dadx, dady) from three of those + * vertices. + */ +struct lp_setup_variant { + struct lp_setup_variant_key key; + + struct lp_setup_variant_list_item list_item_global; + + /* XXX: this is a pointer to the LLVM IR. Once jit_function is + * generated, we never need to use the IR again - need to find a + * way to release this data without destroying the generated + * assembly. + */ + LLVMValueRef function; + + /* The actual generated setup function: + */ + lp_jit_setup_triangle jit_function; + + unsigned no; +}; + +void lp_setup_tri_fallback( const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4], + boolean front_facing, + float (*a0)[4], + float (*dadx)[4], + float (*dady)[4], + const struct lp_setup_variant_key *key ); + +void lp_delete_setup_variants(struct llvmpipe_context *lp); + +void +lp_dump_setup_coef( const struct lp_setup_variant_key *key, + const float (*sa0)[4], + const float (*sdadx)[4], + const float (*sdady)[4]); + +#endif diff --git a/src/gallium/drivers/llvmpipe/lp_test_round.c b/src/gallium/drivers/llvmpipe/lp_test_round.c index 57b0ee57767..816518e5081 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_round.c +++ b/src/gallium/drivers/llvmpipe/lp_test_round.c @@ -75,10 +75,7 @@ add_test(LLVMModuleRef module, const char *name, lp_func_t lp_func) LLVMValueRef ret; struct lp_build_context bld; - bld.builder = builder; - bld.type.floating = 1; - bld.type.width = 32; - bld.type.length = 4; + lp_build_context_init(&bld, builder, lp_float32_vec4_type()); LLVMSetFunctionCallConv(func, LLVMCCallConv); @@ -100,9 +97,10 @@ printv(char* string, v4sf value) f[0], f[1], f[2], f[3]); } -static void +static boolean compare(v4sf x, v4sf y) { + boolean success = TRUE; float *xp = (float *) &x; float *yp = (float *) &y; if (xp[0] != yp[0] || @@ -110,7 +108,9 @@ compare(v4sf x, v4sf y) xp[2] != yp[2] || xp[3] != yp[3]) { printf(" Incorrect result! ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ \n"); + success = FALSE; } + return success; } @@ -171,9 +171,12 @@ test_round(unsigned verbose, FILE *fp) LLVMDumpModule(module); for (i = 0; i < 3; i++) { + /* NOTE: There are several acceptable rules for x.5 rounding: ceiling, + * nearest even, etc. So we avoid testing such corner cases here. + */ v4sf xvals[3] = { {-10.0, -1, 0, 12.0}, - {-1.5, -0.25, 1.25, 2.5}, + {-1.49, -0.25, 1.25, 2.51}, {-0.99, -0.01, 0.01, 0.99} }; v4sf x = xvals[i]; @@ -191,7 +194,7 @@ test_round(unsigned verbose, FILE *fp) y = round_func(x); printv("C round(x) ", ref); printv("LLVM round(x)", y); - compare(ref, y); + success = success && compare(ref, y); refp[0] = trunc(xp[0]); refp[1] = trunc(xp[1]); @@ -200,7 +203,7 @@ test_round(unsigned verbose, FILE *fp) y = trunc_func(x); printv("C trunc(x) ", ref); printv("LLVM trunc(x)", y); - compare(ref, y); + success = success && compare(ref, y); refp[0] = floor(xp[0]); refp[1] = floor(xp[1]); @@ -209,7 +212,7 @@ test_round(unsigned verbose, FILE *fp) y = floor_func(x); printv("C floor(x) ", ref); printv("LLVM floor(x)", y); - compare(ref, y); + success = success && compare(ref, y); refp[0] = ceil(xp[0]); refp[1] = ceil(xp[1]); @@ -218,7 +221,7 @@ test_round(unsigned verbose, FILE *fp) y = ceil_func(x); printv("C ceil(x) ", ref); printv("LLVM ceil(x) ", y); - compare(ref, y); + success = success && compare(ref, y); } LLVMFreeMachineCodeForFunction(engine, test_round); @@ -247,11 +250,7 @@ test_round(unsigned verbose, FILE *fp) boolean test_all(unsigned verbose, FILE *fp) { - boolean success = TRUE; - - test_round(verbose, fp); - - return success; + return test_round(verbose, fp); } diff --git a/src/gallium/drivers/llvmpipe/lp_test_sincos.c b/src/gallium/drivers/llvmpipe/lp_test_sincos.c index 7ab357f162e..79939b1a393 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_sincos.c +++ b/src/gallium/drivers/llvmpipe/lp_test_sincos.c @@ -72,10 +72,7 @@ add_sincos_test(LLVMModuleRef module, boolean sin) LLVMValueRef ret; struct lp_build_context bld; - bld.builder = builder; - bld.type.floating = 1; - bld.type.width = 32; - bld.type.length = 4; + lp_build_context_init(&bld, builder, lp_float32_vec4_type()); LLVMSetFunctionCallConv(func, LLVMCCallConv); diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py index 2ba39052aba..e49f9c62fe7 100644 --- a/src/gallium/drivers/llvmpipe/lp_tile_soa.py +++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py @@ -289,172 +289,141 @@ def generate_format_write(format, src_channel, src_native_type, src_suffix): print -def generate_ssse3(): +def generate_sse2(): print ''' #if defined(PIPE_ARCH_SSE) #include "util/u_sse.h" -static void -lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst, - const uint8_t *src, unsigned src_stride, - unsigned x0, unsigned y0) +static ALWAYS_INLINE void +swz4( const __m128i * restrict x, + const __m128i * restrict y, + const __m128i * restrict z, + const __m128i * restrict w, + __m128i * restrict a, + __m128i * restrict b, + __m128i * restrict c, + __m128i * restrict d) { + __m128i i, j, k, l; + __m128i m, n, o, p; + __m128i e, f, g, h; + + m = _mm_unpacklo_epi8(*x,*y); + n = _mm_unpackhi_epi8(*x,*y); + o = _mm_unpacklo_epi8(*z,*w); + p = _mm_unpackhi_epi8(*z,*w); + + i = _mm_unpacklo_epi16(m,n); + j = _mm_unpackhi_epi16(m,n); + k = _mm_unpacklo_epi16(o,p); + l = _mm_unpackhi_epi16(o,p); + + e = _mm_unpacklo_epi8(i,j); + f = _mm_unpackhi_epi8(i,j); + g = _mm_unpacklo_epi8(k,l); + h = _mm_unpackhi_epi8(k,l); + + *a = _mm_unpacklo_epi64(e,g); + *b = _mm_unpackhi_epi64(e,g); + *c = _mm_unpacklo_epi64(f,h); + *d = _mm_unpackhi_epi64(f,h); +} + +static ALWAYS_INLINE void +unswz4( const __m128i * restrict a, + const __m128i * restrict b, + const __m128i * restrict c, + const __m128i * restrict d, + __m128i * restrict x, + __m128i * restrict y, + __m128i * restrict z, + __m128i * restrict w) +{ + __m128i i, j, k, l; + __m128i m, n, o, p; + + i = _mm_unpacklo_epi8(*a,*b); + j = _mm_unpackhi_epi8(*a,*b); + k = _mm_unpacklo_epi8(*c,*d); + l = _mm_unpackhi_epi8(*c,*d); + + m = _mm_unpacklo_epi16(i,k); + n = _mm_unpackhi_epi16(i,k); + o = _mm_unpacklo_epi16(j,l); + p = _mm_unpackhi_epi16(j,l); + + *x = _mm_unpacklo_epi64(m,n); + *y = _mm_unpackhi_epi64(m,n); + *z = _mm_unpacklo_epi64(o,p); + *w = _mm_unpackhi_epi64(o,p); +} +static void +lp_tile_b8g8r8a8_unorm_swizzle_4ub_sse2(uint8_t * restrict dst, + const uint8_t * restrict src, unsigned src_stride, + unsigned x0, unsigned y0) +{ + __m128i *dst128 = (__m128i *) dst; unsigned x, y; - __m128i *pdst = (__m128i*) dst; - const uint8_t *ysrc0 = src + y0*src_stride + x0*sizeof(uint32_t); - unsigned int tile_stridex = src_stride*(TILE_VECTOR_HEIGHT - 1) - sizeof(uint32_t)*TILE_VECTOR_WIDTH; - unsigned int tile_stridey = src_stride*TILE_VECTOR_HEIGHT; - - const __m128i shuffle00 = _mm_setr_epi8(0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); - const __m128i shuffle01 = _mm_setr_epi8(0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); - const __m128i shuffle02 = _mm_setr_epi8(0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); - const __m128i shuffle03 = _mm_setr_epi8(0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); - - const __m128i shuffle10 = _mm_setr_epi8(0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); - const __m128i shuffle11 = _mm_setr_epi8(0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); - const __m128i shuffle12 = _mm_setr_epi8(0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); - const __m128i shuffle13 = _mm_setr_epi8(0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); - - const __m128i shuffle20 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff); - const __m128i shuffle21 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff); - const __m128i shuffle22 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff); - const __m128i shuffle23 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff); - - const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e); - const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d); - const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c); - const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f); - - for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) { - __m128i line0 = *(__m128i*)ysrc0; - const uint8_t *ysrc = ysrc0 + src_stride; - ysrc0 += tile_stridey; - - for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) { - __m128i r, g, b, a, line1; - line1 = *(__m128i*)ysrc; - PIPE_READ_WRITE_BARRIER(); - ysrc += src_stride; - r = _mm_shuffle_epi8(line0, shuffle00); - g = _mm_shuffle_epi8(line0, shuffle01); - b = _mm_shuffle_epi8(line0, shuffle02); - a = _mm_shuffle_epi8(line0, shuffle03); - - line0 = *(__m128i*)ysrc; - PIPE_READ_WRITE_BARRIER(); - ysrc += src_stride; - r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle10)); - g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle11)); - b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle12)); - a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle13)); - - line1 = *(__m128i*)ysrc; - PIPE_READ_WRITE_BARRIER(); - ysrc -= tile_stridex; - r = _mm_or_si128(r, _mm_shuffle_epi8(line0, shuffle20)); - g = _mm_or_si128(g, _mm_shuffle_epi8(line0, shuffle21)); - b = _mm_or_si128(b, _mm_shuffle_epi8(line0, shuffle22)); - a = _mm_or_si128(a, _mm_shuffle_epi8(line0, shuffle23)); - - if (x + 1 < TILE_SIZE) { - line0 = *(__m128i*)ysrc; - ysrc += src_stride; - } - - PIPE_READ_WRITE_BARRIER(); - r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle30)); - g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle31)); - b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle32)); - a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle33)); - - *pdst++ = r; - *pdst++ = g; - *pdst++ = b; - *pdst++ = a; + + src += y0 * src_stride; + src += x0 * sizeof(uint32_t); + + for (y = 0; y < TILE_SIZE; y += 4) { + const uint8_t *src_row = src; + + for (x = 0; x < TILE_SIZE; x += 4) { + swz4((const __m128i *) (src_row + 0 * src_stride), + (const __m128i *) (src_row + 1 * src_stride), + (const __m128i *) (src_row + 2 * src_stride), + (const __m128i *) (src_row + 3 * src_stride), + dst128 + 2, /* b */ + dst128 + 1, /* g */ + dst128 + 0, /* r */ + dst128 + 3); /* a */ + + dst128 += 4; + src_row += sizeof(__m128i); } - } + src += 4 * src_stride; + } } static void -lp_tile_b8g8r8a8_unorm_unswizzle_4ub_ssse3(const uint8_t *src, - uint8_t *dst, unsigned dst_stride, +lp_tile_b8g8r8a8_unorm_unswizzle_4ub_sse2(const uint8_t * restrict src, + uint8_t * restrict dst, unsigned dst_stride, unsigned x0, unsigned y0) { unsigned int x, y; - const __m128i *psrc = (__m128i*) src; - const __m128i *end = (__m128i*) (src + (y0 + TILE_SIZE - 1)*dst_stride + (x0 + TILE_SIZE - 1)*sizeof(uint32_t)); - uint8_t *pdst = dst + y0 * dst_stride + x0 * sizeof(uint32_t); - __m128i c0 = *psrc++; - __m128i c1; - - const __m128i shuffle00 = _mm_setr_epi8(0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff); - const __m128i shuffle01 = _mm_setr_epi8(0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff); - const __m128i shuffle02 = _mm_setr_epi8(0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff); - const __m128i shuffle03 = _mm_setr_epi8(0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff); - - const __m128i shuffle10 = _mm_setr_epi8(0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff); - const __m128i shuffle11 = _mm_setr_epi8(0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff); - const __m128i shuffle12 = _mm_setr_epi8(0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff); - const __m128i shuffle13 = _mm_setr_epi8(0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff); - - const __m128i shuffle20 = _mm_setr_epi8(0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff,0xff); - const __m128i shuffle21 = _mm_setr_epi8(0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff,0xff); - const __m128i shuffle22 = _mm_setr_epi8(0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff,0xff); - const __m128i shuffle23 = _mm_setr_epi8(0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff,0xff); - - const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05); - const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07); - const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d); - const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f); - - for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) { - __m128i *tile = (__m128i*) pdst; - pdst += dst_stride * TILE_VECTOR_HEIGHT; - for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) { - uint8_t *linep = (uint8_t*) (tile++); - __m128i line0, line1, line2, line3; - - c1 = *psrc++; /* r */ - PIPE_READ_WRITE_BARRIER(); - line0 = _mm_shuffle_epi8(c0, shuffle00); - line1 = _mm_shuffle_epi8(c0, shuffle01); - line2 = _mm_shuffle_epi8(c0, shuffle02); - line3 = _mm_shuffle_epi8(c0, shuffle03); - - c0 = *psrc++; /* g */ - PIPE_READ_WRITE_BARRIER(); - line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle10)); - line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle11)); - line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle12)); - line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle13)); - - c1 = *psrc++; /* b */ - PIPE_READ_WRITE_BARRIER(); - line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c0, shuffle20)); - line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c0, shuffle21)); - line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c0, shuffle22)); - line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c0, shuffle23)); - - if (psrc != end) - c0 = *psrc++; /* a */ - PIPE_READ_WRITE_BARRIER(); - line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle30)); - line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle31)); - line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle32)); - line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle33)); - - *(__m128i*) (linep) = line0; - *(__m128i*) (((char*)linep) + dst_stride) = line1; - *(__m128i*) (((char*)linep) + 2 * dst_stride) = line2; - *(__m128i*) (((char*)linep) + 3 * dst_stride) = line3; + const __m128i *src128 = (const __m128i *) src; + + dst += y0 * dst_stride; + dst += x0 * sizeof(uint32_t); + + for (y = 0; y < TILE_SIZE; y += 4) { + const uint8_t *dst_row = dst; + + for (x = 0; x < TILE_SIZE; x += 4) { + unswz4( &src128[2], /* b */ + &src128[1], /* g */ + &src128[0], /* r */ + &src128[3], /* a */ + (__m128i *) (dst_row + 0 * dst_stride), + (__m128i *) (dst_row + 1 * dst_stride), + (__m128i *) (dst_row + 2 * dst_stride), + (__m128i *) (dst_row + 3 * dst_stride)); + + src128 += 4; + dst_row += sizeof(__m128i);; } + + dst += 4 * dst_stride; } } -#endif /* PIPE_ARCH_SSSE3 */ +#endif /* PIPE_ARCH_SSE */ ''' @@ -479,7 +448,7 @@ def generate_swizzle(formats, dst_channel, dst_native_type, dst_suffix): func_name = 'lp_tile_%s_swizzle_%s' % (format.short_name(), dst_suffix) if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM': print '#ifdef PIPE_ARCH_SSE' - print ' func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name) + print ' func = util_cpu_caps.has_sse2 ? %s_sse2 : %s;' % (func_name, func_name) print '#else' print ' func = %s;' % (func_name,) print '#endif' @@ -517,7 +486,7 @@ def generate_unswizzle(formats, src_channel, src_native_type, src_suffix): func_name = 'lp_tile_%s_unswizzle_%s' % (format.short_name(), src_suffix) if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM': print '#ifdef PIPE_ARCH_SSE' - print ' func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name) + print ' func = util_cpu_caps.has_sse2 ? %s_sse2 : %s;' % (func_name, func_name) print '#else' print ' func = %s;' % (func_name,) print '#endif' @@ -577,7 +546,7 @@ def main(): print '};' print - generate_ssse3() + generate_sse2() channel = Channel(UNSIGNED, True, 8) native_type = 'uint8_t' |