Diffstat (limited to 'src/gallium/auxiliary')
51 files changed, 3537 insertions, 1593 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile index 05096b12a86..ed96725d782 100644 --- a/src/gallium/auxiliary/Makefile +++ b/src/gallium/auxiliary/Makefile @@ -173,6 +173,7 @@ GALLIVM_SOURCES = \ gallivm/lp_bld_struct.c \ gallivm/lp_bld_swizzle.c \ gallivm/lp_bld_tgsi_aos.c \ + gallivm/lp_bld_tgsi_info.c \ gallivm/lp_bld_tgsi_soa.c \ gallivm/lp_bld_type.c \ draw/draw_llvm.c \ diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript index a18f7c0b2a3..f22c8b96123 100644 --- a/src/gallium/auxiliary/SConscript +++ b/src/gallium/auxiliary/SConscript @@ -225,6 +225,7 @@ if env['llvm']: 'gallivm/lp_bld_struct.c', 'gallivm/lp_bld_swizzle.c', 'gallivm/lp_bld_tgsi_aos.c', + 'gallivm/lp_bld_tgsi_info.c', 'gallivm/lp_bld_tgsi_soa.c', 'gallivm/lp_bld_type.c', 'draw/draw_llvm.c', diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 032fcbbc70a..39d82f32892 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -335,6 +335,7 @@ draw_set_mapped_constant_buffer(struct draw_context *draw, case PIPE_SHADER_VERTEX: draw->pt.user.vs_constants[slot] = buffer; draw->pt.user.vs_constants_size[slot] = size; + draw->pt.user.planes = (float (*) [12][4]) &(draw->plane[0]); draw_vs_set_constants(draw, slot, buffer, size); break; case PIPE_SHADER_GEOMETRY: @@ -721,9 +722,9 @@ draw_set_mapped_texture(struct draw_context *draw, unsigned sampler_idx, uint32_t width, uint32_t height, uint32_t depth, uint32_t last_level, - uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS], - uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS], - const void *data[DRAW_MAX_TEXTURE_LEVELS]) + uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS], + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS], + const void *data[PIPE_MAX_TEXTURE_LEVELS]) { #ifdef HAVE_LLVM if(draw->llvm) diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h index 1f27cbf488a..ff4f753604f 100644 --- a/src/gallium/auxiliary/draw/draw_context.h +++ b/src/gallium/auxiliary/draw/draw_context.h @@ -49,7 +49,6 @@ struct draw_geometry_shader; struct draw_fragment_shader; struct tgsi_sampler; -#define DRAW_MAX_TEXTURE_LEVELS 13 /* 4K x 4K for now */ struct draw_context *draw_create( struct pipe_context *pipe ); @@ -120,9 +119,9 @@ draw_set_mapped_texture(struct draw_context *draw, unsigned sampler_idx, uint32_t width, uint32_t height, uint32_t depth, uint32_t last_level, - uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS], - uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS], - const void *data[DRAW_MAX_TEXTURE_LEVELS]); + uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS], + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS], + const void *data[PIPE_MAX_TEXTURE_LEVELS]); /* diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index 7fb86d7cb27..140e596f994 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -31,6 +31,9 @@ #include "draw_vs.h" #include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_logic.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_swizzle.h" #include "gallivm/lp_bld_struct.h" #include "gallivm/lp_bld_type.h" #include "gallivm/lp_bld_flow.h" @@ -43,7 +46,6 @@ #include "tgsi/tgsi_exec.h" #include "tgsi/tgsi_dump.h" -#include "util/u_cpu_detect.h" #include "util/u_math.h" #include "util/u_pointer.h" #include "util/u_string.h" @@ -72,12 +74,12 @@ init_globals(struct draw_llvm *llvm) 
elem_types[DRAW_JIT_TEXTURE_DEPTH] = LLVMInt32Type(); elem_types[DRAW_JIT_TEXTURE_LAST_LEVEL] = LLVMInt32Type(); elem_types[DRAW_JIT_TEXTURE_ROW_STRIDE] = - LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS); + LLVMArrayType(LLVMInt32Type(), PIPE_MAX_TEXTURE_LEVELS); elem_types[DRAW_JIT_TEXTURE_IMG_STRIDE] = - LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS); + LLVMArrayType(LLVMInt32Type(), PIPE_MAX_TEXTURE_LEVELS); elem_types[DRAW_JIT_TEXTURE_DATA] = LLVMArrayType(LLVMPointerType(LLVMInt8Type(), 0), - DRAW_MAX_TEXTURE_LEVELS); + PIPE_MAX_TEXTURE_LEVELS); elem_types[DRAW_JIT_TEXTURE_MIN_LOD] = LLVMFloatType(); elem_types[DRAW_JIT_TEXTURE_MAX_LOD] = LLVMFloatType(); elem_types[DRAW_JIT_TEXTURE_LOD_BIAS] = LLVMFloatType(); @@ -128,12 +130,14 @@ init_globals(struct draw_llvm *llvm) /* struct draw_jit_context */ { - LLVMTypeRef elem_types[3]; + LLVMTypeRef elem_types[5]; LLVMTypeRef context_type; elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* vs_constants */ - elem_types[1] = LLVMPointerType(LLVMFloatType(), 0); /* vs_constants */ - elem_types[2] = LLVMArrayType(texture_type, + elem_types[1] = LLVMPointerType(LLVMFloatType(), 0); /* gs_constants */ + elem_types[2] = LLVMPointerType(LLVMArrayType(LLVMArrayType(LLVMFloatType(), 4), 12), 0); /* planes */ + elem_types[3] = LLVMPointerType(LLVMFloatType(), 0); /* viewport */ + elem_types[4] = LLVMArrayType(texture_type, PIPE_MAX_VERTEX_SAMPLERS); /* textures */ context_type = LLVMStructType(elem_types, Elements(elem_types), 0); @@ -142,6 +146,8 @@ init_globals(struct draw_llvm *llvm) llvm->target, context_type, 0); LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, gs_constants, llvm->target, context_type, 1); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, planes, + llvm->target, context_type, 2); LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, textures, llvm->target, context_type, DRAW_JIT_CTX_TEXTURES); @@ -267,13 +273,7 @@ draw_llvm_create(struct draw_context *draw) LLVMAddConstantPropagationPass(llvm->pass); } - if(util_cpu_caps.has_sse4_1) { - /* FIXME: There is a bug in this pass, whereby the combination of fptosi - * and sitofp (necessary for trunc/floor/ceil/round implementation) - * somehow becomes invalid code. 
- */ - LLVMAddInstructionCombiningPass(llvm->pass); - } + LLVMAddInstructionCombiningPass(llvm->pass); LLVMAddGVNPass(llvm->pass); } else { /* We need at least this pass to prevent the backends to fail in @@ -421,7 +421,7 @@ generate_fetch(LLVMBuilderRef builder, "instance_divisor"); } - /* limit index to min(inex, vb_max_index) */ + /* limit index to min(index, vb_max_index) */ cond = LLVMBuildICmp(builder, LLVMIntULE, index, vb_max_index, ""); index = LLVMBuildSelect(builder, cond, index, vb_max_index, ""); @@ -550,19 +550,28 @@ static void store_aos(LLVMBuilderRef builder, LLVMValueRef io_ptr, LLVMValueRef index, - LLVMValueRef value) + LLVMValueRef value, + LLVMValueRef clipmask) { LLVMValueRef id_ptr = draw_jit_header_id(builder, io_ptr); LLVMValueRef data_ptr = draw_jit_header_data(builder, io_ptr); LLVMValueRef indices[3]; + LLVMValueRef val, shift; indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); indices[1] = index; indices[2] = LLVMConstInt(LLVMInt32Type(), 0, 0); - /* undefined vertex */ - LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(), - 0xffff, 0), id_ptr); + /* initialize vertex id:16 = 0xffff, pad:3 = 0, edgeflag:1 = 1 */ + val = LLVMConstInt(LLVMInt32Type(), 0xffff1, 0); + shift = LLVMConstInt(LLVMInt32Type(), 12, 0); + val = LLVMBuildShl(builder, val, shift, ""); + /* add clipmask:12 */ + val = LLVMBuildOr(builder, val, clipmask, ""); + + /* store vertex header */ + LLVMBuildStore(builder, val, id_ptr); + #if DEBUG_STORE lp_build_printf(builder, " ---- %p storing attribute %d (io = %p)\n", data_ptr, index, io_ptr); @@ -617,7 +626,8 @@ store_aos_array(LLVMBuilderRef builder, LLVMValueRef io_ptr, LLVMValueRef aos[NUM_CHANNELS], int attrib, - int num_outputs) + int num_outputs, + LLVMValueRef clipmask) { LLVMValueRef attr_index = LLVMConstInt(LLVMInt32Type(), attrib, 0); LLVMValueRef ind0 = LLVMConstInt(LLVMInt32Type(), 0, 0); @@ -625,7 +635,8 @@ store_aos_array(LLVMBuilderRef builder, LLVMValueRef ind2 = LLVMConstInt(LLVMInt32Type(), 2, 0); LLVMValueRef ind3 = LLVMConstInt(LLVMInt32Type(), 3, 0); LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr; - + LLVMValueRef clipmask0, clipmask1, clipmask2, clipmask3; + debug_assert(NUM_CHANNELS == 4); io0_ptr = LLVMBuildGEP(builder, io_ptr, @@ -637,21 +648,31 @@ store_aos_array(LLVMBuilderRef builder, io3_ptr = LLVMBuildGEP(builder, io_ptr, &ind3, 1, ""); + clipmask0 = LLVMBuildExtractElement(builder, clipmask, + ind0, ""); + clipmask1 = LLVMBuildExtractElement(builder, clipmask, + ind1, ""); + clipmask2 = LLVMBuildExtractElement(builder, clipmask, + ind2, ""); + clipmask3 = LLVMBuildExtractElement(builder, clipmask, + ind3, ""); + #if DEBUG_STORE - lp_build_printf(builder, " io = %p, indexes[%d, %d, %d, %d]\n", - io_ptr, ind0, ind1, ind2, ind3); + lp_build_printf(builder, "io = %p, indexes[%d, %d, %d, %d]\n, clipmask0 = %x, clipmask1 = %x, clipmask2 = %x, clipmask3 = %x\n", + io_ptr, ind0, ind1, ind2, ind3, clipmask0, clipmask1, clipmask2, clipmask3); #endif - - store_aos(builder, io0_ptr, attr_index, aos[0]); - store_aos(builder, io1_ptr, attr_index, aos[1]); - store_aos(builder, io2_ptr, attr_index, aos[2]); - store_aos(builder, io3_ptr, attr_index, aos[3]); + /* store for each of the 4 vertices */ + store_aos(builder, io0_ptr, attr_index, aos[0], clipmask0); + store_aos(builder, io1_ptr, attr_index, aos[1], clipmask1); + store_aos(builder, io2_ptr, attr_index, aos[2], clipmask2); + store_aos(builder, io3_ptr, attr_index, aos[3], clipmask3); } static void convert_to_aos(LLVMBuilderRef builder, LLVMValueRef io, LLVMValueRef 
(*outputs)[NUM_CHANNELS], + LLVMValueRef clipmask, int num_outputs, int max_vertices) { @@ -680,13 +701,305 @@ convert_to_aos(LLVMBuilderRef builder, io, aos, attrib, - num_outputs); + num_outputs, + clipmask); } #if DEBUG_STORE lp_build_printf(builder, " # storing end\n"); #endif } +/* + * Stores original vertex positions in clip coordinates + * There is probably a more efficient way to do this, 4 floats at once + * rather than extracting each element one by one. + */ +static void +store_clip(LLVMBuilderRef builder, + LLVMValueRef io_ptr, + LLVMValueRef (*outputs)[NUM_CHANNELS]) +{ + LLVMValueRef out[4]; + LLVMValueRef indices[2]; + LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr; + LLVMValueRef clip_ptr0, clip_ptr1, clip_ptr2, clip_ptr3; + LLVMValueRef clip0_ptr, clip1_ptr, clip2_ptr, clip3_ptr; + LLVMValueRef out0elem, out1elem, out2elem, out3elem; + int i; + + LLVMValueRef ind0 = LLVMConstInt(LLVMInt32Type(), 0, 0); + LLVMValueRef ind1 = LLVMConstInt(LLVMInt32Type(), 1, 0); + LLVMValueRef ind2 = LLVMConstInt(LLVMInt32Type(), 2, 0); + LLVMValueRef ind3 = LLVMConstInt(LLVMInt32Type(), 3, 0); + + indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); + indices[1] = LLVMConstInt(LLVMInt32Type(), 0, 0); + + out[0] = LLVMBuildLoad(builder, outputs[0][0], ""); /*x0 x1 x2 x3*/ + out[1] = LLVMBuildLoad(builder, outputs[0][1], ""); /*y0 y1 y2 y3*/ + out[2] = LLVMBuildLoad(builder, outputs[0][2], ""); /*z0 z1 z2 z3*/ + out[3] = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/ + + io0_ptr = LLVMBuildGEP(builder, io_ptr, &ind0, 1, ""); + io1_ptr = LLVMBuildGEP(builder, io_ptr, &ind1, 1, ""); + io2_ptr = LLVMBuildGEP(builder, io_ptr, &ind2, 1, ""); + io3_ptr = LLVMBuildGEP(builder, io_ptr, &ind3, 1, ""); + + clip_ptr0 = draw_jit_header_clip(builder, io0_ptr); + clip_ptr1 = draw_jit_header_clip(builder, io1_ptr); + clip_ptr2 = draw_jit_header_clip(builder, io2_ptr); + clip_ptr3 = draw_jit_header_clip(builder, io3_ptr); + + for (i = 0; i<4; i++){ + clip0_ptr = LLVMBuildGEP(builder, clip_ptr0, + indices, 2, ""); //x0 + clip1_ptr = LLVMBuildGEP(builder, clip_ptr1, + indices, 2, ""); //x1 + clip2_ptr = LLVMBuildGEP(builder, clip_ptr2, + indices, 2, ""); //x2 + clip3_ptr = LLVMBuildGEP(builder, clip_ptr3, + indices, 2, ""); //x3 + + out0elem = LLVMBuildExtractElement(builder, out[i], + ind0, ""); //x0 + out1elem = LLVMBuildExtractElement(builder, out[i], + ind1, ""); //x1 + out2elem = LLVMBuildExtractElement(builder, out[i], + ind2, ""); //x2 + out3elem = LLVMBuildExtractElement(builder, out[i], + ind3, ""); //x3 + + LLVMBuildStore(builder, out0elem, clip0_ptr); + LLVMBuildStore(builder, out1elem, clip1_ptr); + LLVMBuildStore(builder, out2elem, clip2_ptr); + LLVMBuildStore(builder, out3elem, clip3_ptr); + + indices[1]= LLVMBuildAdd(builder, indices[1], ind1, ""); + } + +} + +/* Equivalent of _mm_set1_ps(a) + */ +static LLVMValueRef vec4f_from_scalar(LLVMBuilderRef bld, + LLVMValueRef a, + const char *name) +{ + LLVMValueRef res = LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4)); + int i; + + for(i = 0; i < 4; ++i) { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); + res = LLVMBuildInsertElement(bld, res, a, index, i == 3 ? 
name : ""); + } + + return res; +} + +/* + * Transforms the outputs for viewport mapping + */ +static void +generate_viewport(struct draw_llvm *llvm, + LLVMBuilderRef builder, + LLVMValueRef (*outputs)[NUM_CHANNELS], + LLVMValueRef context_ptr) +{ + int i; + struct lp_type f32_type = lp_type_float_vec(32); + LLVMValueRef out3 = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/ + LLVMValueRef const1 = lp_build_const_vec(f32_type, 1.0); /*1.0 1.0 1.0 1.0*/ + LLVMValueRef vp_ptr = draw_jit_context_viewport(builder, context_ptr); + + /* for 1/w convention*/ + out3 = LLVMBuildFDiv(builder, const1, out3, ""); + LLVMBuildStore(builder, out3, outputs[0][3]); + + /* Viewport Mapping */ + for (i=0; i<3; i++){ + LLVMValueRef out = LLVMBuildLoad(builder, outputs[0][i], ""); /*x0 x1 x2 x3*/ + LLVMValueRef scale; + LLVMValueRef trans; + LLVMValueRef scale_i; + LLVMValueRef trans_i; + LLVMValueRef index; + + index = LLVMConstInt(LLVMInt32Type(), i, 0); + scale_i = LLVMBuildGEP(builder, vp_ptr, &index, 1, ""); + + index = LLVMConstInt(LLVMInt32Type(), i+4, 0); + trans_i = LLVMBuildGEP(builder, vp_ptr, &index, 1, ""); + + scale = vec4f_from_scalar(builder, LLVMBuildLoad(builder, scale_i, ""), "scale"); + trans = vec4f_from_scalar(builder, LLVMBuildLoad(builder, trans_i, ""), "trans"); + + /* divide by w */ + out = LLVMBuildFMul(builder, out, out3, ""); + /* mult by scale */ + out = LLVMBuildFMul(builder, out, scale, ""); + /* add translation */ + out = LLVMBuildFAdd(builder, out, trans, ""); + + /* store transformed outputs */ + LLVMBuildStore(builder, out, outputs[0][i]); + } + +} + + +/* + * Returns clipmask as 4xi32 bitmask for the 4 vertices + */ +static LLVMValueRef +generate_clipmask(LLVMBuilderRef builder, + LLVMValueRef (*outputs)[NUM_CHANNELS], + boolean clip_xy, + boolean clip_z, + boolean clip_user, + boolean clip_halfz, + unsigned nr, + LLVMValueRef context_ptr) +{ + LLVMValueRef mask; /* stores the <4xi32> clipmasks */ + LLVMValueRef test, temp; + LLVMValueRef zero, shift; + LLVMValueRef pos_x, pos_y, pos_z, pos_w; + LLVMValueRef plane1, planes, plane_ptr, sum; + + unsigned i; + + struct lp_type f32_type = lp_type_float_vec(32); + + mask = lp_build_const_int_vec(lp_type_int_vec(32), 0); + temp = lp_build_const_int_vec(lp_type_int_vec(32), 0); + zero = lp_build_const_vec(f32_type, 0); /* 0.0f 0.0f 0.0f 0.0f */ + shift = lp_build_const_int_vec(lp_type_int_vec(32), 1); /* 1 1 1 1 */ + + /* Assuming position stored at output[0] */ + pos_x = LLVMBuildLoad(builder, outputs[0][0], ""); /*x0 x1 x2 x3*/ + pos_y = LLVMBuildLoad(builder, outputs[0][1], ""); /*y0 y1 y2 y3*/ + pos_z = LLVMBuildLoad(builder, outputs[0][2], ""); /*z0 z1 z2 z3*/ + pos_w = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/ + + /* Cliptest, for hardwired planes */ + if (clip_xy){ + /* plane 1 */ + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_x , pos_w); + temp = shift; + test = LLVMBuildAnd(builder, test, temp, ""); + mask = test; + + /* plane 2 */ + test = LLVMBuildFAdd(builder, pos_x, pos_w, ""); + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + + /* plane 3 */ + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_y, pos_w); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + + /* plane 4 */ + test = 
LLVMBuildFAdd(builder, pos_y, pos_w, ""); + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + } + + if (clip_z){ + temp = lp_build_const_int_vec(lp_type_int_vec(32), 16); + if (clip_halfz){ + /* plane 5 */ + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, pos_z); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + } + else{ + /* plane 5 */ + test = LLVMBuildFAdd(builder, pos_z, pos_w, ""); + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + } + /* plane 6 */ + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_z, pos_w); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + } + + if (clip_user){ + LLVMValueRef planes_ptr = draw_jit_context_planes(builder, context_ptr); + LLVMValueRef indices[3]; + temp = lp_build_const_int_vec(lp_type_int_vec(32), 32); + + /* userclip planes */ + for (i = 6; i < nr; i++) { + indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); + indices[1] = LLVMConstInt(LLVMInt32Type(), i, 0); + + indices[2] = LLVMConstInt(LLVMInt32Type(), 0, 0); + plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, ""); + plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_x"); + planes = vec4f_from_scalar(builder, plane1, "plane4_x"); + sum = LLVMBuildFMul(builder, planes, pos_x, ""); + + indices[2] = LLVMConstInt(LLVMInt32Type(), 1, 0); + plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, ""); + plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_y"); + planes = vec4f_from_scalar(builder, plane1, "plane4_y"); + test = LLVMBuildFMul(builder, planes, pos_y, ""); + sum = LLVMBuildFAdd(builder, sum, test, ""); + + indices[2] = LLVMConstInt(LLVMInt32Type(), 2, 0); + plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, ""); + plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_z"); + planes = vec4f_from_scalar(builder, plane1, "plane4_z"); + test = LLVMBuildFMul(builder, planes, pos_z, ""); + sum = LLVMBuildFAdd(builder, sum, test, ""); + + indices[2] = LLVMConstInt(LLVMInt32Type(), 3, 0); + plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, ""); + plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_w"); + planes = vec4f_from_scalar(builder, plane1, "plane4_w"); + test = LLVMBuildFMul(builder, planes, pos_w, ""); + sum = LLVMBuildFAdd(builder, sum, test, ""); + + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, sum); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + } + } + return mask; +} + +/* + * Returns boolean if any clipping has occurred + * Used zero/non-zero i32 value to represent boolean + */ +static void +clipmask_bool(LLVMBuilderRef builder, + LLVMValueRef clipmask, + LLVMValueRef ret_ptr) +{ + LLVMValueRef ret = LLVMBuildLoad(builder, ret_ptr, ""); + LLVMValueRef temp; + int i; + + for (i=0; i<4; i++){ + temp = LLVMBuildExtractElement(builder, clipmask, + LLVMConstInt(LLVMInt32Type(), i, 0) , ""); + ret = LLVMBuildOr(builder, ret, temp, ""); + } + + LLVMBuildStore(builder, ret, ret_ptr); +} + static void draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) { @@ -706,7 
+1019,12 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS]; void *code; struct lp_build_sampler_soa *sampler = 0; - + LLVMValueRef ret, ret_ptr; + boolean bypass_viewport = variant->key.bypass_viewport; + boolean enable_cliptest = variant->key.clip_xy || + variant->key.clip_z || + variant->key.clip_user; + arg_types[0] = llvm->context_ptr_type; /* context */ arg_types[1] = llvm->vertex_header_ptr_type; /* vertex_header */ arg_types[2] = llvm->buffer_ptr_type; /* vbuffers */ @@ -716,7 +1034,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) arg_types[6] = llvm->vb_ptr_type; /* pipe_vertex_buffer's */ arg_types[7] = LLVMInt32Type(); /* instance_id */ - func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0); + func_type = LLVMFunctionType(LLVMInt32Type(), arg_types, Elements(arg_types), 0); variant->function = LLVMAddFunction(llvm->module, "draw_llvm_shader", func_type); LLVMSetFunctionCallConv(variant->function, LLVMCCallConv); @@ -756,6 +1074,10 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0); + /* function will return non-zero i32 value if any clipped vertices */ + ret_ptr = lp_build_alloca(builder, LLVMInt32Type(), ""); + LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(), 0, 0), ret_ptr); + /* code generated texture sampling */ sampler = draw_llvm_sampler_soa_create( draw_llvm_variant_key_samplers(&variant->key), @@ -770,6 +1092,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS]; LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS] = { { 0 } }; LLVMValueRef io; + LLVMValueRef clipmask; /* holds the clipmask value */ const LLVMValueRef (*ptr_aos)[NUM_CHANNELS]; io_itr = LLVMBuildSub(builder, lp_loop.counter, start, ""); @@ -806,10 +1129,37 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) context_ptr, sampler); - convert_to_aos(builder, io, outputs, + /* store original positions in clip before further manipulation */ + store_clip(builder, io, outputs); + + /* do cliptest */ + if (enable_cliptest){ + /* allocate clipmask, assign it integer type */ + clipmask = generate_clipmask(builder, outputs, + variant->key.clip_xy, + variant->key.clip_z, + variant->key.clip_user, + variant->key.clip_halfz, + variant->key.nr_planes, + context_ptr); + /* return clipping boolean value for function */ + clipmask_bool(builder, clipmask, ret_ptr); + } + else{ + clipmask = lp_build_const_int_vec(lp_type_int_vec(32), 0); + } + + /* do viewport mapping */ + if (!bypass_viewport){ + generate_viewport(llvm, builder, outputs, context_ptr); + } + + /* store clipmask in vertex header and positions in data */ + convert_to_aos(builder, io, outputs, clipmask, draw->vs.vertex_shader->info.num_outputs, max_vertices); } + lp_build_loop_end_cond(builder, end, step, LLVMIntUGE, &lp_loop); sampler->destroy(sampler); @@ -819,8 +1169,9 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0); #endif - LLVMBuildRetVoid(builder); - + ret = LLVMBuildLoad(builder, ret_ptr,""); + LLVMBuildRet(builder, ret); + LLVMDisposeBuilder(builder); /* @@ -870,7 +1221,12 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian LLVMValueRef fetch_max; void 
*code; struct lp_build_sampler_soa *sampler = 0; - + LLVMValueRef ret, ret_ptr; + boolean bypass_viewport = variant->key.bypass_viewport; + boolean enable_cliptest = variant->key.clip_xy || + variant->key.clip_z || + variant->key.clip_user; + arg_types[0] = llvm->context_ptr_type; /* context */ arg_types[1] = llvm->vertex_header_ptr_type; /* vertex_header */ arg_types[2] = llvm->buffer_ptr_type; /* vbuffers */ @@ -880,10 +1236,9 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian arg_types[6] = llvm->vb_ptr_type; /* pipe_vertex_buffer's */ arg_types[7] = LLVMInt32Type(); /* instance_id */ - func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0); + func_type = LLVMFunctionType(LLVMInt32Type(), arg_types, Elements(arg_types), 0); - variant->function_elts = LLVMAddFunction(llvm->module, "draw_llvm_shader_elts", - func_type); + variant->function_elts = LLVMAddFunction(llvm->module, "draw_llvm_shader_elts", func_type); LLVMSetFunctionCallConv(variant->function_elts, LLVMCCallConv); for(i = 0; i < Elements(arg_types); ++i) if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) @@ -929,11 +1284,16 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian LLVMConstInt(LLVMInt32Type(), 1, 0), "fetch_max"); + /* function returns non-zero i32 value if any clipped vertices */ + ret_ptr = lp_build_alloca(builder, LLVMInt32Type(), ""); + LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(), 0, 0), ret_ptr); + lp_build_loop_begin(builder, LLVMConstInt(LLVMInt32Type(), 0, 0), &lp_loop); { LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS]; LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS] = { { 0 } }; LLVMValueRef io; + LLVMValueRef clipmask; /* holds the clipmask value */ const LLVMValueRef (*ptr_aos)[NUM_CHANNELS]; io_itr = lp_loop.counter; @@ -980,10 +1340,40 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian context_ptr, sampler); - convert_to_aos(builder, io, outputs, + /* store original positions in clip before further manipulation */ + store_clip(builder, io, outputs); + + /* do cliptest */ + if (enable_cliptest){ + /* allocate clipmask, assign it integer type */ + clipmask = generate_clipmask(builder, outputs, + variant->key.clip_xy, + variant->key.clip_z, + variant->key.clip_user, + variant->key.clip_halfz, + variant->key.nr_planes, + context_ptr); + /* return clipping boolean value for function */ + clipmask_bool(builder, clipmask, ret_ptr); + } + else{ + clipmask = lp_build_const_int_vec(lp_type_int_vec(32), 0); + } + + /* do viewport mapping */ + if (!bypass_viewport){ + generate_viewport(llvm, builder, outputs, context_ptr); + } + + /* store clipmask in vertex header, + * original positions in clip + * and transformed positions in data + */ + convert_to_aos(builder, io, outputs, clipmask, draw->vs.vertex_shader->info.num_outputs, max_vertices); } + lp_build_loop_end_cond(builder, fetch_count, step, LLVMIntUGE, &lp_loop); sampler->destroy(sampler); @@ -993,8 +1383,9 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0); #endif - LLVMBuildRetVoid(builder); - + ret = LLVMBuildLoad(builder, ret_ptr,""); + LLVMBuildRet(builder, ret); + LLVMDisposeBuilder(builder); /* @@ -1038,6 +1429,16 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store) */ key->nr_vertex_elements = llvm->draw->pt.nr_vertex_elements; + /* will have to rig this up properly later 
*/ + key->clip_xy = llvm->draw->clip_xy; + key->clip_z = llvm->draw->clip_z; + key->clip_user = llvm->draw->clip_user; + key->bypass_viewport = llvm->draw->identity_viewport; + key->clip_halfz = !llvm->draw->rasterizer->gl_rasterization_rules; + key->need_edgeflags = (llvm->draw->vs.edgeflag_output ? TRUE : FALSE); + key->nr_planes = llvm->draw->nr_planes; + key->pad = 0; + /* All variants of this shader will have the same value for * nr_samplers. Not yet trying to compact away holes in the * sampler array. @@ -1066,9 +1467,9 @@ draw_llvm_set_mapped_texture(struct draw_context *draw, unsigned sampler_idx, uint32_t width, uint32_t height, uint32_t depth, uint32_t last_level, - uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS], - uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS], - const void *data[DRAW_MAX_TEXTURE_LEVELS]) + uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS], + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS], + const void *data[PIPE_MAX_TEXTURE_LEVELS]) { unsigned j; struct draw_jit_texture *jit_tex; diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h index d0a68ae412d..c3c30c07c64 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.h +++ b/src/gallium/auxiliary/draw/draw_llvm.h @@ -41,7 +41,6 @@ #include <llvm-c/Target.h> #include <llvm-c/ExecutionEngine.h> -#define DRAW_MAX_TEXTURE_LEVELS 13 /* 4K x 4K for now */ struct draw_llvm; struct llvm_vertex_shader; @@ -52,9 +51,9 @@ struct draw_jit_texture uint32_t height; uint32_t depth; uint32_t last_level; - uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS]; - uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS]; - const void *data[DRAW_MAX_TEXTURE_LEVELS]; + uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS]; + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS]; + const void *data[PIPE_MAX_TEXTURE_LEVELS]; float min_lod; float max_lod; float lod_bias; @@ -97,7 +96,8 @@ struct draw_jit_context { const float *vs_constants; const float *gs_constants; - + float (*planes) [12][4]; + float *viewport; struct draw_jit_texture textures[PIPE_MAX_VERTEX_SAMPLERS]; }; @@ -109,18 +109,22 @@ struct draw_jit_context #define draw_jit_context_gs_constants(_builder, _ptr) \ lp_build_struct_get(_builder, _ptr, 1, "gs_constants") -#define DRAW_JIT_CTX_TEXTURES 2 +#define draw_jit_context_planes(_builder, _ptr) \ + lp_build_struct_get(_builder, _ptr, 2, "planes") -#define draw_jit_context_textures(_builder, _ptr) \ - lp_build_struct_get_ptr(_builder, _ptr, DRAW_JIT_CTX_TEXTURES, "textures") +#define draw_jit_context_viewport(_builder, _ptr) \ + lp_build_struct_get(_builder, _ptr, 3, "viewport") +#define DRAW_JIT_CTX_TEXTURES 4 +#define draw_jit_context_textures(_builder, _ptr) \ + lp_build_struct_get_ptr(_builder, _ptr, DRAW_JIT_CTX_TEXTURES, "textures") #define draw_jit_header_id(_builder, _ptr) \ lp_build_struct_get_ptr(_builder, _ptr, 0, "id") #define draw_jit_header_clip(_builder, _ptr) \ - lp_build_struct_get(_builder, _ptr, 1, "clip") + lp_build_struct_get_ptr(_builder, _ptr, 1, "clip") #define draw_jit_header_data(_builder, _ptr) \ lp_build_struct_get_ptr(_builder, _ptr, 2, "data") @@ -136,7 +140,7 @@ struct draw_jit_context lp_build_struct_get(_builder, _ptr, 2, "buffer_offset") -typedef void +typedef int (*draw_jit_vert_func)(struct draw_jit_context *context, struct vertex_header *io, const char *vbuffers[PIPE_MAX_ATTRIBS], @@ -147,7 +151,7 @@ typedef void unsigned instance_id); -typedef void +typedef int (*draw_jit_vert_func_elts)(struct draw_jit_context *context, struct vertex_header *io, const char *vbuffers[PIPE_MAX_ATTRIBS], @@ -159,8 
+163,16 @@ typedef void struct draw_llvm_variant_key { - unsigned nr_vertex_elements:16; - unsigned nr_samplers:16; + unsigned nr_vertex_elements:8; + unsigned nr_samplers:8; + unsigned clip_xy:1; + unsigned clip_z:1; + unsigned clip_user:1; + unsigned clip_halfz:1; + unsigned bypass_viewport:1; + unsigned need_edgeflags:1; + unsigned nr_planes:4; + unsigned pad:6; /* Variable number of vertex elements: */ @@ -290,8 +302,8 @@ draw_llvm_set_mapped_texture(struct draw_context *draw, unsigned sampler_idx, uint32_t width, uint32_t height, uint32_t depth, uint32_t last_level, - uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS], - uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS], - const void *data[DRAW_MAX_TEXTURE_LEVELS]); + uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS], + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS], + const void *data[PIPE_MAX_TEXTURE_LEVELS]); #endif diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index d417f825a0f..54163d7f9eb 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -169,6 +169,9 @@ struct draw_context unsigned vs_constants_size[PIPE_MAX_CONSTANT_BUFFERS]; const void *gs_constants[PIPE_MAX_CONSTANT_BUFFERS]; unsigned gs_constants_size[PIPE_MAX_CONSTANT_BUFFERS]; + + /* pointer to planes */ + float (*planes)[12][4]; } user; boolean test_fse; /* enable FSE even though its not correct (eg for softpipe) */ diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c index f44bf2507c6..4078b2a07d0 100644 --- a/src/gallium/auxiliary/draw/draw_pt.c +++ b/src/gallium/auxiliary/draw/draw_pt.c @@ -287,6 +287,84 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count) } +/** Helper code for below */ +#define PRIM_RESTART_LOOP(elements) \ + do { \ + for (i = start; i < end; i++) { \ + if (elements[i] == info->restart_index) { \ + if (cur_count > 0) { \ + /* draw elts up to prev pos */ \ + draw_pt_arrays(draw, prim, cur_start, cur_count); \ + } \ + /* begin new prim at next elt */ \ + cur_start = i + 1; \ + cur_count = 0; \ + } \ + else { \ + cur_count++; \ + } \ + } \ + if (cur_count > 0) { \ + draw_pt_arrays(draw, prim, cur_start, cur_count); \ + } \ + } while (0) + + +/** + * For drawing prims with primitive restart enabled. + * Scan for restart indexes and draw the runs of elements/vertices between + * the restarts. + */ +static void +draw_pt_arrays_restart(struct draw_context *draw, + const struct pipe_draw_info *info) +{ + const unsigned prim = info->mode; + const unsigned start = info->start; + const unsigned count = info->count; + const unsigned end = start + count; + unsigned i, cur_start, cur_count; + + assert(info->primitive_restart); + + if (draw->pt.user.elts) { + /* indexed prims (draw_elements) */ + cur_start = start; + cur_count = 0; + + switch (draw->pt.user.eltSize) { + case 1: + { + const ubyte *elt_ub = (const ubyte *) draw->pt.user.elts; + PRIM_RESTART_LOOP(elt_ub); + } + break; + case 2: + { + const ushort *elt_us = (const ushort *) draw->pt.user.elts; + PRIM_RESTART_LOOP(elt_us); + } + break; + case 4: + { + const uint *elt_ui = (const uint *) draw->pt.user.elts; + PRIM_RESTART_LOOP(elt_ui); + } + break; + default: + assert(0 && "bad eltSize in draw_arrays()"); + } + } + else { + /* Non-indexed prims (draw_arrays). + * Primitive restart should have been handled in the state tracker. + */ + draw_pt_arrays(draw, prim, start, count); + } +} + + + /** * Non-instanced drawing. 
* \sa draw_arrays_instanced @@ -395,6 +473,12 @@ draw_vbo(struct draw_context *draw, for (instance = 0; instance < info->instance_count; instance++) { draw->instance_id = instance + info->start_instance; - draw_pt_arrays(draw, info->mode, info->start, info->count); + + if (info->primitive_restart) { + draw_pt_arrays_restart(draw, info); + } + else { + draw_pt_arrays(draw, info->mode, info->start, info->count); + } } } diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c index 77291e304e1..a53a768d029 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c @@ -175,6 +175,11 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle, draw->pt.user.vs_constants[0]; fpme->llvm->jit_context.gs_constants = draw->pt.user.gs_constants[0]; + fpme->llvm->jit_context.planes = + (float (*) [12][4]) draw->pt.user.planes[0]; + fpme->llvm->jit_context.viewport = + (float *)draw->viewport.scale; + } @@ -217,6 +222,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle, struct draw_vertex_info gs_vert_info; struct draw_vertex_info *vert_info; unsigned opt = fpme->opt; + unsigned clipped = 0; llvm_vert_info.count = fetch_info->count; llvm_vert_info.vertex_size = fpme->vertex_size; @@ -230,7 +236,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle, } if (fetch_info->linear) - fpme->current_variant->jit_func( &fpme->llvm->jit_context, + clipped = fpme->current_variant->jit_func( &fpme->llvm->jit_context, llvm_vert_info.verts, (const char **)draw->pt.user.vbuffer, fetch_info->start, @@ -239,7 +245,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle, draw->pt.vertex_buffer, draw->instance_id); else - fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context, + clipped = fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context, llvm_vert_info.verts, (const char **)draw->pt.user.vbuffer, fetch_info->elts, @@ -266,6 +272,9 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle, FREE(vert_info->verts); vert_info = &gs_vert_info; prim_info = &gs_prim_info; + + clipped = draw_pt_post_vs_run( fpme->post_vs, vert_info ); + } /* stream output needs to be done before clipping */ @@ -273,11 +282,11 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle, vert_info, prim_info ); - if (draw_pt_post_vs_run( fpme->post_vs, vert_info )) { + if (clipped) { opt |= PT_PIPELINE; } - /* Do we need to run the pipeline? + /* Do we need to run the pipeline? Now will come here if clipped */ if (opt & PT_PIPELINE) { pipeline( fpme, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index e65c13e64b5..f9a12a41a1b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -983,6 +983,12 @@ enum lp_build_round_sse41_mode }; +/** + * Helper for SSE4.1's ROUNDxx instructions. + * + * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the + * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0. 
+ */ static INLINE LLVMValueRef lp_build_round_sse41(struct lp_build_context *bld, LLVMValueRef a, @@ -1053,10 +1059,58 @@ lp_build_round_sse41(struct lp_build_context *bld, } +static INLINE LLVMValueRef +lp_build_iround_nearest_sse2(struct lp_build_context *bld, + LLVMValueRef a) +{ + const struct lp_type type = bld->type; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMTypeRef ret_type = lp_build_int_vec_type(type); + const char *intrinsic; + LLVMValueRef res; + + assert(type.floating); + /* using the double precision conversions is a bit more complicated */ + assert(type.width == 32); + + assert(lp_check_value(type, a)); + assert(util_cpu_caps.has_sse2); + + /* This is relying on MXCSR rounding mode, which should always be nearest. */ + if (type.length == 1) { + LLVMTypeRef vec_type; + LLVMValueRef undef; + LLVMValueRef arg; + LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); + + vec_type = LLVMVectorType(bld->elem_type, 4); + + intrinsic = "llvm.x86.sse.cvtss2si"; + + undef = LLVMGetUndef(vec_type); + + arg = LLVMBuildInsertElement(bld->builder, undef, a, index0, ""); + + res = lp_build_intrinsic_unary(bld->builder, intrinsic, + ret_type, arg); + } + else { + assert(type.width*type.length == 128); + + intrinsic = "llvm.x86.sse2.cvtps2dq"; + + res = lp_build_intrinsic_unary(bld->builder, intrinsic, + ret_type, a); + } + + return res; +} + + /** - * Return the integer part of a float (vector) value. The returned value is - * a float (vector). - * Ex: trunc(-1.5) = 1.0 + * Return the integer part of a float (vector) value (== round toward zero). + * The returned value is a float (vector). + * Ex: trunc(-1.5) = -1.0 */ LLVMValueRef lp_build_trunc(struct lp_build_context *bld, @@ -1181,9 +1235,9 @@ lp_build_fract(struct lp_build_context *bld, /** - * Return the integer part of a float (vector) value. The returned value is - * an integer (vector). - * Ex: itrunc(-1.5) = 1 + * Return the integer part of a float (vector) value (== round toward zero). + * The returned value is an integer (vector). 
+ * Ex: itrunc(-1.5) = -1 */ LLVMValueRef lp_build_itrunc(struct lp_build_context *bld, @@ -1210,32 +1264,40 @@ lp_build_iround(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && + if (util_cpu_caps.has_sse2 && + ((type.width == 32) && (type.length == 1 || type.length == 4))) { + return lp_build_iround_nearest_sse2(bld, a); + } + else if (util_cpu_caps.has_sse4_1 && (type.length == 1 || type.width*type.length == 128)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); } else { - LLVMTypeRef vec_type = lp_build_vec_type(type); - LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); - LLVMValueRef sign; LLVMValueRef half; - /* get sign bit */ - sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - - /* sign * 0.5 */ half = lp_build_const_vec(type, 0.5); - half = LLVMBuildBitCast(bld->builder, half, int_vec_type, ""); - half = LLVMBuildOr(bld->builder, sign, half, ""); - half = LLVMBuildBitCast(bld->builder, half, vec_type, ""); + + if (type.sign) { + LLVMTypeRef vec_type = bld->vec_type; + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + + /* get sign bit */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + + /* sign * 0.5 */ + half = LLVMBuildBitCast(bld->builder, half, int_vec_type, ""); + half = LLVMBuildOr(bld->builder, sign, half, ""); + half = LLVMBuildBitCast(bld->builder, half, vec_type, ""); + } res = LLVMBuildFAdd(bld->builder, a, half, ""); } @@ -1256,7 +1318,7 @@ lp_build_ifloor(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); @@ -1267,27 +1329,31 @@ lp_build_ifloor(struct lp_build_context *bld, res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); } else { - /* Take the sign bit and add it to 1 constant */ - LLVMTypeRef vec_type = lp_build_vec_type(type); - unsigned mantissa = lp_mantissa(type); - LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); - LLVMValueRef sign; - LLVMValueRef offset; - - /* sign = a < 0 ? ~0 : 0 */ - sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign"); - - /* offset = -0.99999(9)f */ - offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); - offset = LLVMConstBitCast(offset, int_vec_type); - - /* offset = a < 0 ? 
offset : 0.0f */ - offset = LLVMBuildAnd(bld->builder, offset, sign, ""); - offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset"); - - res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res"); + res = a; + + if (type.sign) { + /* Take the sign bit and add it to 1 constant */ + LLVMTypeRef vec_type = bld->vec_type; + unsigned mantissa = lp_mantissa(type); + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + LLVMValueRef offset; + + /* sign = a < 0 ? ~0 : 0 */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign"); + + /* offset = -0.99999(9)f */ + offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); + offset = LLVMConstBitCast(offset, int_vec_type); + + /* offset = a < 0 ? offset : 0.0f */ + offset = LLVMBuildAnd(bld->builder, offset, sign, ""); + offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset"); + + res = LLVMBuildFAdd(bld->builder, res, offset, "ifloor.res"); + } } /* round to nearest (toward zero) */ @@ -1307,7 +1373,7 @@ lp_build_iceil(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); @@ -1318,25 +1384,28 @@ lp_build_iceil(struct lp_build_context *bld, res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); } else { - LLVMTypeRef vec_type = lp_build_vec_type(type); + LLVMTypeRef vec_type = bld->vec_type; unsigned mantissa = lp_mantissa(type); - LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); - LLVMValueRef sign; LLVMValueRef offset; - /* sign = a < 0 ? 0 : ~0 */ - sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign"); - sign = LLVMBuildNot(bld->builder, sign, "iceil.not"); - /* offset = 0.99999(9)f */ offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); - offset = LLVMConstBitCast(offset, int_vec_type); - /* offset = a < 0 ? 0.0 : offset */ - offset = LLVMBuildAnd(bld->builder, offset, sign, ""); - offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset"); + if (type.sign) { + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + + /* sign = a < 0 ? 0 : ~0 */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign"); + sign = LLVMBuildNot(bld->builder, sign, "iceil.not"); + + /* offset = a < 0 ? 0.0 : offset */ + offset = LLVMConstBitCast(offset, int_vec_type); + offset = LLVMBuildAnd(bld->builder, offset, sign, ""); + offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset"); + } res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res"); } @@ -1348,6 +1417,46 @@ lp_build_iceil(struct lp_build_context *bld, } +/** + * Combined ifloor() & fract(). 
+ * + * Preferred to calling the functions separately, as it will ensure that the + * stratergy (floor() vs ifloor()) that results in less redundant work is used. + */ +void +lp_build_ifloor_fract(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart) +{ + const struct lp_type type = bld->type; + LLVMValueRef ipart; + + assert(type.floating); + assert(lp_check_value(type, a)); + + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { + /* + * floor() is easier. + */ + + ipart = lp_build_floor(bld, a); + *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart"); + *out_ipart = LLVMBuildFPToSI(bld->builder, ipart, bld->int_vec_type, "ipart"); + } + else { + /* + * ifloor() is easier. + */ + + *out_ipart = lp_build_ifloor(bld, a); + ipart = LLVMBuildSIToFP(bld->builder, *out_ipart, bld->vec_type, "ipart"); + *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart"); + } +} + + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a) @@ -2157,6 +2266,71 @@ lp_build_exp2(struct lp_build_context *bld, /** + * Extract the exponent of a IEEE-754 floating point value. + * + * Optionally apply an integer bias. + * + * Result is an integer value with + * + * ifloor(log2(x)) + bias + */ +LLVMValueRef +lp_build_extract_exponent(struct lp_build_context *bld, + LLVMValueRef x, + int bias) +{ + const struct lp_type type = bld->type; + unsigned mantissa = lp_mantissa(type); + LLVMValueRef res; + + assert(type.floating); + + assert(lp_check_value(bld->type, x)); + + x = LLVMBuildBitCast(bld->builder, x, bld->int_vec_type, ""); + + res = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), ""); + res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(type, 255), ""); + res = LLVMBuildSub(bld->builder, res, lp_build_const_int_vec(type, 127 - bias), ""); + + return res; +} + + +/** + * Extract the mantissa of the a floating. + * + * Result is a floating point value with + * + * x / floor(log2(x)) + */ +LLVMValueRef +lp_build_extract_mantissa(struct lp_build_context *bld, + LLVMValueRef x) +{ + const struct lp_type type = bld->type; + unsigned mantissa = lp_mantissa(type); + LLVMValueRef mantmask = lp_build_const_int_vec(type, (1ULL << mantissa) - 1); + LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type); + LLVMValueRef res; + + assert(lp_check_value(bld->type, x)); + + assert(type.floating); + + x = LLVMBuildBitCast(bld->builder, x, bld->int_vec_type, ""); + + /* res = x / 2**ipart */ + res = LLVMBuildAnd(bld->builder, x, mantmask, ""); + res = LLVMBuildOr(bld->builder, res, one, ""); + res = LLVMBuildBitCast(bld->builder, res, bld->vec_type, ""); + + return res; +} + + + +/** * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ * These coefficients can be generate with * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html @@ -2275,3 +2449,62 @@ lp_build_log2(struct lp_build_context *bld, lp_build_log2_approx(bld, x, NULL, NULL, &res); return res; } + + +/** + * Faster (and less accurate) log2. + * + * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x)) + * + * Piece-wise linear approximation, with exact results when x is a + * power of two. 
+ * + * See http://www.flipcode.com/archives/Fast_log_Function.shtml + */ +LLVMValueRef +lp_build_fast_log2(struct lp_build_context *bld, + LLVMValueRef x) +{ + LLVMValueRef ipart; + LLVMValueRef fpart; + + assert(lp_check_value(bld->type, x)); + + assert(bld->type.floating); + + /* ipart = floor(log2(x)) - 1 */ + ipart = lp_build_extract_exponent(bld, x, -1); + ipart = LLVMBuildSIToFP(bld->builder, ipart, bld->vec_type, ""); + + /* fpart = x / 2**ipart */ + fpart = lp_build_extract_mantissa(bld, x); + + /* ipart + fpart */ + return LLVMBuildFAdd(bld->builder, ipart, fpart, ""); +} + + +/** + * Fast implementation of iround(log2(x)). + * + * Not an approximation -- it should give accurate results all the time. + */ +LLVMValueRef +lp_build_ilog2(struct lp_build_context *bld, + LLVMValueRef x) +{ + LLVMValueRef sqrt2 = lp_build_const_vec(bld->type, M_SQRT2); + LLVMValueRef ipart; + + assert(bld->type.floating); + + assert(lp_check_value(bld->type, x)); + + /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ + x = LLVMBuildFMul(bld->builder, x, sqrt2, ""); + + /* ipart = floor(log2(x) + 0.5) */ + ipart = lp_build_extract_exponent(bld, x, 0); + + return ipart; +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h index 31efa9921ce..c78b61decf0 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h @@ -171,6 +171,12 @@ LLVMValueRef lp_build_itrunc(struct lp_build_context *bld, LLVMValueRef a); +void +lp_build_ifloor_fract(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart); + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a); @@ -209,9 +215,26 @@ lp_build_exp2(struct lp_build_context *bld, LLVMValueRef a); LLVMValueRef +lp_build_extract_exponent(struct lp_build_context *bld, + LLVMValueRef x, + int bias); + +LLVMValueRef +lp_build_extract_mantissa(struct lp_build_context *bld, + LLVMValueRef x); + +LLVMValueRef lp_build_log2(struct lp_build_context *bld, LLVMValueRef a); +LLVMValueRef +lp_build_fast_log2(struct lp_build_context *bld, + LLVMValueRef a); + +LLVMValueRef +lp_build_ilog2(struct lp_build_context *bld, + LLVMValueRef x); + void lp_build_exp2_approx(struct lp_build_context *bld, LLVMValueRef x, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c index 8b477313d48..6967dd26225 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -63,6 +63,7 @@ #include "util/u_debug.h" #include "util/u_math.h" +#include "util/u_cpu_detect.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -96,58 +97,104 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder, LLVMTypeRef int_vec_type = lp_build_int_vec_type(src_type); LLVMValueRef res; unsigned mantissa; - unsigned n; - unsigned long long ubound; - unsigned long long mask; - double scale; - double bias; assert(src_type.floating); + assert(dst_width <= src_type.width); + src_type.sign = FALSE; mantissa = lp_mantissa(src_type); - /* We cannot carry more bits than the mantissa */ - n = MIN2(mantissa, dst_width); + if (dst_width <= mantissa) { + /* + * Apply magic coefficients that will make the desired result to appear + * in the lowest significant bits of the mantissa, with correct rounding. + * + * This only works if the destination width fits in the mantissa. 
+ */ - /* This magic coefficients will make the desired result to appear in the - * lowest significant bits of the mantissa. - */ - ubound = ((unsigned long long)1 << n); - mask = ubound - 1; - scale = (double)mask/ubound; - bias = (double)((unsigned long long)1 << (mantissa - n)); + unsigned long long ubound; + unsigned long long mask; + double scale; + double bias; - res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), ""); - res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), ""); - res = LLVMBuildBitCast(builder, res, int_vec_type, ""); + ubound = (1ULL << dst_width); + mask = ubound - 1; + scale = (double)mask/ubound; + bias = (double)(1ULL << (mantissa - dst_width)); - if(dst_width > n) { - int shift = dst_width - n; - res = LLVMBuildShl(builder, res, lp_build_const_int_vec(src_type, shift), ""); + res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), ""); + res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), ""); + res = LLVMBuildBitCast(builder, res, int_vec_type, ""); + res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(src_type, mask), ""); + } + else if (dst_width == (mantissa + 1)) { + /* + * The destination width matches exactly what can be represented in + * floating point (i.e., mantissa + 1 bits). So do a straight + * multiplication followed by casting. No further rounding is necessary. + */ + + double scale; - /* TODO: Fill in the empty lower bits for additional precision? */ - /* YES: this fixes progs/trivial/tri-z-eq.c. - * Otherwise vertex Z=1.0 values get converted to something like - * 0xfffffb00 and the test for equality with 0xffffffff fails. + scale = (double)((1ULL << dst_width) - 1); + + res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), ""); + res = LLVMBuildFPToSI(builder, res, int_vec_type, ""); + } + else { + /* + * The destination exceeds what can be represented in the floating point. + * So multiply by the largest power two we get away with, and when + * subtract the most significant bit to rescale to normalized values. + * + * The largest power of two factor we can get away is + * (1 << (src_type.width - 1)), because we need to use signed . In theory it + * should be (1 << (src_type.width - 2)), but IEEE 754 rules states + * INT_MIN should be returned in FPToSI, which is the correct result for + * values near 1.0! + * + * This means we get (src_type.width - 1) correct bits for values near 0.0, + * and (mantissa + 1) correct bits for values near 1.0. Equally or more + * important, we also get exact results for 0.0 and 1.0. */ -#if 0 - { - LLVMValueRef msb; - msb = LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, dst_width - 1), ""); - msb = LLVMBuildShl(builder, msb, lp_build_const_int_vec(src_type, shift), ""); - msb = LLVMBuildSub(builder, msb, lp_build_const_int_vec(src_type, 1), ""); - res = LLVMBuildOr(builder, res, msb, ""); - } -#elif 0 - while(shift > 0) { - res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, n), ""), ""); - shift -= n; - n *= 2; + + unsigned n = MIN2(src_type.width - 1, dst_width); + + double scale = (double)(1ULL << n); + unsigned lshift = dst_width - n; + unsigned rshift = n; + LLVMValueRef lshifted; + LLVMValueRef rshifted; + + res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), ""); + res = LLVMBuildFPToSI(builder, res, int_vec_type, ""); + + /* + * Align the most significant bit to its final place. 
+ * + * This will cause 1.0 to overflow to 0, but the later adjustment will + * get it right. + */ + if (lshift) { + lshifted = LLVMBuildShl(builder, res, + lp_build_const_int_vec(src_type, lshift), ""); + } else { + lshifted = res; } -#endif + + /* + * Align the most significant bit to the right. + */ + rshifted = LLVMBuildAShr(builder, res, + lp_build_const_int_vec(src_type, rshift), ""); + + /* + * Subtract the MSB to the LSB, therefore re-scaling from + * (1 << dst_width) to ((1 << dst_width) - 1). + */ + + res = LLVMBuildSub(builder, lshifted, rshifted, ""); } - else - res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(src_type, mask), ""); return res; } @@ -177,6 +224,16 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder, assert(dst_type.floating); + /* Special-case int8->float, though most cases could be handled + * this way: + */ + if (src_width == 8) { + scale = 1.0/255.0; + res = LLVMBuildSIToFP(builder, src, vec_type, ""); + res = LLVMBuildFMul(builder, res, lp_build_const_vec(dst_type, scale), ""); + return res; + } + mantissa = lp_mantissa(dst_type); n = MIN2(mantissa, src_width); @@ -241,6 +298,87 @@ lp_build_conv(LLVMBuilderRef builder, } num_tmps = num_srcs; + + /* Special case 4x4f --> 1x16ub + */ + if (src_type.floating == 1 && + src_type.fixed == 0 && + src_type.sign == 1 && + src_type.norm == 0 && + src_type.width == 32 && + src_type.length == 4 && + + dst_type.floating == 0 && + dst_type.fixed == 0 && + dst_type.sign == 0 && + dst_type.norm == 1 && + dst_type.width == 8 && + dst_type.length == 16 && + + util_cpu_caps.has_sse2) + { + int i; + + for (i = 0; i < num_dsts; i++, src += 4) { + struct lp_type int16_type = dst_type; + struct lp_type int32_type = dst_type; + LLVMValueRef lo, hi; + LLVMValueRef src_int0; + LLVMValueRef src_int1; + LLVMValueRef src_int2; + LLVMValueRef src_int3; + LLVMTypeRef int16_vec_type; + LLVMTypeRef int32_vec_type; + LLVMTypeRef src_vec_type; + LLVMTypeRef dst_vec_type; + LLVMValueRef const_255f; + LLVMValueRef a, b, c, d; + + int16_type.width *= 2; + int16_type.length /= 2; + int16_type.sign = 1; + + int32_type.width *= 4; + int32_type.length /= 4; + int32_type.sign = 1; + + src_vec_type = lp_build_vec_type(src_type); + dst_vec_type = lp_build_vec_type(dst_type); + int16_vec_type = lp_build_vec_type(int16_type); + int32_vec_type = lp_build_vec_type(int32_type); + + const_255f = lp_build_const_vec(src_type, 255.0f); + + a = LLVMBuildFMul(builder, src[0], const_255f, ""); + b = LLVMBuildFMul(builder, src[1], const_255f, ""); + c = LLVMBuildFMul(builder, src[2], const_255f, ""); + d = LLVMBuildFMul(builder, src[3], const_255f, ""); + + { + struct lp_build_context bld; + + bld.builder = builder; + bld.type = src_type; + bld.vec_type = src_vec_type; + bld.int_elem_type = lp_build_elem_type(int32_type); + bld.int_vec_type = int32_vec_type; + bld.undef = lp_build_undef(src_type); + bld.zero = lp_build_zero(src_type); + bld.one = lp_build_one(src_type); + + src_int0 = lp_build_iround(&bld, a); + src_int1 = lp_build_iround(&bld, b); + src_int2 = lp_build_iround(&bld, c); + src_int3 = lp_build_iround(&bld, d); + } + /* relying on clamping behavior of sse2 intrinsics here */ + lo = lp_build_pack2(builder, int32_type, int16_type, src_int0, src_int1); + hi = lp_build_pack2(builder, int32_type, int16_type, src_int2, src_int3); + dst[i] = lp_build_pack2(builder, int16_type, dst_type, lo, hi); + } + return; + } + /* * Clamp if necessary */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.c b/src/gallium/auxiliary/gallivm/lp_bld_debug.c 
index d3a5afff8c2..93e56553d7b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_debug.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.c @@ -57,6 +57,8 @@ lp_disassemble(const void* func) #ifdef HAVE_UDIS86 ud_t ud_obj; uint64_t max_jmp_pc; + uint inst_no; + boolean emit_addrs = TRUE, emit_line_nos = FALSE; ud_init(&ud_obj); @@ -76,13 +78,18 @@ lp_disassemble(const void* func) while (ud_disassemble(&ud_obj)) { + if (emit_addrs) { #ifdef PIPE_ARCH_X86 - debug_printf("0x%08lx:\t", (unsigned long)ud_insn_off(&ud_obj)); + debug_printf("0x%08lx:\t", (unsigned long)ud_insn_off(&ud_obj)); #endif #ifdef PIPE_ARCH_X86_64 - debug_printf("0x%016llx:\t", (unsigned long long)ud_insn_off(&ud_obj)); + debug_printf("0x%016llx:\t", (unsigned long long)ud_insn_off(&ud_obj)); #endif - + } + else if (emit_line_nos) { + debug_printf("%6d:\t", inst_no); + inst_no++; + } #if 0 debug_printf("%-16s ", ud_insn_hex(&ud_obj)); #endif @@ -115,8 +122,10 @@ lp_disassemble(const void* func) } } - if ((ud_insn_off(&ud_obj) >= max_jmp_pc && ud_obj.mnemonic == UD_Iret) || - ud_obj.mnemonic == UD_Iinvalid) + if (ud_obj.mnemonic == UD_Iinvalid || + (ud_insn_off(&ud_obj) >= max_jmp_pc && + (ud_obj.mnemonic == UD_Iret || + ud_obj.mnemonic == UD_Ijmp))) break; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.h b/src/gallium/auxiliary/gallivm/lp_bld_debug.h index 369c1bbf09a..eb11dcd4ef4 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_debug.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.h @@ -36,11 +36,12 @@ #include "util/u_string.h" -#define GALLIVM_DEBUG_TGSI 0x1 -#define GALLIVM_DEBUG_IR 0x2 -#define GALLIVM_DEBUG_ASM 0x4 -#define GALLIVM_DEBUG_NO_OPT 0x8 -#define GALLIVM_DEBUG_PERF 0x10 +#define GALLIVM_DEBUG_TGSI (1 << 0) +#define GALLIVM_DEBUG_IR (1 << 1) +#define GALLIVM_DEBUG_ASM (1 << 2) +#define GALLIVM_DEBUG_NO_OPT (1 << 3) +#define GALLIVM_DEBUG_PERF (1 << 4) +#define GALLIVM_DEBUG_NO_BRILINEAR (1 << 5) #ifdef DEBUG diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c index 5bc9c741a88..a2cee199a01 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c @@ -38,273 +38,15 @@ #include "lp_bld_flow.h" -#define LP_BUILD_FLOW_MAX_VARIABLES 64 -#define LP_BUILD_FLOW_MAX_DEPTH 32 - -/** - * Enumeration of all possible flow constructs. - */ -enum lp_build_flow_construct_kind { - LP_BUILD_FLOW_SCOPE, - LP_BUILD_FLOW_SKIP, - LP_BUILD_FLOW_IF -}; - - -/** - * Variable declaration scope. - */ -struct lp_build_flow_scope -{ - /** Number of variables declared in this scope */ - unsigned num_variables; -}; - - -/** - * Early exit. Useful to skip to the end of a function or block when - * the execution mask becomes zero or when there is an error condition. - */ -struct lp_build_flow_skip -{ - /** Block to skip to */ - LLVMBasicBlockRef block; - - /** Number of variables declared at the beginning */ - unsigned num_variables; - - LLVMValueRef *phi; /**< array [num_variables] */ -}; - - -/** - * if/else/endif. - */ -struct lp_build_flow_if -{ - unsigned num_variables; - - LLVMValueRef *phi; /**< array [num_variables] */ - - LLVMValueRef condition; - LLVMBasicBlockRef entry_block, true_block, false_block, merge_block; -}; - - -/** - * Union of all possible flow constructs' data - */ -union lp_build_flow_construct_data -{ - struct lp_build_flow_scope scope; - struct lp_build_flow_skip skip; - struct lp_build_flow_if ifthen; -}; - - -/** - * Element of the flow construct stack. 
- */ -struct lp_build_flow_construct -{ - enum lp_build_flow_construct_kind kind; - union lp_build_flow_construct_data data; -}; - - /** - * All necessary data to generate LLVM control flow constructs. + * Insert a new block, right where builder is pointing to. * - * Besides keeping track of the control flow construct themselves we also - * need to keep track of variables in order to generate SSA Phi values. - */ -struct lp_build_flow_context -{ - LLVMBuilderRef builder; - - /** - * Control flow stack. - */ - struct lp_build_flow_construct constructs[LP_BUILD_FLOW_MAX_DEPTH]; - unsigned num_constructs; - - /** - * Variable stack - */ - LLVMValueRef *variables[LP_BUILD_FLOW_MAX_VARIABLES]; - unsigned num_variables; -}; - - -struct lp_build_flow_context * -lp_build_flow_create(LLVMBuilderRef builder) -{ - struct lp_build_flow_context *flow; - - flow = CALLOC_STRUCT(lp_build_flow_context); - if(!flow) - return NULL; - - flow->builder = builder; - - return flow; -} - - -void -lp_build_flow_destroy(struct lp_build_flow_context *flow) -{ - assert(flow->num_constructs == 0); - assert(flow->num_variables == 0); - FREE(flow); -} - - -/** - * Begin/push a new flow control construct, such as a loop, skip block - * or variable scope. - */ -static union lp_build_flow_construct_data * -lp_build_flow_push(struct lp_build_flow_context *flow, - enum lp_build_flow_construct_kind kind) -{ - assert(flow->num_constructs < LP_BUILD_FLOW_MAX_DEPTH); - if(flow->num_constructs >= LP_BUILD_FLOW_MAX_DEPTH) - return NULL; - - flow->constructs[flow->num_constructs].kind = kind; - return &flow->constructs[flow->num_constructs++].data; -} - - -/** - * Return the current/top flow control construct on the stack. - * \param kind the expected type of the top-most construct - */ -static union lp_build_flow_construct_data * -lp_build_flow_peek(struct lp_build_flow_context *flow, - enum lp_build_flow_construct_kind kind) -{ - assert(flow->num_constructs); - if(!flow->num_constructs) - return NULL; - - assert(flow->constructs[flow->num_constructs - 1].kind == kind); - if(flow->constructs[flow->num_constructs - 1].kind != kind) - return NULL; - - return &flow->constructs[flow->num_constructs - 1].data; -} - - -/** - * End/pop the current/top flow control construct on the stack. - * \param kind the expected type of the top-most construct - */ -static union lp_build_flow_construct_data * -lp_build_flow_pop(struct lp_build_flow_context *flow, - enum lp_build_flow_construct_kind kind) -{ - assert(flow->num_constructs); - if(!flow->num_constructs) - return NULL; - - assert(flow->constructs[flow->num_constructs - 1].kind == kind); - if(flow->constructs[flow->num_constructs - 1].kind != kind) - return NULL; - - return &flow->constructs[--flow->num_constructs].data; -} - - -/** - * Begin a variable scope. + * This is useful important not only for aesthetic reasons, but also for + * performance reasons, as frequently run blocks should be laid out next to + * each other and fall-throughs maximized. * + * See also llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp. * - */ -void -lp_build_flow_scope_begin(struct lp_build_flow_context *flow) -{ - struct lp_build_flow_scope *scope; - - scope = &lp_build_flow_push(flow, LP_BUILD_FLOW_SCOPE)->scope; - if(!scope) - return; - - scope->num_variables = 0; -} - - -/** - * Declare a variable. - * - * A variable is a named entity which can have different LLVMValueRef's at - * different points of the program. 
This is relevant for control flow because - * when there are multiple branches to a same location we need to replace - * the variable's value with a Phi function as explained in - * http://en.wikipedia.org/wiki/Static_single_assignment_form . - * - * We keep track of variables by keeping around a pointer to where they're - * current. - * - * There are a few cautions to observe: - * - * - Variable's value must not be NULL. If there is no initial value then - * LLVMGetUndef() should be used. - * - * - Variable's value must be kept up-to-date. If the variable is going to be - * modified by a function then a pointer should be passed so that its value - * is accurate. Failure to do this will cause some of the variables' - * transient values to be lost, leading to wrong results. - * - * - A program should be written from top to bottom, by always appending - * instructions to the bottom with a single LLVMBuilderRef. Inserting and/or - * modifying existing statements will most likely lead to wrong results. - * - */ -void -lp_build_flow_scope_declare(struct lp_build_flow_context *flow, - LLVMValueRef *variable) -{ - struct lp_build_flow_scope *scope; - - scope = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SCOPE)->scope; - if(!scope) - return; - - assert(*variable); - if(!*variable) - return; - - assert(flow->num_variables < LP_BUILD_FLOW_MAX_VARIABLES); - if(flow->num_variables >= LP_BUILD_FLOW_MAX_VARIABLES) - return; - - flow->variables[flow->num_variables++] = variable; - ++scope->num_variables; -} - - -void -lp_build_flow_scope_end(struct lp_build_flow_context *flow) -{ - struct lp_build_flow_scope *scope; - - scope = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SCOPE)->scope; - if(!scope) - return; - - assert(flow->num_variables >= scope->num_variables); - if(flow->num_variables < scope->num_variables) { - flow->num_variables = 0; - return; - } - - flow->num_variables -= scope->num_variables; -} - - -/** * Note: this function has no dependencies on the flow code and could * be used elsewhere. */ @@ -334,52 +76,18 @@ lp_build_insert_new_block(LLVMBuilderRef builder, const char *name) } -static LLVMBasicBlockRef -lp_build_flow_insert_block(struct lp_build_flow_context *flow) -{ - return lp_build_insert_new_block(flow->builder, ""); -} - - /** * Begin a "skip" block. Inside this block we can test a condition and * skip to the end of the block if the condition is false. 
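[Editor's note: a minimal usage sketch of the reworked skip helpers, based only on the signatures introduced by this patch; the surrounding variables (builder, cond) and the call site are hypothetical and not part of the commit.]

   struct lp_build_skip_context skip;

   /* Remember the builder and create the basic block we may jump to. */
   lp_build_flow_skip_begin(&skip, builder);

   /* If cond is true, branch straight to the end of the skip block. */
   lp_build_flow_skip_cond_break(&skip, cond);

   /* ... code that only runs when cond was false ... */

   /* Branch into the skip block and continue emitting code there. */
   lp_build_flow_skip_end(&skip);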
*/ void -lp_build_flow_skip_begin(struct lp_build_flow_context *flow) +lp_build_flow_skip_begin(struct lp_build_skip_context *skip, + LLVMBuilderRef builder) { - struct lp_build_flow_skip *skip; - LLVMBuilderRef builder; - unsigned i; - - skip = &lp_build_flow_push(flow, LP_BUILD_FLOW_SKIP)->skip; - if(!skip) - return; + skip->builder = builder; /* create new basic block */ - skip->block = lp_build_flow_insert_block(flow); - - skip->num_variables = flow->num_variables; - if(!skip->num_variables) { - skip->phi = NULL; - return; - } - - /* Allocate a Phi node for each variable in this skip scope */ - skip->phi = MALLOC(skip->num_variables * sizeof *skip->phi); - if(!skip->phi) { - skip->num_variables = 0; - return; - } - - builder = LLVMCreateBuilder(); - LLVMPositionBuilderAtEnd(builder, skip->block); - - /* create a Phi node for each variable */ - for(i = 0; i < skip->num_variables; ++i) - skip->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), ""); - - LLVMDisposeBuilder(builder); + skip->block = lp_build_insert_new_block(skip->builder, "skip"); } @@ -388,83 +96,50 @@ lp_build_flow_skip_begin(struct lp_build_flow_context *flow) * skip block if the condition is true. */ void -lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow, +lp_build_flow_skip_cond_break(struct lp_build_skip_context *skip, LLVMValueRef cond) { - struct lp_build_flow_skip *skip; - LLVMBasicBlockRef current_block; LLVMBasicBlockRef new_block; - unsigned i; - - skip = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SKIP)->skip; - if(!skip) - return; - current_block = LLVMGetInsertBlock(flow->builder); - - new_block = lp_build_flow_insert_block(flow); - - /* for each variable, update the Phi node with a (variable, block) pair */ - for(i = 0; i < skip->num_variables; ++i) { - assert(*flow->variables[i]); - assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i])); - LLVMAddIncoming(skip->phi[i], flow->variables[i], ¤t_block, 1); - } + new_block = lp_build_insert_new_block(skip->builder, ""); /* if cond is true, goto skip->block, else goto new_block */ - LLVMBuildCondBr(flow->builder, cond, skip->block, new_block); + LLVMBuildCondBr(skip->builder, cond, skip->block, new_block); - LLVMPositionBuilderAtEnd(flow->builder, new_block); + LLVMPositionBuilderAtEnd(skip->builder, new_block); } void -lp_build_flow_skip_end(struct lp_build_flow_context *flow) +lp_build_flow_skip_end(struct lp_build_skip_context *skip) { - struct lp_build_flow_skip *skip; - LLVMBasicBlockRef current_block; - unsigned i; - - skip = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SKIP)->skip; - if(!skip) - return; - - current_block = LLVMGetInsertBlock(flow->builder); - - /* add (variable, block) tuples to the phi nodes */ - for(i = 0; i < skip->num_variables; ++i) { - assert(*flow->variables[i]); - assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i])); - LLVMAddIncoming(skip->phi[i], flow->variables[i], ¤t_block, 1); - *flow->variables[i] = skip->phi[i]; - } - /* goto block */ - LLVMBuildBr(flow->builder, skip->block); - LLVMPositionBuilderAtEnd(flow->builder, skip->block); - - FREE(skip->phi); + LLVMBuildBr(skip->builder, skip->block); + LLVMPositionBuilderAtEnd(skip->builder, skip->block); } /** * Check if the mask predicate is zero. If so, jump to the end of the block. 
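[Editor's note: a hypothetical caller of the alloca-based mask helpers as reworked in this patch, to show how the pieces fit together now that lp_build_mask_check() is called explicitly; variable names are invented and builder/type/initial_mask are assumed to be in scope.]

   struct lp_build_mask_context mask;
   LLVMValueRef live;     /* e.g. the result of a comparison */
   LLVMValueRef result;

   /* Store the initial execution mask in an alloca'd variable. */
   lp_build_mask_begin(&mask, builder, type, initial_mask);

   /* Early-out: jump past the guarded code if the mask is all zero. */
   lp_build_mask_check(&mask);

   /* AND further kill/discard bits into the mask as they are computed. */
   lp_build_mask_update(&mask, live);

   /* ... guarded code ... */

   /* Close the skip block and load the final mask value. */
   result = lp_build_mask_end(&mask);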
*/ -static void +void lp_build_mask_check(struct lp_build_mask_context *mask) { - LLVMBuilderRef builder = mask->flow->builder; + LLVMBuilderRef builder = mask->skip.builder; + LLVMValueRef value; LLVMValueRef cond; + value = lp_build_mask_value(mask); + /* cond = (mask == 0) */ cond = LLVMBuildICmp(builder, LLVMIntEQ, - LLVMBuildBitCast(builder, mask->value, mask->reg_type, ""), + LLVMBuildBitCast(builder, value, mask->reg_type, ""), LLVMConstNull(mask->reg_type), ""); /* if cond, goto end of block */ - lp_build_flow_skip_cond_break(mask->flow, cond); + lp_build_flow_skip_cond_break(&mask->skip, cond); } @@ -477,21 +152,27 @@ lp_build_mask_check(struct lp_build_mask_context *mask) */ void lp_build_mask_begin(struct lp_build_mask_context *mask, - struct lp_build_flow_context *flow, + LLVMBuilderRef builder, struct lp_type type, LLVMValueRef value) { memset(mask, 0, sizeof *mask); - mask->flow = flow; mask->reg_type = LLVMIntType(type.width * type.length); - mask->value = value; + mask->var = lp_build_alloca(builder, + lp_build_int_vec_type(type), + "execution_mask"); - lp_build_flow_scope_begin(flow); - lp_build_flow_scope_declare(flow, &mask->value); - lp_build_flow_skip_begin(flow); + LLVMBuildStore(builder, value, mask->var); - lp_build_mask_check(mask); + lp_build_flow_skip_begin(&mask->skip, builder); +} + + +LLVMValueRef +lp_build_mask_value(struct lp_build_mask_context *mask) +{ + return LLVMBuildLoad(mask->skip.builder, mask->var, ""); } @@ -504,9 +185,10 @@ void lp_build_mask_update(struct lp_build_mask_context *mask, LLVMValueRef value) { - mask->value = LLVMBuildAnd( mask->flow->builder, mask->value, value, ""); - - lp_build_mask_check(mask); + value = LLVMBuildAnd(mask->skip.builder, + lp_build_mask_value(mask), + value, ""); + LLVMBuildStore(mask->skip.builder, value, mask->var); } @@ -516,9 +198,8 @@ lp_build_mask_update(struct lp_build_mask_context *mask, LLVMValueRef lp_build_mask_end(struct lp_build_mask_context *mask) { - lp_build_flow_skip_end(mask->flow); - lp_build_flow_scope_end(mask->flow); - return mask->value; + lp_build_flow_skip_end(&mask->skip); + return lp_build_mask_value(mask); } @@ -528,59 +209,27 @@ lp_build_loop_begin(LLVMBuilderRef builder, LLVMValueRef start, struct lp_build_loop_state *state) { - LLVMBasicBlockRef block = LLVMGetInsertBlock(builder); - LLVMValueRef function = LLVMGetBasicBlockParent(block); + state->block = lp_build_insert_new_block(builder, "loop_begin"); - state->block = LLVMAppendBasicBlock(function, "loop"); + state->counter_var = lp_build_alloca(builder, LLVMTypeOf(start), "loop_counter"); + + LLVMBuildStore(builder, start, state->counter_var); LLVMBuildBr(builder, state->block); LLVMPositionBuilderAtEnd(builder, state->block); - state->counter = LLVMBuildPhi(builder, LLVMTypeOf(start), ""); - - LLVMAddIncoming(state->counter, &start, &block, 1); - + state->counter = LLVMBuildLoad(builder, state->counter_var, ""); } void -lp_build_loop_end(LLVMBuilderRef builder, - LLVMValueRef end, - LLVMValueRef step, - struct lp_build_loop_state *state) -{ - LLVMBasicBlockRef block = LLVMGetInsertBlock(builder); - LLVMValueRef function = LLVMGetBasicBlockParent(block); - LLVMValueRef next; - LLVMValueRef cond; - LLVMBasicBlockRef after_block; - - if (!step) - step = LLVMConstInt(LLVMTypeOf(end), 1, 0); - - next = LLVMBuildAdd(builder, state->counter, step, ""); - - cond = LLVMBuildICmp(builder, LLVMIntNE, next, end, ""); - - after_block = LLVMAppendBasicBlock(function, ""); - - LLVMBuildCondBr(builder, cond, after_block, state->block); - - 
LLVMAddIncoming(state->counter, &next, &block, 1); - - LLVMPositionBuilderAtEnd(builder, after_block); -} - -void lp_build_loop_end_cond(LLVMBuilderRef builder, LLVMValueRef end, LLVMValueRef step, - int llvm_cond, + LLVMIntPredicate llvm_cond, struct lp_build_loop_state *state) { - LLVMBasicBlockRef block = LLVMGetInsertBlock(builder); - LLVMValueRef function = LLVMGetBasicBlockParent(block); LLVMValueRef next; LLVMValueRef cond; LLVMBasicBlockRef after_block; @@ -590,15 +239,27 @@ lp_build_loop_end_cond(LLVMBuilderRef builder, next = LLVMBuildAdd(builder, state->counter, step, ""); + LLVMBuildStore(builder, next, state->counter_var); + cond = LLVMBuildICmp(builder, llvm_cond, next, end, ""); - after_block = LLVMAppendBasicBlock(function, ""); + after_block = lp_build_insert_new_block(builder, "loop_end"); LLVMBuildCondBr(builder, cond, after_block, state->block); - LLVMAddIncoming(state->counter, &next, &block, 1); - LLVMPositionBuilderAtEnd(builder, after_block); + + state->counter = LLVMBuildLoad(builder, state->counter_var, ""); +} + + +void +lp_build_loop_end(LLVMBuilderRef builder, + LLVMValueRef end, + LLVMValueRef step, + struct lp_build_loop_state *state) +{ + lp_build_loop_end_cond(builder, end, step, LLVMIntNE, state); } @@ -616,24 +277,16 @@ lp_build_loop_end_cond(LLVMBuilderRef builder, Is built with: - LLVMValueRef x = LLVMGetUndef(); // or something else + // x needs an alloca variable + x = lp_build_alloca(builder, type, "x"); - flow = lp_build_flow_create(builder); - lp_build_flow_scope_begin(flow); + lp_build_if(ctx, builder, cond); + LLVMBuildStore(LLVMBuildAdd(1, 2), x); + lp_build_else(ctx); + LLVMBuildStore(LLVMBuildAdd(2, 3). x); + lp_build_endif(ctx); - // x needs a phi node - lp_build_flow_scope_declare(flow, &x); - - lp_build_if(ctx, flow, builder, cond); - x = LLVMAdd(1, 2); - lp_build_else(ctx); - x = LLVMAdd(2, 3); - lp_build_endif(ctx); - - lp_build_flow_scope_end(flow); - - lp_build_flow_destroy(flow); */ @@ -642,47 +295,19 @@ lp_build_loop_end_cond(LLVMBuilderRef builder, * Begin an if/else/endif construct. 
*/ void -lp_build_if(struct lp_build_if_state *ctx, - struct lp_build_flow_context *flow, +lp_build_if(struct lp_build_if_state *ifthen, LLVMBuilderRef builder, LLVMValueRef condition) { LLVMBasicBlockRef block = LLVMGetInsertBlock(builder); - struct lp_build_flow_if *ifthen; - unsigned i; - - memset(ctx, 0, sizeof(*ctx)); - ctx->builder = builder; - ctx->flow = flow; - /* push/create new scope */ - ifthen = &lp_build_flow_push(flow, LP_BUILD_FLOW_IF)->ifthen; - assert(ifthen); - - ifthen->num_variables = flow->num_variables; + memset(ifthen, 0, sizeof *ifthen); + ifthen->builder = builder; ifthen->condition = condition; ifthen->entry_block = block; - /* create a Phi node for each variable in this flow scope */ - ifthen->phi = MALLOC(ifthen->num_variables * sizeof(*ifthen->phi)); - if (!ifthen->phi) { - ifthen->num_variables = 0; - return; - } - /* create endif/merge basic block for the phi functions */ ifthen->merge_block = lp_build_insert_new_block(builder, "endif-block"); - LLVMPositionBuilderAtEnd(builder, ifthen->merge_block); - - /* create a phi node for each variable */ - for (i = 0; i < flow->num_variables; i++) { - ifthen->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), ""); - - /* add add the initial value of the var from the entry block */ - if (!LLVMIsUndef(*flow->variables[i])) - LLVMAddIncoming(ifthen->phi[i], flow->variables[i], - &ifthen->entry_block, 1); - } /* create/insert true_block before merge_block */ ifthen->true_block = LLVMInsertBasicBlock(ifthen->merge_block, "if-true-block"); @@ -696,27 +321,16 @@ lp_build_if(struct lp_build_if_state *ctx, * Begin else-part of a conditional */ void -lp_build_else(struct lp_build_if_state *ctx) +lp_build_else(struct lp_build_if_state *ifthen) { - struct lp_build_flow_context *flow = ctx->flow; - struct lp_build_flow_if *ifthen; - unsigned i; - - ifthen = &lp_build_flow_peek(flow, LP_BUILD_FLOW_IF)->ifthen; - assert(ifthen); - - /* for each variable, update the Phi node with a (variable, block) pair */ - LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block); - for (i = 0; i < flow->num_variables; i++) { - assert(*flow->variables[i]); - LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->true_block, 1); - } + /* Append an unconditional Br(anch) instruction on the true_block */ + LLVMBuildBr(ifthen->builder, ifthen->merge_block); /* create/insert false_block before the merge block */ ifthen->false_block = LLVMInsertBasicBlock(ifthen->merge_block, "if-false-block"); /* successive code goes into the else block */ - LLVMPositionBuilderAtEnd(ctx->builder, ifthen->false_block); + LLVMPositionBuilderAtEnd(ifthen->builder, ifthen->false_block); } @@ -724,75 +338,30 @@ lp_build_else(struct lp_build_if_state *ctx) * End a conditional. 
*/ void -lp_build_endif(struct lp_build_if_state *ctx) +lp_build_endif(struct lp_build_if_state *ifthen) { - struct lp_build_flow_context *flow = ctx->flow; - struct lp_build_flow_if *ifthen; - LLVMBasicBlockRef curBlock = LLVMGetInsertBlock(ctx->builder); - unsigned i; - - ifthen = &lp_build_flow_pop(flow, LP_BUILD_FLOW_IF)->ifthen; - assert(ifthen); - /* Insert branch to the merge block from current block */ - LLVMBuildBr(ctx->builder, ifthen->merge_block); + LLVMBuildBr(ifthen->builder, ifthen->merge_block); - if (ifthen->false_block) { - LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block); - /* for each variable, update the Phi node with a (variable, block) pair */ - for (i = 0; i < flow->num_variables; i++) { - assert(*flow->variables[i]); - LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &curBlock, 1); - /* replace the variable ref with the phi function */ - *flow->variables[i] = ifthen->phi[i]; - } - } - else { - /* no else clause */ - LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block); - for (i = 0; i < flow->num_variables; i++) { - assert(*flow->variables[i]); - LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->true_block, 1); - - /* replace the variable ref with the phi function */ - *flow->variables[i] = ifthen->phi[i]; - } - } - - FREE(ifthen->phi); - - /*** - *** Now patch in the various branch instructions. - ***/ + /* + * Now patch in the various branch instructions. + */ /* Insert the conditional branch instruction at the end of entry_block */ - LLVMPositionBuilderAtEnd(ctx->builder, ifthen->entry_block); + LLVMPositionBuilderAtEnd(ifthen->builder, ifthen->entry_block); if (ifthen->false_block) { /* we have an else clause */ - LLVMBuildCondBr(ctx->builder, ifthen->condition, + LLVMBuildCondBr(ifthen->builder, ifthen->condition, ifthen->true_block, ifthen->false_block); } else { /* no else clause */ - LLVMBuildCondBr(ctx->builder, ifthen->condition, + LLVMBuildCondBr(ifthen->builder, ifthen->condition, ifthen->true_block, ifthen->merge_block); } - /* Insert branch from end of true_block to merge_block */ - if (ifthen->false_block) { - /* Append an unconditional Br(anch) instruction on the true_block */ - LLVMPositionBuilderAtEnd(ctx->builder, ifthen->true_block); - LLVMBuildBr(ctx->builder, ifthen->merge_block); - } - else { - /* No else clause. - * Note that we've already inserted the branch at the end of - * true_block. See the very first LLVMBuildBr() call in this function. - */ - } - /* Resume building code at end of the ifthen->merge_block */ - LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block); + LLVMPositionBuilderAtEnd(ifthen->builder, ifthen->merge_block); } @@ -830,6 +399,7 @@ lp_build_alloca(LLVMBuilderRef builder, } res = LLVMBuildAlloca(first_builder, type, name); + LLVMBuildStore(builder, LLVMConstNull(type), res); LLVMDisposeBuilder(first_builder); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h index fffb493a93b..e729ee6eaac 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h @@ -41,52 +41,49 @@ struct lp_type; -struct lp_build_flow_context; - - -struct lp_build_flow_context * -lp_build_flow_create(LLVMBuilderRef builder); - -void -lp_build_flow_destroy(struct lp_build_flow_context *flow); - -void -lp_build_flow_scope_begin(struct lp_build_flow_context *flow); - -void -lp_build_flow_scope_declare(struct lp_build_flow_context *flow, - LLVMValueRef *variable); +/** + * Early exit. 
Useful to skip to the end of a function or block when + * the execution mask becomes zero or when there is an error condition. + */ +struct lp_build_skip_context +{ + LLVMBuilderRef builder; -void -lp_build_flow_scope_end(struct lp_build_flow_context *flow); + /** Block to skip to */ + LLVMBasicBlockRef block; +}; void -lp_build_flow_skip_begin(struct lp_build_flow_context *flow); +lp_build_flow_skip_begin(struct lp_build_skip_context *ctx, + LLVMBuilderRef builder); void -lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow, +lp_build_flow_skip_cond_break(struct lp_build_skip_context *ctx, LLVMValueRef cond); void -lp_build_flow_skip_end(struct lp_build_flow_context *flow); +lp_build_flow_skip_end(struct lp_build_skip_context *ctx); struct lp_build_mask_context { - struct lp_build_flow_context *flow; + struct lp_build_skip_context skip; LLVMTypeRef reg_type; - LLVMValueRef value; + LLVMValueRef var; }; void lp_build_mask_begin(struct lp_build_mask_context *mask, - struct lp_build_flow_context *flow, + LLVMBuilderRef builder, struct lp_type type, LLVMValueRef value); +LLVMValueRef +lp_build_mask_value(struct lp_build_mask_context *mask); + /** * Bitwise AND the mask with the given value, if a previous mask was set. */ @@ -94,6 +91,9 @@ void lp_build_mask_update(struct lp_build_mask_context *mask, LLVMValueRef value); +void +lp_build_mask_check(struct lp_build_mask_context *mask); + LLVMValueRef lp_build_mask_end(struct lp_build_mask_context *mask); @@ -108,6 +108,7 @@ lp_build_mask_end(struct lp_build_mask_context *mask); struct lp_build_loop_state { LLVMBasicBlockRef block; + LLVMValueRef counter_var; LLVMValueRef counter; }; @@ -128,22 +129,28 @@ void lp_build_loop_end_cond(LLVMBuilderRef builder, LLVMValueRef end, LLVMValueRef step, - int cond, /* LLVM condition */ + LLVMIntPredicate cond, struct lp_build_loop_state *state); +/** + * if/else/endif. 
+ */ struct lp_build_if_state { LLVMBuilderRef builder; - struct lp_build_flow_context *flow; + LLVMValueRef condition; + LLVMBasicBlockRef entry_block; + LLVMBasicBlockRef true_block; + LLVMBasicBlockRef false_block; + LLVMBasicBlockRef merge_block; }; void lp_build_if(struct lp_build_if_state *ctx, - struct lp_build_flow_context *flow, LLVMBuilderRef builder, LLVMValueRef condition); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index 761f33b578d..5598ca5c489 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -44,6 +44,7 @@ static const struct debug_named_value lp_bld_debug_flags[] = { { "asm", GALLIVM_DEBUG_ASM, NULL }, { "nopt", GALLIVM_DEBUG_NO_OPT, NULL }, { "perf", GALLIVM_DEBUG_PERF, NULL }, + { "no_brilinear", GALLIVM_DEBUG_NO_BRILINEAR, NULL }, DEBUG_NAMED_VALUE_END }; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h index f26fdac4663..0b4b1ca7d11 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h @@ -47,4 +47,10 @@ lp_build_init(void); extern void lp_func_delete_body(LLVMValueRef func); + +extern LLVMValueRef +lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal, + const char *Name); + + #endif /* !LP_BLD_INIT_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c index d5c62a3f734..026b60ac36e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c @@ -92,9 +92,23 @@ lp_build_compare(LLVMBuilderRef builder, if(func == PIPE_FUNC_ALWAYS) return ones; - /* TODO: optimize the constant case */ +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + /* + * There are no unsigned integer comparison instructions in SSE. 
+ */ - /* XXX: It is not clear if we should use the ordered or unordered operators */ + if (!type.floating && !type.sign && + type.width * type.length == 128 && + util_cpu_caps.has_sse2 && + (func == PIPE_FUNC_LESS || + func == PIPE_FUNC_LEQUAL || + func == PIPE_FUNC_GREATER || + func == PIPE_FUNC_GEQUAL) && + (gallivm_debug & GALLIVM_DEBUG_PERF)) { + debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n", + __FUNCTION__, type.length, type.width); + } +#endif #if HAVE_LLVM < 0x0207 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) @@ -225,6 +239,8 @@ lp_build_compare(LLVMBuilderRef builder, #endif #endif /* HAVE_LLVM < 0x0207 */ + /* XXX: It is not clear if we should use the ordered or unordered operators */ + if(type.floating) { LLVMRealPredicate op; switch(func) { @@ -446,10 +462,12 @@ lp_build_select(struct lp_build_context *bld, LLVMTypeRef arg_type; LLVMValueRef args[3]; - if (type.width == 64) { + if (type.floating && + type.width == 64) { intrinsic = "llvm.x86.sse41.blendvpd"; arg_type = LLVMVectorType(LLVMDoubleType(), 2); - } else if (type.width == 32) { + } else if (type.floating && + type.width == 32) { intrinsic = "llvm.x86.sse41.blendvps"; arg_type = LLVMVectorType(LLVMFloatType(), 4); } else { diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp index 48baf7c425c..f56ddee7fd7 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp @@ -178,3 +178,13 @@ lp_func_delete_body(LLVMValueRef FF) llvm::Function *func = llvm::unwrap<llvm::Function>(FF); func->deleteBody(); } + + +extern "C" +LLVMValueRef +lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal, + const char *Name) +{ + return llvm::wrap(llvm::unwrap(B)->CreateLoad(llvm::unwrap(PointerVal), true, Name)); +} + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_printf.c b/src/gallium/auxiliary/gallivm/lp_bld_printf.c index 153ba5b15b1..f418e96aff4 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_printf.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.c @@ -29,6 +29,8 @@ #include "util/u_debug.h" #include "util/u_memory.h" +#include "util/u_string.h" +#include "lp_bld_const.h" #include "lp_bld_printf.h" @@ -119,3 +121,22 @@ lp_build_printf(LLVMBuilderRef builder, const char *fmt, ...) return LLVMBuildCall(builder, func_printf, params, argcount + 1, ""); } + + +/** + * Print a float[4] vector. 
+ */ +LLVMValueRef +lp_build_print_vec4(LLVMBuilderRef builder, const char *msg, LLVMValueRef vec) +{ + char format[1000]; + LLVMValueRef x, y, z, w; + + x = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(0), ""); + y = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(1), ""); + z = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(2), ""); + w = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(3), ""); + + util_snprintf(format, sizeof(format), "%s %%f %%f %%f %%f\n", msg); + return lp_build_printf(builder, format, x, y, z, w); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_printf.h b/src/gallium/auxiliary/gallivm/lp_bld_printf.h index 83bd8f1d557..b6222c62ebe 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_printf.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.h @@ -35,5 +35,9 @@ LLVMValueRef lp_build_const_string_variable(LLVMModuleRef module, const char *str, int len); LLVMValueRef lp_build_printf(LLVMBuilderRef builder, const char *fmt, ...); +LLVMValueRef +lp_build_print_vec4(LLVMBuilderRef builder, const char *msg, LLVMValueRef vec); + + #endif diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c index 7b1088939b9..c18c8b47100 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c @@ -81,11 +81,15 @@ LLVMValueRef lp_build_scalar_ddx(struct lp_build_context *bld, LLVMValueRef a) { - LLVMValueRef idx_left = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_LEFT, 0); - LLVMValueRef idx_right = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_RIGHT, 0); - LLVMValueRef a_left = LLVMBuildExtractElement(bld->builder, a, idx_left, ""); - LLVMValueRef a_right = LLVMBuildExtractElement(bld->builder, a, idx_right, ""); - return lp_build_sub(bld, a_right, a_left); + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef idx_left = LLVMConstInt(i32t, LP_BLD_QUAD_TOP_LEFT, 0); + LLVMValueRef idx_right = LLVMConstInt(i32t, LP_BLD_QUAD_TOP_RIGHT, 0); + LLVMValueRef a_left = LLVMBuildExtractElement(bld->builder, a, idx_left, "left"); + LLVMValueRef a_right = LLVMBuildExtractElement(bld->builder, a, idx_right, "right"); + if (bld->type.floating) + return LLVMBuildFSub(bld->builder, a_right, a_left, "ddx"); + else + return LLVMBuildSub(bld->builder, a_right, a_left, "ddx"); } @@ -93,9 +97,13 @@ LLVMValueRef lp_build_scalar_ddy(struct lp_build_context *bld, LLVMValueRef a) { - LLVMValueRef idx_top = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_LEFT, 0); - LLVMValueRef idx_bottom = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_BOTTOM_LEFT, 0); - LLVMValueRef a_top = LLVMBuildExtractElement(bld->builder, a, idx_top, ""); - LLVMValueRef a_bottom = LLVMBuildExtractElement(bld->builder, a, idx_bottom, ""); - return lp_build_sub(bld, a_bottom, a_top); + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef idx_top = LLVMConstInt(i32t, LP_BLD_QUAD_TOP_LEFT, 0); + LLVMValueRef idx_bottom = LLVMConstInt(i32t, LP_BLD_QUAD_BOTTOM_LEFT, 0); + LLVMValueRef a_top = LLVMBuildExtractElement(bld->builder, a, idx_top, "top"); + LLVMValueRef a_bottom = LLVMBuildExtractElement(bld->builder, a, idx_bottom, "bottom"); + if (bld->type.floating) + return LLVMBuildFSub(bld->builder, a_bottom, a_top, "ddy"); + else + return LLVMBuildSub(bld->builder, a_bottom, a_top, "ddy"); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index aee94c1b866..844d1d935b5 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -39,12 +39,19 @@ #include "lp_bld_arit.h" #include "lp_bld_const.h" #include "lp_bld_debug.h" +#include "lp_bld_printf.h" #include "lp_bld_flow.h" #include "lp_bld_sample.h" #include "lp_bld_swizzle.h" #include "lp_bld_type.h" +/* + * Bri-linear factor. Should be greater than one. + */ +#define BRILINEAR_FACTOR 2 + + /** * Does the given texture wrap mode allow sampling the texture border color? * XXX maybe move this into gallium util code. @@ -126,17 +133,32 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, state->wrap_r = sampler->wrap_r; state->min_img_filter = sampler->min_img_filter; state->mag_img_filter = sampler->mag_img_filter; - if (view->last_level) { + + if (view->last_level && sampler->max_lod > 0.0f) { state->min_mip_filter = sampler->min_mip_filter; } else { state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE; } - /* If min_lod == max_lod we can greatly simplify mipmap selection. - * This is a case that occurs during automatic mipmap generation. - */ - if (sampler->min_lod == sampler->max_lod) { - state->min_max_lod_equal = 1; + if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { + if (sampler->lod_bias != 0.0f) { + state->lod_bias_non_zero = 1; + } + + /* If min_lod == max_lod we can greatly simplify mipmap selection. + * This is a case that occurs during automatic mipmap generation. + */ + if (sampler->min_lod == sampler->max_lod) { + state->min_max_lod_equal = 1; + } else { + if (sampler->min_lod > 0.0f) { + state->apply_min_lod = 1; + } + + if (sampler->max_lod < (float)view->last_level) { + state->apply_max_lod = 1; + } + } } state->compare_mode = sampler->compare_mode; @@ -153,6 +175,220 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, /** + * Generate code to compute coordinate gradient (rho). + * \param ddx partial derivatives of (s, t, r, q) with respect to X + * \param ddy partial derivatives of (s, t, r, q) with respect to Y + * + * XXX: The resulting rho is scalar, so we ignore all but the first element of + * derivatives that are passed by the shader. 
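[Editor's note: a scalar model of the quantity lp_build_rho() computes, matching the per-component math the previous open-coded version used; an illustrative sketch for the 2D case, not code from the patch.]

#include <math.h>

/* rho is the largest rate of change of any coordinate, in texel units,
 * taken over both the x and y derivatives; the LOD is then log2(rho). */
static float
rho_2d_model(float dsdx, float dsdy, float dtdx, float dtdy,
             float width, float height)
{
   float rho_s = fmaxf(fabsf(dsdx), fabsf(dsdy)) * width;
   float rho_t = fmaxf(fabsf(dtdx), fabsf(dtdy)) * height;
   return fmaxf(rho_s, rho_t);
}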
+ */ +static LLVMValueRef +lp_build_rho(struct lp_build_sample_context *bld, + const LLVMValueRef ddx[4], + const LLVMValueRef ddy[4]) +{ + struct lp_build_context *float_size_bld = &bld->float_size_bld; + struct lp_build_context *float_bld = &bld->float_bld; + const unsigned dims = bld->dims; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); + LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0); + LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0); + LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy; + LLVMValueRef rho_x, rho_y; + LLVMValueRef rho_vec; + LLVMValueRef float_size; + LLVMValueRef rho; + + dsdx = ddx[0]; + dsdy = ddy[0]; + + if (dims <= 1) { + rho_x = dsdx; + rho_y = dsdy; + } + else { + rho_x = float_size_bld->undef; + rho_y = float_size_bld->undef; + + rho_x = LLVMBuildInsertElement(bld->builder, rho_x, dsdx, index0, ""); + rho_y = LLVMBuildInsertElement(bld->builder, rho_y, dsdy, index0, ""); + + dtdx = ddx[1]; + dtdy = ddy[1]; + + rho_x = LLVMBuildInsertElement(bld->builder, rho_x, dtdx, index1, ""); + rho_y = LLVMBuildInsertElement(bld->builder, rho_y, dtdy, index1, ""); + + if (dims >= 3) { + drdx = ddx[2]; + drdy = ddy[2]; + + rho_x = LLVMBuildInsertElement(bld->builder, rho_x, drdx, index2, ""); + rho_y = LLVMBuildInsertElement(bld->builder, rho_y, drdy, index2, ""); + } + } + + rho_x = lp_build_abs(float_size_bld, rho_x); + rho_y = lp_build_abs(float_size_bld, rho_y); + + rho_vec = lp_build_max(float_size_bld, rho_x, rho_y); + + float_size = lp_build_int_to_float(float_size_bld, bld->int_size); + + rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size); + + if (dims <= 1) { + rho = rho_vec; + } + else { + if (dims >= 2) { + LLVMValueRef rho_s, rho_t, rho_r; + + rho_s = LLVMBuildExtractElement(bld->builder, rho_vec, index0, ""); + rho_t = LLVMBuildExtractElement(bld->builder, rho_vec, index1, ""); + + rho = lp_build_max(float_bld, rho_s, rho_t); + + if (dims >= 3) { + rho_r = LLVMBuildExtractElement(bld->builder, rho_vec, index0, ""); + rho = lp_build_max(float_bld, rho, rho_r); + } + } + } + + return rho; +} + + +/* + * Bri-linear lod computation + * + * Use a piece-wise linear approximation of log2 such that: + * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc. + * - linear approximation for values in the neighborhood of 0.5, 1.5., etc, + * with the steepness specified in 'factor' + * - exact result for 0.5, 1.5, etc. + * + * + * 1.0 - /----* + * / + * / + * / + * 0.5 - * + * / + * / + * / + * 0.0 - *----/ + * + * | | + * 2^0 2^1 + * + * This is a technique also commonly used in hardware: + * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html + * + * TODO: For correctness, this should only be applied when texture is known to + * have regular mipmaps, i.e., mipmaps derived from the base level. + * + * TODO: This could be done in fixed point, where applicable. 
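[Editor's note: a scalar sketch of the bri-linear split that lp_build_brilinear_lod() below emits as LLVM IR, using plain doubles instead of vectors and a factor as described above (e.g. BRILINEAR_FACTOR = 2). Not part of the patch.]

#include <math.h>

static void
brilinear_model(double lod, double factor, int *ipart, double *fpart)
{
   double pre_offset  = (factor - 0.5) / factor - 0.5;
   double post_offset = 1.0 - factor;
   double x = lod + pre_offset;          /* bias so the rounding regions line up */

   *ipart = (int)floor(x);               /* integer mip level */
   *fpart = (x - floor(x)) * factor      /* steeper ramp between levels ... */
          + post_offset;                 /* ... shifted so lod = n + 0.5 stays exact */
}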
+ */ +static void +lp_build_brilinear_lod(struct lp_build_context *bld, + LLVMValueRef lod, + double factor, + LLVMValueRef *out_lod_ipart, + LLVMValueRef *out_lod_fpart) +{ + LLVMValueRef lod_fpart; + double pre_offset = (factor - 0.5)/factor - 0.5; + double post_offset = 1 - factor; + + if (0) { + lp_build_printf(bld->builder, "lod = %f\n", lod); + } + + lod = lp_build_add(bld, lod, + lp_build_const_vec(bld->type, pre_offset)); + + lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart); + + lod_fpart = lp_build_mul(bld, lod_fpart, + lp_build_const_vec(bld->type, factor)); + + lod_fpart = lp_build_add(bld, lod_fpart, + lp_build_const_vec(bld->type, post_offset)); + + /* + * It's not necessary to clamp lod_fpart since: + * - the above expression will never produce numbers greater than one. + * - the mip filtering branch is only taken if lod_fpart is positive + */ + + *out_lod_fpart = lod_fpart; + + if (0) { + lp_build_printf(bld->builder, "lod_ipart = %i\n", *out_lod_ipart); + lp_build_printf(bld->builder, "lod_fpart = %f\n\n", *out_lod_fpart); + } +} + + +/* + * Combined log2 and brilinear lod computation. + * + * It's identical to calling lp_build_fast_log2() and + * lp_build_brilinear_lod() above, but by combining them we can compute the integer + * and fractional part independently. + */ +static void +lp_build_brilinear_rho(struct lp_build_context *bld, + LLVMValueRef rho, + double factor, + LLVMValueRef *out_lod_ipart, + LLVMValueRef *out_lod_fpart) +{ + LLVMValueRef lod_ipart; + LLVMValueRef lod_fpart; + + const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor); + const double post_offset = 1 - 2*factor; + + assert(bld->type.floating); + + assert(lp_check_value(bld->type, rho)); + + /* + * The pre factor will make the intersections with the exact powers of two + * happen precisely where we want them to be, which means that the integer + * part will not need any post adjustments. + */ + rho = lp_build_mul(bld, rho, + lp_build_const_vec(bld->type, pre_factor)); + + /* ipart = ifloor(log2(rho)) */ + lod_ipart = lp_build_extract_exponent(bld, rho, 0); + + /* fpart = rho / 2**ipart */ + lod_fpart = lp_build_extract_mantissa(bld, rho); + + lod_fpart = lp_build_mul(bld, lod_fpart, + lp_build_const_vec(bld->type, factor)); + + lod_fpart = lp_build_add(bld, lod_fpart, + lp_build_const_vec(bld->type, post_offset)); + + /* + * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since: + * - the above expression will never produce numbers greater than one. + * - the mip filtering branch is only taken if lod_fpart is positive + */ + + *out_lod_ipart = lod_ipart; + *out_lod_fpart = lod_fpart; +} + + +/** * Generate code to compute texture level of detail (lambda). * \param ddx partial derivatives of (s, t, r, q) with respect to X * \param ddy partial derivatives of (s, t, r, q) with respect to Y @@ -165,85 +401,81 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, * XXX: The resulting lod is scalar, so ignore all but the first element of * derivatives, lod_bias, etc that are passed by the shader.
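[Editor's note: a scalar summary, written as hypothetical C, of the post-log2 adjustments the rewritten lp_build_lod_selector() below applies; the flag parameters mirror the new lp_sampler_static_state bits (lod_bias_non_zero, apply_min_lod, apply_max_lod). An editorial sketch, not part of the patch.]

#include <math.h>

static float
lod_selector_model(float rho, float shader_lod_bias,
                   int lod_bias_non_zero, float sampler_lod_bias,
                   int apply_max_lod, float max_lod,
                   int apply_min_lod, float min_lod)
{
   float lod = log2f(rho);             /* lod = log2(rho) */

   lod += shader_lod_bias;             /* optional bias from the shader */
   if (lod_bias_non_zero)
      lod += sampler_lod_bias;         /* sampler bias, only when known non-zero */
   if (apply_max_lod)
      lod = fminf(lod, max_lod);       /* only when max_lod < last_level */
   if (apply_min_lod)
      lod = fmaxf(lod, min_lod);       /* only when min_lod > 0 */

   return lod;                         /* then split into ipart/fpart per mip filter */
}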
*/ -LLVMValueRef +void lp_build_lod_selector(struct lp_build_sample_context *bld, unsigned unit, const LLVMValueRef ddx[4], const LLVMValueRef ddy[4], LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ - LLVMValueRef width, - LLVMValueRef height, - LLVMValueRef depth) + unsigned mip_filter, + LLVMValueRef *out_lod_ipart, + LLVMValueRef *out_lod_fpart) { - LLVMValueRef min_lod = - bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit); + struct lp_build_context *float_bld = &bld->float_bld; + LLVMValueRef lod; + + *out_lod_ipart = bld->int_bld.zero; + *out_lod_fpart = bld->float_bld.zero; if (bld->static_state->min_max_lod_equal) { /* User is forcing sampling from a particular mipmap level. * This is hit during mipmap generation. */ - return min_lod; + LLVMValueRef min_lod = + bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit); + + lod = min_lod; } else { - struct lp_build_context *float_bld = &bld->float_bld; LLVMValueRef sampler_lod_bias = bld->dynamic_state->lod_bias(bld->dynamic_state, bld->builder, unit); - LLVMValueRef max_lod = - bld->dynamic_state->max_lod(bld->dynamic_state, bld->builder, unit); LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0); - LLVMValueRef lod; if (explicit_lod) { lod = LLVMBuildExtractElement(bld->builder, explicit_lod, index0, ""); } else { - const int dims = texture_dims(bld->static_state->target); - LLVMValueRef dsdx, dsdy; - LLVMValueRef dtdx = NULL, dtdy = NULL, drdx = NULL, drdy = NULL; LLVMValueRef rho; - dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx"); - dsdx = lp_build_abs(float_bld, dsdx); - dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy"); - dsdy = lp_build_abs(float_bld, dsdy); - if (dims > 1) { - dtdx = LLVMBuildExtractElement(bld->builder, ddx[1], index0, "dtdx"); - dtdx = lp_build_abs(float_bld, dtdx); - dtdy = LLVMBuildExtractElement(bld->builder, ddy[1], index0, "dtdy"); - dtdy = lp_build_abs(float_bld, dtdy); - if (dims > 2) { - drdx = LLVMBuildExtractElement(bld->builder, ddx[2], index0, "drdx"); - drdx = lp_build_abs(float_bld, drdx); - drdy = LLVMBuildExtractElement(bld->builder, ddy[2], index0, "drdy"); - drdy = lp_build_abs(float_bld, drdy); - } - } + rho = lp_build_rho(bld, ddx, ddy); - /* Compute rho = max of all partial derivatives scaled by texture size. - * XXX this could be vectorized somewhat + /* + * Compute lod = log2(rho) */ - rho = LLVMBuildFMul(bld->builder, - lp_build_max(float_bld, dsdx, dsdy), - lp_build_int_to_float(float_bld, width), ""); - if (dims > 1) { - LLVMValueRef max; - max = LLVMBuildFMul(bld->builder, - lp_build_max(float_bld, dtdx, dtdy), - lp_build_int_to_float(float_bld, height), ""); - rho = lp_build_max(float_bld, rho, max); - if (dims > 2) { - max = LLVMBuildFMul(bld->builder, - lp_build_max(float_bld, drdx, drdy), - lp_build_int_to_float(float_bld, depth), ""); - rho = lp_build_max(float_bld, rho, max); + + if (!lod_bias && + !bld->static_state->lod_bias_non_zero && + !bld->static_state->apply_max_lod && + !bld->static_state->apply_min_lod) { + /* + * Special case when there are no post-log2 adjustments, which + * saves instructions but keeping the integer and fractional lod + * computations separate from the start. 
+ */ + + if (mip_filter == PIPE_TEX_MIPFILTER_NONE || + mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { + *out_lod_ipart = lp_build_ilog2(float_bld, rho); + *out_lod_fpart = bld->float_bld.zero; + return; + } + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR && + !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) { + lp_build_brilinear_rho(float_bld, rho, BRILINEAR_FACTOR, + out_lod_ipart, out_lod_fpart); + return; } } - /* compute lod = log2(rho) */ - lod = lp_build_log2(float_bld, rho); + if (0) { + lod = lp_build_log2(float_bld, rho); + } + else { + lod = lp_build_fast_log2(float_bld, rho); + } /* add shader lod bias */ if (lod_bias) { @@ -254,13 +486,43 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, } /* add sampler lod bias */ - lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias"); + if (bld->static_state->lod_bias_non_zero) + lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias"); + /* clamp lod */ - lod = lp_build_clamp(float_bld, lod, min_lod, max_lod); + if (bld->static_state->apply_max_lod) { + LLVMValueRef max_lod = + bld->dynamic_state->max_lod(bld->dynamic_state, bld->builder, unit); + + lod = lp_build_min(float_bld, lod, max_lod); + } + if (bld->static_state->apply_min_lod) { + LLVMValueRef min_lod = + bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit); + + lod = lp_build_max(float_bld, lod, min_lod); + } + } - return lod; + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { + if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) { + lp_build_brilinear_lod(float_bld, lod, BRILINEAR_FACTOR, + out_lod_ipart, out_lod_fpart); + } + else { + lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, out_lod_fpart); + } + + lp_build_name(*out_lod_fpart, "lod_fpart"); + } + else { + *out_lod_ipart = lp_build_iround(float_bld, lod); } + + lp_build_name(*out_lod_ipart, "lod_ipart"); + + return; } @@ -274,10 +536,9 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, void lp_build_nearest_mip_level(struct lp_build_sample_context *bld, unsigned unit, - LLVMValueRef lod, + LLVMValueRef lod_ipart, LLVMValueRef *level_out) { - struct lp_build_context *float_bld = &bld->float_bld; struct lp_build_context *int_bld = &bld->int_bld; LLVMValueRef last_level, level; @@ -287,7 +548,7 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld, bld->builder, unit); /* convert float lod to integer */ - level = lp_build_iround(float_bld, lod); + level = lod_ipart; /* clamp level to legal range of levels */ *level_out = lp_build_clamp(int_bld, level, zero, last_level); @@ -302,32 +563,61 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld, void lp_build_linear_mip_levels(struct lp_build_sample_context *bld, unsigned unit, - LLVMValueRef lod, + LLVMValueRef lod_ipart, + LLVMValueRef *lod_fpart_inout, LLVMValueRef *level0_out, - LLVMValueRef *level1_out, - LLVMValueRef *weight_out) + LLVMValueRef *level1_out) { - struct lp_build_context *float_bld = &bld->float_bld; + LLVMBuilderRef builder = bld->builder; struct lp_build_context *int_bld = &bld->int_bld; - LLVMValueRef last_level, level; + struct lp_build_context *float_bld = &bld->float_bld; + LLVMValueRef last_level; + LLVMValueRef clamp_min; + LLVMValueRef clamp_max; + + *level0_out = lod_ipart; + *level1_out = lp_build_add(int_bld, lod_ipart, int_bld->one); last_level = bld->dynamic_state->last_level(bld->dynamic_state, bld->builder, unit); - /* convert float lod to integer */ - level = lp_build_ifloor(float_bld, lod); - - /* compute level 0 and clamp to legal range 
of levels */ - *level0_out = lp_build_clamp(int_bld, level, - int_bld->zero, - last_level); - /* compute level 1 and clamp to legal range of levels */ - level = lp_build_add(int_bld, level, int_bld->one); - *level1_out = lp_build_clamp(int_bld, level, - int_bld->zero, - last_level); - - *weight_out = lp_build_fract(float_bld, lod); + /* + * Clamp both lod_ipart and lod_ipart + 1 to [0, last_level], with the + * minimum number of comparisons, and zeroing lod_fpart in the extreme + * ends in the process. + */ + + /* lod_ipart < 0 */ + clamp_min = LLVMBuildICmp(builder, LLVMIntSLT, + lod_ipart, int_bld->zero, + "clamp_lod_to_zero"); + + *level0_out = LLVMBuildSelect(builder, clamp_min, + int_bld->zero, *level0_out, ""); + + *level1_out = LLVMBuildSelect(builder, clamp_min, + int_bld->zero, *level1_out, ""); + + *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min, + float_bld->zero, *lod_fpart_inout, ""); + + /* lod_ipart >= last_level */ + clamp_max = LLVMBuildICmp(builder, LLVMIntSGE, + lod_ipart, last_level, + "clamp_lod_to_last"); + + *level0_out = LLVMBuildSelect(builder, clamp_max, + last_level, *level0_out, ""); + + *level1_out = LLVMBuildSelect(builder, clamp_max, + last_level, *level1_out, ""); + + *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max, + float_bld->zero, *lod_fpart_inout, ""); + + lp_build_name(*level0_out, "sampler%u_miplevel0", unit); + lp_build_name(*level1_out, "sampler%u_miplevel1", unit); + lp_build_name(*lod_fpart_inout, "sampler%u_mipweight", unit); } @@ -338,12 +628,12 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, */ LLVMValueRef lp_build_get_mipmap_level(struct lp_build_sample_context *bld, - LLVMValueRef data_array, LLVMValueRef level) + LLVMValueRef level) { LLVMValueRef indexes[2], data_ptr; indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); indexes[1] = level; - data_ptr = LLVMBuildGEP(bld->builder, data_array, indexes, 2, ""); + data_ptr = LLVMBuildGEP(bld->builder, bld->data_array, indexes, 2, ""); data_ptr = LLVMBuildLoad(bld->builder, data_ptr, ""); return data_ptr; } @@ -351,10 +641,10 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld, LLVMValueRef lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, - LLVMValueRef data_array, int level) + int level) { LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0); - return lp_build_get_mipmap_level(bld, data_array, lvl); + return lp_build_get_mipmap_level(bld, lvl); } @@ -363,18 +653,22 @@ lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, * Return max(1, base_size >> level); */ static LLVMValueRef -lp_build_minify(struct lp_build_sample_context *bld, +lp_build_minify(struct lp_build_context *bld, LLVMValueRef base_size, LLVMValueRef level) { - if (level == bld->int_coord_bld.zero) { + assert(lp_check_value(bld->type, base_size)); + assert(lp_check_value(bld->type, level)); + + if (level == bld->zero) { /* if we're using mipmap level zero, no minification is needed */ return base_size; } else { LLVMValueRef size = LLVMBuildLShr(bld->builder, base_size, level, "minify"); - size = lp_build_max(&bld->int_coord_bld, size, bld->int_coord_bld.one); + assert(bld->type.sign); + size = lp_build_max(bld, size, bld->one); return size; } } @@ -405,71 +699,113 @@ lp_build_get_level_stride_vec(struct lp_build_sample_context *bld, */ void lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld, - unsigned dims, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, - LLVMValueRef ilevel0, - LLVMValueRef ilevel1, - 
LLVMValueRef row_stride_array, - LLVMValueRef img_stride_array, - LLVMValueRef *width0_vec, - LLVMValueRef *width1_vec, - LLVMValueRef *height0_vec, - LLVMValueRef *height1_vec, - LLVMValueRef *depth0_vec, - LLVMValueRef *depth1_vec, - LLVMValueRef *row_stride0_vec, - LLVMValueRef *row_stride1_vec, - LLVMValueRef *img_stride0_vec, - LLVMValueRef *img_stride1_vec) + LLVMValueRef ilevel, + LLVMValueRef *out_size, + LLVMValueRef *row_stride_vec, + LLVMValueRef *img_stride_vec) { - const unsigned mip_filter = bld->static_state->min_mip_filter; - LLVMValueRef ilevel0_vec, ilevel1_vec; + const unsigned dims = bld->dims; + LLVMValueRef ilevel_vec; - ilevel0_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel0); - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) - ilevel1_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel1); + ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel); /* - * Compute width, height, depth at mipmap level 'ilevel0' + * Compute width, height, depth at mipmap level 'ilevel' */ - *width0_vec = lp_build_minify(bld, width_vec, ilevel0_vec); + *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec); + if (dims >= 2) { - *height0_vec = lp_build_minify(bld, height_vec, ilevel0_vec); - *row_stride0_vec = lp_build_get_level_stride_vec(bld, - row_stride_array, - ilevel0); + *row_stride_vec = lp_build_get_level_stride_vec(bld, + bld->row_stride_array, + ilevel); if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) { - *img_stride0_vec = lp_build_get_level_stride_vec(bld, - img_stride_array, - ilevel0); - if (dims == 3) { - *depth0_vec = lp_build_minify(bld, depth_vec, ilevel0_vec); - } + *img_stride_vec = lp_build_get_level_stride_vec(bld, + bld->img_stride_array, + ilevel); } } - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* compute width, height, depth for second mipmap level at 'ilevel1' */ - *width1_vec = lp_build_minify(bld, width_vec, ilevel1_vec); - if (dims >= 2) { - *height1_vec = lp_build_minify(bld, height_vec, ilevel1_vec); - *row_stride1_vec = lp_build_get_level_stride_vec(bld, - row_stride_array, - ilevel1); - if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) { - *img_stride1_vec = lp_build_get_level_stride_vec(bld, - img_stride_array, - ilevel1); - if (dims == 3) { - *depth1_vec = lp_build_minify(bld, depth_vec, ilevel1_vec); - } - } +} + + +/** + * Extract and broadcast texture size. + * + * @param size_type type of the texture size vector (either + * bld->int_size_type or bld->float_size_type) + * @param coord_type type of the texture size vector (either + * bld->int_coord_type or bld->coord_type) + * @param int_size vector with the integer texture size (width, height, + * depth) + */ +void +lp_build_extract_image_sizes(struct lp_build_sample_context *bld, + struct lp_type size_type, + struct lp_type coord_type, + LLVMValueRef size, + LLVMValueRef *out_width, + LLVMValueRef *out_height, + LLVMValueRef *out_depth) +{ + const unsigned dims = bld->dims; + LLVMTypeRef i32t = LLVMInt32Type(); + + *out_width = lp_build_extract_broadcast(bld->builder, + size_type, + coord_type, + size, + LLVMConstInt(i32t, 0, 0)); + if (dims >= 2) { + *out_height = lp_build_extract_broadcast(bld->builder, + size_type, + coord_type, + size, + LLVMConstInt(i32t, 1, 0)); + if (dims == 3) { + *out_depth = lp_build_extract_broadcast(bld->builder, + size_type, + coord_type, + size, + LLVMConstInt(i32t, 2, 0)); } } } +/** + * Unnormalize coords. 
+ * + * @param int_size vector with the integer texture size (width, height, depth) + */ +void +lp_build_unnormalized_coords(struct lp_build_sample_context *bld, + LLVMValueRef flt_size, + LLVMValueRef *s, + LLVMValueRef *t, + LLVMValueRef *r) +{ + const unsigned dims = bld->dims; + LLVMValueRef width; + LLVMValueRef height; + LLVMValueRef depth; + + lp_build_extract_image_sizes(bld, + bld->float_size_type, + bld->coord_type, + flt_size, + &width, + &height, + &depth); + + /* s = s * width, t = t * height */ + *s = lp_build_mul(&bld->coord_bld, *s, width); + if (dims >= 2) { + *t = lp_build_mul(&bld->coord_bld, *t, height); + if (dims >= 3) { + *r = lp_build_mul(&bld->coord_bld, *r, depth); + } + } +} + /** Helper used by lp_build_cube_lookup() */ static LLVMValueRef @@ -588,25 +924,16 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, rz_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rz, float_bld->zero, ""); { - struct lp_build_flow_context *flow_ctx; struct lp_build_if_state if_ctx; + LLVMValueRef face_s_var; + LLVMValueRef face_t_var; + LLVMValueRef face_var; - flow_ctx = lp_build_flow_create(bld->builder); - lp_build_flow_scope_begin(flow_ctx); - - *face_s = bld->coord_bld.undef; - *face_t = bld->coord_bld.undef; - *face = bld->int_bld.undef; + face_s_var = lp_build_alloca(bld->builder, bld->coord_bld.vec_type, "face_s_var"); + face_t_var = lp_build_alloca(bld->builder, bld->coord_bld.vec_type, "face_t_var"); + face_var = lp_build_alloca(bld->builder, bld->int_bld.vec_type, "face_var"); - lp_build_name(*face_s, "face_s"); - lp_build_name(*face_t, "face_t"); - lp_build_name(*face, "face"); - - lp_build_flow_scope_declare(flow_ctx, face_s); - lp_build_flow_scope_declare(flow_ctx, face_t); - lp_build_flow_scope_declare(flow_ctx, face); - - lp_build_if(&if_ctx, flow_ctx, bld->builder, arx_ge_ary_arz); + lp_build_if(&if_ctx, bld->builder, arx_ge_ary_arz); { /* +/- X face */ LLVMValueRef sign = lp_build_sgn(float_bld, rx); @@ -616,57 +943,52 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, *face = lp_build_cube_face(bld, rx, PIPE_TEX_FACE_POS_X, PIPE_TEX_FACE_NEG_X); + LLVMBuildStore(bld->builder, *face_s, face_s_var); + LLVMBuildStore(bld->builder, *face_t, face_t_var); + LLVMBuildStore(bld->builder, *face, face_var); } lp_build_else(&if_ctx); { - struct lp_build_flow_context *flow_ctx2; struct lp_build_if_state if_ctx2; - LLVMValueRef face_s2 = bld->coord_bld.undef; - LLVMValueRef face_t2 = bld->coord_bld.undef; - LLVMValueRef face2 = bld->int_bld.undef; - - flow_ctx2 = lp_build_flow_create(bld->builder); - lp_build_flow_scope_begin(flow_ctx2); - lp_build_flow_scope_declare(flow_ctx2, &face_s2); - lp_build_flow_scope_declare(flow_ctx2, &face_t2); - lp_build_flow_scope_declare(flow_ctx2, &face2); - ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, ""); - lp_build_if(&if_ctx2, flow_ctx2, bld->builder, ary_ge_arx_arz); + lp_build_if(&if_ctx2, bld->builder, ary_ge_arx_arz); { /* +/- Y face */ LLVMValueRef sign = lp_build_sgn(float_bld, ry); LLVMValueRef ima = lp_build_cube_ima(coord_bld, t); - face_s2 = lp_build_cube_coord(coord_bld, NULL, -1, s, ima); - face_t2 = lp_build_cube_coord(coord_bld, sign, -1, r, ima); - face2 = lp_build_cube_face(bld, ry, + *face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima); + *face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima); + *face = lp_build_cube_face(bld, ry, PIPE_TEX_FACE_POS_Y, PIPE_TEX_FACE_NEG_Y); + LLVMBuildStore(bld->builder, *face_s, face_s_var); + LLVMBuildStore(bld->builder, *face_t, 
face_t_var); + LLVMBuildStore(bld->builder, *face, face_var); } lp_build_else(&if_ctx2); { /* +/- Z face */ LLVMValueRef sign = lp_build_sgn(float_bld, rz); LLVMValueRef ima = lp_build_cube_ima(coord_bld, r); - face_s2 = lp_build_cube_coord(coord_bld, sign, -1, s, ima); - face_t2 = lp_build_cube_coord(coord_bld, NULL, +1, t, ima); - face2 = lp_build_cube_face(bld, rz, + *face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima); + *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima); + *face = lp_build_cube_face(bld, rz, PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z); + LLVMBuildStore(bld->builder, *face_s, face_s_var); + LLVMBuildStore(bld->builder, *face_t, face_t_var); + LLVMBuildStore(bld->builder, *face, face_var); } lp_build_endif(&if_ctx2); - lp_build_flow_scope_end(flow_ctx2); - lp_build_flow_destroy(flow_ctx2); - *face_s = face_s2; - *face_t = face_t2; - *face = face2; } lp_build_endif(&if_ctx); - lp_build_flow_scope_end(flow_ctx); - lp_build_flow_destroy(flow_ctx); + + *face_s = LLVMBuildLoad(bld->builder, face_s_var, "face_s"); + *face_t = LLVMBuildLoad(bld->builder, face_t_var, "face_t"); + *face = LLVMBuildLoad(bld->builder, face_var, "face"); } } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index 4d2eeaa5eb4..ffed27cee83 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -83,6 +83,9 @@ struct lp_sampler_static_state unsigned compare_func:3; unsigned normalized_coords:1; unsigned min_max_lod_equal:1; /**< min_lod == max_lod ? */ + unsigned lod_bias_non_zero:1; + unsigned apply_min_lod:1; /**< min_lod > 0 ? */ + unsigned apply_max_lod:1; /**< max_lod < last_level ? */ }; @@ -176,6 +179,9 @@ struct lp_build_sample_context const struct util_format_description *format_desc; + /* See texture_dims() */ + unsigned dims; + /** regular scalar float type */ struct lp_type float_type; struct lp_build_context float_bld; @@ -191,17 +197,32 @@ struct lp_build_sample_context struct lp_type coord_type; struct lp_build_context coord_bld; - /** Unsigned integer coordinates */ - struct lp_type uint_coord_type; - struct lp_build_context uint_coord_bld; - /** Signed integer coordinates */ struct lp_type int_coord_type; struct lp_build_context int_coord_bld; + /** Unsigned integer texture size */ + struct lp_type int_size_type; + struct lp_build_context int_size_bld; + + /** Unsigned integer texture size */ + struct lp_type float_size_type; + struct lp_build_context float_size_bld; + /** Output texels type and build context */ struct lp_type texel_type; struct lp_build_context texel_bld; + + /* Common dynamic state values */ + LLVMValueRef width; + LLVMValueRef height; + LLVMValueRef depth; + LLVMValueRef row_stride_array; + LLVMValueRef img_stride_array; + LLVMValueRef data_array; + + /** Integer vector with texture width, height, depth */ + LLVMValueRef int_size; }; @@ -238,7 +259,7 @@ apply_sampler_swizzle(struct lp_build_sample_context *bld, } -static INLINE int +static INLINE unsigned texture_dims(enum pipe_texture_target tex) { switch (tex) { @@ -271,16 +292,16 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, const struct pipe_sampler_state *sampler); -LLVMValueRef +void lp_build_lod_selector(struct lp_build_sample_context *bld, unsigned unit, const LLVMValueRef ddx[4], const LLVMValueRef ddy[4], LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ - LLVMValueRef width, - LLVMValueRef height, - LLVMValueRef depth); + unsigned 
mip_filter, + LLVMValueRef *out_lod_ipart, + LLVMValueRef *out_lod_fpart); void lp_build_nearest_mip_level(struct lp_build_sample_context *bld, @@ -291,40 +312,44 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld, void lp_build_linear_mip_levels(struct lp_build_sample_context *bld, unsigned unit, - LLVMValueRef lod, + LLVMValueRef lod_ipart, + LLVMValueRef *lod_fpart_inout, LLVMValueRef *level0_out, - LLVMValueRef *level1_out, - LLVMValueRef *weight_out); + LLVMValueRef *level1_out); LLVMValueRef lp_build_get_mipmap_level(struct lp_build_sample_context *bld, - LLVMValueRef data_array, LLVMValueRef level); + LLVMValueRef level); LLVMValueRef lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, - LLVMValueRef data_array, int level); + int level); void lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld, - unsigned dims, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, - LLVMValueRef ilevel0, - LLVMValueRef ilevel1, - LLVMValueRef row_stride_array, - LLVMValueRef img_stride_array, - LLVMValueRef *width0_vec, - LLVMValueRef *width1_vec, - LLVMValueRef *height0_vec, - LLVMValueRef *height1_vec, - LLVMValueRef *depth0_vec, - LLVMValueRef *depth1_vec, - LLVMValueRef *row_stride0_vec, - LLVMValueRef *row_stride1_vec, - LLVMValueRef *img_stride0_vec, - LLVMValueRef *img_stride1_vec); + LLVMValueRef ilevel, + LLVMValueRef *out_size_vec, + LLVMValueRef *row_stride_vec, + LLVMValueRef *img_stride_vec); + + +void +lp_build_extract_image_sizes(struct lp_build_sample_context *bld, + struct lp_type size_type, + struct lp_type coord_type, + LLVMValueRef size, + LLVMValueRef *out_width, + LLVMValueRef *out_height, + LLVMValueRef *out_depth); + + +void +lp_build_unnormalized_coords(struct lp_build_sample_context *bld, + LLVMValueRef flt_size, + LLVMValueRef *s, + LLVMValueRef *t, + LLVMValueRef *r); void diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c index 49a6eed615f..d6831a580b3 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c @@ -45,6 +45,7 @@ #include "lp_bld_const.h" #include "lp_bld_conv.h" #include "lp_bld_arit.h" +#include "lp_bld_bitarit.h" #include "lp_bld_logic.h" #include "lp_bld_swizzle.h" #include "lp_bld_pack.h" @@ -80,11 +81,10 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, LLVMValueRef *out_offset, LLVMValueRef *out_i) { - struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; LLVMValueRef length_minus_one; - length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one); + length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); switch(wrap_mode) { case PIPE_TEX_WRAP_REPEAT: @@ -92,7 +92,7 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, ""); else { /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024); + LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); coord = LLVMBuildAdd(bld->builder, coord, bias, ""); coord = LLVMBuildURem(bld->builder, coord, length, ""); } @@ -113,7 +113,7 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, assert(0); } - lp_build_sample_partial_offset(uint_coord_bld, block_length, coord, stride, + 
lp_build_sample_partial_offset(int_coord_bld, block_length, coord, stride, out_offset, out_i); } @@ -146,7 +146,6 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, LLVMValueRef *i0, LLVMValueRef *i1) { - struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; LLVMValueRef length_minus_one; LLVMValueRef lmask, umask, mask; @@ -188,8 +187,8 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, * multiplication. */ - *i0 = uint_coord_bld->zero; - *i1 = uint_coord_bld->zero; + *i0 = int_coord_bld->zero; + *i1 = int_coord_bld->zero; length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); @@ -200,7 +199,7 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, } else { /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024); + LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); coord0 = LLVMBuildAdd(bld->builder, coord0, bias, ""); coord0 = LLVMBuildURem(bld->builder, coord0, length, ""); } @@ -208,9 +207,9 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, mask = lp_build_compare(bld->builder, int_coord_bld->type, PIPE_FUNC_NOTEQUAL, coord0, length_minus_one); - *offset0 = lp_build_mul(uint_coord_bld, coord0, stride); + *offset0 = lp_build_mul(int_coord_bld, coord0, stride); *offset1 = LLVMBuildAnd(bld->builder, - lp_build_add(uint_coord_bld, *offset0, stride), + lp_build_add(int_coord_bld, *offset0, stride), mask, ""); break; @@ -225,8 +224,8 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, mask = LLVMBuildAnd(bld->builder, lmask, umask, ""); - *offset0 = lp_build_mul(uint_coord_bld, coord0, stride); - *offset1 = lp_build_add(uint_coord_bld, + *offset0 = lp_build_mul(int_coord_bld, coord0, stride); + *offset1 = lp_build_add(int_coord_bld, *offset0, LLVMBuildAnd(bld->builder, stride, mask, "")); break; @@ -239,8 +238,8 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: default: assert(0); - *offset0 = uint_coord_bld->zero; - *offset1 = uint_coord_bld->zero; + *offset0 = int_coord_bld->zero; + *offset1 = int_coord_bld->zero; break; } } @@ -253,9 +252,7 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, */ static void lp_build_sample_image_nearest(struct lp_build_sample_context *bld, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, + LLVMValueRef int_size, LLVMValueRef row_stride_vec, LLVMValueRef img_stride_vec, LLVMValueRef data_ptr, @@ -265,12 +262,13 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef *colors_lo, LLVMValueRef *colors_hi) { - const int dims = texture_dims(bld->static_state->target); + const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->builder; struct lp_build_context i32, h16, u8n; LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type; LLVMValueRef i32_c8; - LLVMValueRef s_ipart, t_ipart, r_ipart; + LLVMValueRef width_vec, height_vec, depth_vec; + LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL; LLVMValueRef x_stride; LLVMValueRef x_offset, offset; LLVMValueRef x_subcoord, y_subcoord, z_subcoord; @@ -283,30 +281,33 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, h16_vec_type = lp_build_vec_type(h16.type); u8n_vec_type = lp_build_vec_type(u8n.type); + lp_build_extract_image_sizes(bld, + bld->int_size_type, + 
bld->int_coord_type, + int_size, + &width_vec, + &height_vec, + &depth_vec); + if (bld->static_state->normalized_coords) { - /* s = s * width, t = t * height */ - LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type); - LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec, - coord_vec_type, ""); - s = lp_build_mul(&bld->coord_bld, s, fp_width); - if (dims >= 2) { - LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec, - coord_vec_type, ""); - t = lp_build_mul(&bld->coord_bld, t, fp_height); - if (dims >= 3) { - LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec, - coord_vec_type, ""); - r = lp_build_mul(&bld->coord_bld, r, fp_depth); - } - } - } + LLVMValueRef scaled_size; + LLVMValueRef flt_size; - /* scale coords by 256 (8 fractional bits) */ - s = lp_build_mul_imm(&bld->coord_bld, s, 256); - if (dims >= 2) - t = lp_build_mul_imm(&bld->coord_bld, t, 256); - if (dims >= 3) - r = lp_build_mul_imm(&bld->coord_bld, r, 256); + /* scale size by 256 (8 fractional bits) */ + scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8); + + flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size); + + lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r); + } + else { + /* scale coords by 256 (8 fractional bits) */ + s = lp_build_mul_imm(&bld->coord_bld, s, 256); + if (dims >= 2) + t = lp_build_mul_imm(&bld->coord_bld, t, 256); + if (dims >= 3) + r = lp_build_mul_imm(&bld->coord_bld, r, 256); + } /* convert float to int */ s = LLVMBuildFPToSI(builder, s, i32_vec_type, ""); @@ -324,7 +325,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, r_ipart = LLVMBuildAShr(builder, r, i32_c8, ""); /* get pixel, row, image strides */ - x_stride = lp_build_const_vec(bld->uint_coord_bld.type, + x_stride = lp_build_const_vec(bld->int_coord_bld.type, bld->format_desc->block.bits/8); /* Do texcoord wrapping, compute texel offset */ @@ -343,7 +344,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, bld->static_state->pot_height, bld->static_state->wrap_t, &y_offset, &y_subcoord); - offset = lp_build_add(&bld->uint_coord_bld, offset, y_offset); + offset = lp_build_add(&bld->int_coord_bld, offset, y_offset); if (dims >= 3) { LLVMValueRef z_offset; lp_build_sample_wrap_nearest_int(bld, @@ -352,13 +353,13 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, bld->static_state->pot_height, bld->static_state->wrap_r, &z_offset, &z_subcoord); - offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset); + offset = lp_build_add(&bld->int_coord_bld, offset, z_offset); } else if (bld->static_state->target == PIPE_TEXTURE_CUBE) { LLVMValueRef z_offset; /* The r coord is the cube face in [0,5] */ - z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec); - offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset); + z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec); + offset = lp_build_add(&bld->int_coord_bld, offset, z_offset); } } @@ -417,9 +418,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, */ static void lp_build_sample_image_linear(struct lp_build_sample_context *bld, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, + LLVMValueRef int_size, LLVMValueRef row_stride_vec, LLVMValueRef img_stride_vec, LLVMValueRef data_ptr, @@ -429,14 +428,15 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, LLVMValueRef *colors_lo, LLVMValueRef *colors_hi) { - const int dims = texture_dims(bld->static_state->target); + 
const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->builder; struct lp_build_context i32, h16, u8n; LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type; LLVMValueRef i32_c8, i32_c128, i32_c255; + LLVMValueRef width_vec, height_vec, depth_vec; LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi; - LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi; - LLVMValueRef r_ipart, r_fpart, r_fpart_lo, r_fpart_hi; + LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_fpart_lo = NULL, t_fpart_hi = NULL; + LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_fpart_lo = NULL, r_fpart_hi = NULL; LLVMValueRef x_stride, y_stride, z_stride; LLVMValueRef x_offset0, x_offset1; LLVMValueRef y_offset0, y_offset1; @@ -458,30 +458,33 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, h16_vec_type = lp_build_vec_type(h16.type); u8n_vec_type = lp_build_vec_type(u8n.type); + lp_build_extract_image_sizes(bld, + bld->int_size_type, + bld->int_coord_type, + int_size, + &width_vec, + &height_vec, + &depth_vec); + if (bld->static_state->normalized_coords) { - /* s = s * width, t = t * height */ - LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type); - LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec, - coord_vec_type, ""); - s = lp_build_mul(&bld->coord_bld, s, fp_width); - if (dims >= 2) { - LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec, - coord_vec_type, ""); - t = lp_build_mul(&bld->coord_bld, t, fp_height); - } - if (dims >= 3) { - LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec, - coord_vec_type, ""); - r = lp_build_mul(&bld->coord_bld, r, fp_depth); - } - } + LLVMValueRef scaled_size; + LLVMValueRef flt_size; - /* scale coords by 256 (8 fractional bits) */ - s = lp_build_mul_imm(&bld->coord_bld, s, 256); - if (dims >= 2) - t = lp_build_mul_imm(&bld->coord_bld, t, 256); - if (dims >= 3) - r = lp_build_mul_imm(&bld->coord_bld, r, 256); + /* scale size by 256 (8 fractional bits) */ + scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8); + + flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size); + + lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r); + } + else { + /* scale coords by 256 (8 fractional bits) */ + s = lp_build_mul_imm(&bld->coord_bld, s, 256); + if (dims >= 2) + t = lp_build_mul_imm(&bld->coord_bld, t, 256); + if (dims >= 3) + r = lp_build_mul_imm(&bld->coord_bld, r, 256); + } /* convert float to int */ s = LLVMBuildFPToSI(builder, s, i32_vec_type, ""); @@ -517,7 +520,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, r_fpart = LLVMBuildAnd(builder, r, i32_c255, ""); /* get pixel, row and image strides */ - x_stride = lp_build_const_vec(bld->uint_coord_bld.type, + x_stride = lp_build_const_vec(bld->int_coord_bld.type, bld->format_desc->block.bits/8); y_stride = row_stride_vec; z_stride = img_stride_vec; @@ -548,9 +551,9 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, for (z = 0; z < 2; z++) { for (x = 0; x < 2; x++) { - offset[z][0][x] = lp_build_add(&bld->uint_coord_bld, + offset[z][0][x] = lp_build_add(&bld->int_coord_bld, offset[z][0][x], y_offset0); - offset[z][1][x] = lp_build_add(&bld->uint_coord_bld, + offset[z][1][x] = lp_build_add(&bld->int_coord_bld, offset[z][1][x], y_offset1); } } @@ -566,20 +569,20 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, &z_subcoord[0], &z_subcoord[1]); for (y = 0; y < 2; y++) { for (x = 0; x < 2; x++) { - offset[0][y][x] = lp_build_add(&bld->uint_coord_bld, + offset[0][y][x] = 
lp_build_add(&bld->int_coord_bld, offset[0][y][x], z_offset0); - offset[1][y][x] = lp_build_add(&bld->uint_coord_bld, + offset[1][y][x] = lp_build_add(&bld->int_coord_bld, offset[1][y][x], z_offset1); } } } else if (bld->static_state->target == PIPE_TEXTURE_CUBE) { LLVMValueRef z_offset; - z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec); + z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec); for (y = 0; y < 2; y++) { for (x = 0; x < 2; x++) { /* The r coord is the cube face in [0,5] */ - offset[0][y][x] = lp_build_add(&bld->uint_coord_bld, + offset[0][y][x] = lp_build_add(&bld->int_coord_bld, offset[0][y][x], z_offset); } } @@ -781,76 +784,124 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, LLVMValueRef s, LLVMValueRef t, LLVMValueRef r, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, LLVMValueRef lod_fpart, - LLVMValueRef width0_vec, - LLVMValueRef width1_vec, - LLVMValueRef height0_vec, - LLVMValueRef height1_vec, - LLVMValueRef depth0_vec, - LLVMValueRef depth1_vec, - LLVMValueRef row_stride0_vec, - LLVMValueRef row_stride1_vec, - LLVMValueRef img_stride0_vec, - LLVMValueRef img_stride1_vec, - LLVMValueRef data_ptr0, - LLVMValueRef data_ptr1, - LLVMValueRef *colors_lo, - LLVMValueRef *colors_hi) + LLVMValueRef colors_lo_var, + LLVMValueRef colors_hi_var) { + LLVMBuilderRef builder = bld->builder; + LLVMValueRef size0; + LLVMValueRef size1; + LLVMValueRef row_stride0_vec; + LLVMValueRef row_stride1_vec; + LLVMValueRef img_stride0_vec; + LLVMValueRef img_stride1_vec; + LLVMValueRef data_ptr0; + LLVMValueRef data_ptr1; LLVMValueRef colors0_lo, colors0_hi; LLVMValueRef colors1_lo, colors1_hi; + + /* sample the first mipmap level */ + lp_build_mipmap_level_sizes(bld, ilevel0, + &size0, + &row_stride0_vec, &img_stride0_vec); + data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0); if (img_filter == PIPE_TEX_FILTER_NEAREST) { - /* sample the first mipmap level */ lp_build_sample_image_nearest(bld, - width0_vec, height0_vec, depth0_vec, + size0, row_stride0_vec, img_stride0_vec, data_ptr0, s, t, r, &colors0_lo, &colors0_hi); - - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* sample the second mipmap level */ - lp_build_sample_image_nearest(bld, - width1_vec, height1_vec, depth1_vec, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, - &colors1_lo, &colors1_hi); - } } else { assert(img_filter == PIPE_TEX_FILTER_LINEAR); - - /* sample the first mipmap level */ lp_build_sample_image_linear(bld, - width0_vec, height0_vec, depth0_vec, + size0, row_stride0_vec, img_stride0_vec, data_ptr0, s, t, r, &colors0_lo, &colors0_hi); - - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* sample the second mipmap level */ - lp_build_sample_image_linear(bld, - width1_vec, height1_vec, depth1_vec, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, - &colors1_lo, &colors1_hi); - } } + /* Store the first level's colors in the output variables */ + LLVMBuildStore(builder, colors0_lo, colors_lo_var); + LLVMBuildStore(builder, colors0_hi, colors_hi_var); + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* interpolate samples from the two mipmap levels */ - struct lp_build_context h16; - lp_build_context_init(&h16, bld->builder, lp_type_ufixed(16)); - - *colors_lo = lp_build_lerp(&h16, lod_fpart, - colors0_lo, colors1_lo); - *colors_hi = lp_build_lerp(&h16, lod_fpart, - colors0_hi, colors1_hi); - } - else { - /* use first/only level's colors */ - *colors_lo = colors0_lo; - *colors_hi = colors0_hi; + LLVMValueRef h16_scale = LLVMConstReal(LLVMFloatType(), 
256.0); + LLVMTypeRef i32_type = LLVMIntType(32); + struct lp_build_if_state if_ctx; + LLVMValueRef need_lerp; + + lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16_scale, ""); + lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32_type, "lod_fpart.fixed16"); + + /* need_lerp = lod_fpart > 0 */ + need_lerp = LLVMBuildICmp(builder, LLVMIntSGT, + lod_fpart, LLVMConstNull(i32_type), + "need_lerp"); + + lp_build_if(&if_ctx, builder, need_lerp); + { + struct lp_build_context h16_bld; + + lp_build_context_init(&h16_bld, builder, lp_type_ufixed(16)); + + /* sample the second mipmap level */ + lp_build_mipmap_level_sizes(bld, ilevel1, + &size1, + &row_stride1_vec, &img_stride1_vec); + data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1); + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } + else { + lp_build_sample_image_linear(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } + + /* interpolate samples from the two mipmap levels */ + + lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, ""); + lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart); + +#if HAVE_LLVM == 0x208 + /* This is a work-around for a bug in LLVM 2.8. + * Evidently, something goes wrong in the construction of the + * lod_fpart short[8] vector. Adding this no-effect shuffle seems + * to force the vector to be properly constructed. + * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f). + */ + { + LLVMValueRef shuffles[8], shuffle; + int i; + assert(h16_bld.type.length <= Elements(shuffles)); + for (i = 0; i < h16_bld.type.length; i++) + shuffles[i] = lp_build_const_int32(2 * (i & 1)); + shuffle = LLVMConstVector(shuffles, h16_bld.type.length); + lod_fpart = LLVMBuildShuffleVector(builder, + lod_fpart, lod_fpart, + shuffle, ""); + } +#endif + + colors0_lo = lp_build_lerp(&h16_bld, lod_fpart, + colors0_lo, colors1_lo); + colors0_hi = lp_build_lerp(&h16_bld, lod_fpart, + colors0_hi, colors1_hi); + + LLVMBuildStore(builder, colors0_lo, colors_lo_var); + LLVMBuildStore(builder, colors0_hi, colors_hi_var); + } + lp_build_endif(&if_ctx); } } @@ -871,35 +922,22 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, const LLVMValueRef *ddy, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ - LLVMValueRef width, - LLVMValueRef height, - LLVMValueRef depth, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, - LLVMValueRef row_stride_array, - LLVMValueRef img_stride_array, - LLVMValueRef data_array, LLVMValueRef texel_out[4]) { - struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *int_bld = &bld->int_bld; LLVMBuilderRef builder = bld->builder; const unsigned mip_filter = bld->static_state->min_mip_filter; const unsigned min_filter = bld->static_state->min_img_filter; const unsigned mag_filter = bld->static_state->mag_img_filter; - const int dims = texture_dims(bld->static_state->target); - LLVMValueRef lod = NULL, lod_fpart = NULL; + const unsigned dims = bld->dims; + LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; LLVMValueRef ilevel0, ilevel1 = NULL; - LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL; - LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL; - LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL; - LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL; - LLVMValueRef 
data_ptr0, data_ptr1 = NULL; LLVMValueRef packed, packed_lo, packed_hi; LLVMValueRef unswizzled[4]; LLVMValueRef face_ddx[4], face_ddy[4]; - struct lp_build_context h16; - LLVMTypeRef h16_vec_type; + struct lp_build_context h16_bld; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef i32t_zero = LLVMConstInt(i32t, 0, 0); /* we only support the common/simple wrap modes at this time */ assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s)); @@ -910,9 +948,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, /* make 16-bit fixed-pt builder context */ - lp_build_context_init(&h16, builder, lp_type_ufixed(16)); - h16_vec_type = lp_build_vec_type(h16.type); - + lp_build_context_init(&h16_bld, builder, lp_type_ufixed(16)); /* cube face selection, compute pre-face coords, etc. */ if (bld->static_state->target == PIPE_TEXTURE_CUBE) { @@ -924,19 +960,18 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ /* recompute ddx, ddy using the new (s,t) face texcoords */ - face_ddx[0] = lp_build_ddx(&bld->coord_bld, s); - face_ddx[1] = lp_build_ddx(&bld->coord_bld, t); + face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s); + face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t); face_ddx[2] = NULL; face_ddx[3] = NULL; - face_ddy[0] = lp_build_ddy(&bld->coord_bld, s); - face_ddy[1] = lp_build_ddy(&bld->coord_bld, t); + face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s); + face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t); face_ddy[2] = NULL; face_ddy[3] = NULL; ddx = face_ddx; ddy = face_ddy; } - /* * Compute the level of detail (float). */ @@ -945,15 +980,16 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, /* Need to compute lod either to choose mipmap levels or to * distinguish between minification/magnification with one mipmap level. */ - lod = lp_build_lod_selector(bld, unit, ddx, ddy, - lod_bias, explicit_lod, - width, height, depth); + lp_build_lod_selector(bld, unit, ddx, ddy, + lod_bias, explicit_lod, + mip_filter, + &lod_ipart, &lod_fpart); + } else { + lod_ipart = i32t_zero; } /* * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1 - * If mipfilter=linear, also compute the weight between the two - * mipmap levels: lod_fpart */ switch (mip_filter) { default: @@ -966,135 +1002,81 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, * We should be able to set ilevel0 = const(0) but that causes * bad x86 code to be emitted. 
*/ - lod = lp_build_const_elem(bld->coord_bld.type, 0.0); - lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); } else { - ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0); + ilevel0 = i32t_zero; } break; case PIPE_TEX_MIPFILTER_NEAREST: - assert(lod); - lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); break; case PIPE_TEX_MIPFILTER_LINEAR: - { - LLVMValueRef f256 = LLVMConstReal(LLVMFloatType(), 256.0); - LLVMValueRef i255 = lp_build_const_int32(255); - LLVMTypeRef i16_type = LLVMIntType(16); - - assert(lod); - - lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1, - &lod_fpart); - lod_fpart = LLVMBuildFMul(builder, lod_fpart, f256, ""); - lod_fpart = lp_build_ifloor(&bld->float_bld, lod_fpart); - lod_fpart = LLVMBuildAnd(builder, lod_fpart, i255, ""); - lod_fpart = LLVMBuildTrunc(builder, lod_fpart, i16_type, ""); - lod_fpart = lp_build_broadcast_scalar(&h16, lod_fpart); - - /* the lod_fpart values will be fixed pt values in [0,1) */ - } + assert(lod_ipart); + assert(lod_fpart); + lp_build_linear_mip_levels(bld, unit, + lod_ipart, &lod_fpart, + &ilevel0, &ilevel1); break; } - /* compute image size(s) of source mipmap level(s) */ - lp_build_mipmap_level_sizes(bld, dims, width_vec, height_vec, depth_vec, - ilevel0, ilevel1, - row_stride_array, img_stride_array, - &width0_vec, &width1_vec, - &height0_vec, &height1_vec, - &depth0_vec, &depth1_vec, - &row_stride0_vec, &row_stride1_vec, - &img_stride0_vec, &img_stride1_vec); - /* - * Get pointer(s) to image data for mipmap level(s). + * Get/interpolate texture colors. */ - data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0); - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - data_ptr1 = lp_build_get_mipmap_level(bld, data_array, ilevel1); - } + packed_lo = lp_build_alloca(builder, h16_bld.vec_type, "packed_lo"); + packed_hi = lp_build_alloca(builder, h16_bld.vec_type, "packed_hi"); - /* - * Get/interpolate texture colors. - */ if (min_filter == mag_filter) { /* no need to distinquish between minification and magnification */ - lp_build_sample_mipmap(bld, min_filter, mip_filter, - s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - &packed_lo, &packed_hi); + lp_build_sample_mipmap(bld, + min_filter, mip_filter, + s, t, r, + ilevel0, ilevel1, lod_fpart, + packed_lo, packed_hi); } else { /* Emit conditional to choose min image filter or mag image filter * depending on the lod being > 0 or <= 0, respectively. 
*/ - struct lp_build_flow_context *flow_ctx; struct lp_build_if_state if_ctx; LLVMValueRef minify; - flow_ctx = lp_build_flow_create(builder); - lp_build_flow_scope_begin(flow_ctx); + /* minify = lod >= 0.0 */ + minify = LLVMBuildICmp(builder, LLVMIntSGE, + lod_ipart, int_bld->zero, ""); - packed_lo = LLVMGetUndef(h16_vec_type); - packed_hi = LLVMGetUndef(h16_vec_type); - - lp_build_flow_scope_declare(flow_ctx, &packed_lo); - lp_build_flow_scope_declare(flow_ctx, &packed_hi); - - /* minify = lod > 0.0 */ - minify = LLVMBuildFCmp(builder, LLVMRealUGE, - lod, float_bld->zero, ""); - - lp_build_if(&if_ctx, flow_ctx, builder, minify); + lp_build_if(&if_ctx, builder, minify); { /* Use the minification filter */ - lp_build_sample_mipmap(bld, min_filter, mip_filter, - s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - &packed_lo, &packed_hi); + lp_build_sample_mipmap(bld, + min_filter, mip_filter, + s, t, r, + ilevel0, ilevel1, lod_fpart, + packed_lo, packed_hi); } lp_build_else(&if_ctx); { /* Use the magnification filter */ - lp_build_sample_mipmap(bld, mag_filter, mip_filter, - s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - &packed_lo, &packed_hi); + lp_build_sample_mipmap(bld, + mag_filter, PIPE_TEX_MIPFILTER_NONE, + s, t, r, + i32t_zero, NULL, NULL, + packed_lo, packed_hi); } lp_build_endif(&if_ctx); - - lp_build_flow_scope_end(flow_ctx); - lp_build_flow_destroy(flow_ctx); } - /* combine 'packed_lo', 'packed_hi' into 'packed' */ - { - struct lp_build_context h16, u8n; - - lp_build_context_init(&h16, builder, lp_type_ufixed(16)); - lp_build_context_init(&u8n, builder, lp_type_unorm(8)); - - packed = lp_build_pack2(builder, h16.type, u8n.type, - packed_lo, packed_hi); - } + /* + * combine the values stored in 'packed_lo' and 'packed_hi' variables + * into 'packed' + */ + packed = lp_build_pack2(builder, + h16_bld.type, lp_type_unorm(8), + LLVMBuildLoad(builder, packed_lo, ""), + LLVMBuildLoad(builder, packed_hi, "")); /* * Convert to SoA and swizzle. 
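A minimal sketch of the control-flow pattern adopted in lp_bld_sample_aos.c above and in lp_bld_sample_soa.c further below: values that were previously introduced with lp_build_flow_scope_declare() and merged via phis are now carried across lp_build_if()/lp_build_else() branches through stack slots created with lp_build_alloca(), stored on each branch and loaded once after lp_build_endif(). The helper name and the stored values in this sketch are hypothetical; only the gallivm calls that appear in the diff (lp_build_alloca, lp_build_if, lp_build_else, lp_build_endif, LLVMBuildStore, LLVMBuildLoad) are assumed.

/*
 * Hypothetical helper, for illustration only: carrying a value across
 * an if/else with the alloca + store/load idiom used in this patch.
 */
static LLVMValueRef
example_branch_result(struct lp_build_sample_context *bld,
                      LLVMValueRef condition)
{
   struct lp_build_if_state if_ctx;
   LLVMValueRef result_var;

   /* stack slot that both branches write to */
   result_var = lp_build_alloca(bld->builder, bld->texel_bld.vec_type,
                                "result_var");

   lp_build_if(&if_ctx, bld->builder, condition);
   {
      /* "then" branch: store its result (placeholder value) */
      LLVMBuildStore(bld->builder, bld->texel_bld.one, result_var);
   }
   lp_build_else(&if_ctx);
   {
      /* "else" branch: store its result (placeholder value) */
      LLVMBuildStore(bld->builder, bld->texel_bld.zero, result_var);
   }
   lp_build_endif(&if_ctx);

   /* a single load after the merge point yields the selected value */
   return LLVMBuildLoad(bld->builder, result_var, "result");
}

LLVM's mem2reg pass can be expected to promote such allocas back to SSA registers, so the generated code should end up equivalent to what the explicit flow-scope phis produced, while the builder code stays considerably simpler; the same motivation presumably applies to the second change visible throughout this patch, where separate width/height/depth scalars are packed into a single int_size vector and recovered per channel with lp_build_extract_image_sizes().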
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h index e1045bbbc21..5d9ecac4d50 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h @@ -50,15 +50,6 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, const LLVMValueRef *ddy, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ - LLVMValueRef width, - LLVMValueRef height, - LLVMValueRef depth, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, - LLVMValueRef row_stride_array, - LLVMValueRef img_stride_array, - LLVMValueRef data_array, LLVMValueRef texel_out[4]); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 36a77d3aff0..53cc0c5f345 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -82,7 +82,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, LLVMValueRef texel_out[4]) { const struct lp_sampler_static_state *static_state = bld->static_state; - const int dims = texture_dims(static_state->target); + const unsigned dims = bld->dims; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; LLVMValueRef offset; LLVMValueRef i, j; @@ -131,7 +131,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, } /* convert x,y,z coords to linear offset from start of texture, in bytes */ - lp_build_sample_offset(&bld->uint_coord_bld, + lp_build_sample_offset(&bld->int_coord_bld, bld->format_desc, x, y, z, y_stride, z_stride, &offset, &i, &j); @@ -145,7 +145,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, * coords which are out of bounds to become zero. Zero's guaranteed * to be inside the texture image. 
*/ - offset = lp_build_andnot(&bld->uint_coord_bld, offset, use_border); + offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border); } lp_build_fetch_rgba_soa(bld->builder, @@ -202,11 +202,7 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld, struct lp_build_context *int_coord_bld = &bld->int_coord_bld; LLVMValueRef fract, flr, isOdd; - /* fract = coord - floor(coord) */ - fract = lp_build_sub(coord_bld, coord, lp_build_floor(coord_bld, coord)); - - /* flr = ifloor(coord); */ - flr = lp_build_ifloor(coord_bld, coord); + lp_build_ifloor_fract(coord_bld, coord, &flr, &fract); /* isOdd = flr & 1 */ isOdd = LLVMBuildAnd(bld->builder, flr, int_coord_bld->one, ""); @@ -234,6 +230,7 @@ static void lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, LLVMValueRef coord, LLVMValueRef length, + LLVMValueRef length_f, boolean is_pot, unsigned wrap_mode, LLVMValueRef *x0_out, @@ -242,10 +239,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, { struct lp_build_context *coord_bld = &bld->coord_bld; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; - struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5); - LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length); - LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one); + LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); LLVMValueRef coord0, coord1, weight; switch(wrap_mode) { @@ -253,23 +248,25 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, /* mul by size and subtract 0.5 */ coord = lp_build_mul(coord_bld, coord, length_f); coord = lp_build_sub(coord_bld, coord, half); - /* convert to int */ - coord0 = lp_build_ifloor(coord_bld, coord); - coord1 = lp_build_add(uint_coord_bld, coord0, uint_coord_bld->one); - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); /* repeat wrap */ if (is_pot) { + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, ""); coord1 = LLVMBuildAnd(bld->builder, coord1, length_minus_one, ""); } else { /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024); + LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); + LLVMValueRef mask; coord0 = LLVMBuildAdd(bld->builder, coord0, bias, ""); - coord1 = LLVMBuildAdd(bld->builder, coord1, bias, ""); coord0 = LLVMBuildURem(bld->builder, coord0, length, ""); - coord1 = LLVMBuildURem(bld->builder, coord1, length, ""); + mask = lp_build_compare(bld->builder, int_coord_bld->type, + PIPE_FUNC_NOTEQUAL, coord0, length_minus_one); + coord1 = LLVMBuildAnd(bld->builder, + lp_build_add(int_coord_bld, coord0, int_coord_bld->one), + mask, ""); } break; @@ -284,53 +281,47 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_sub(coord_bld, coord, half); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); break; case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - if (bld->static_state->normalized_coords) { - /* clamp to [0,1] */ - coord = lp_build_clamp(coord_bld, coord, 
coord_bld->zero, coord_bld->one); - /* mul by tex size and subtract 0.5 */ - coord = lp_build_mul(coord_bld, coord, length_f); + { + struct lp_build_context abs_coord_bld = bld->coord_bld; + abs_coord_bld.type.sign = FALSE; + + if (bld->static_state->normalized_coords) { + /* mul by tex size */ + coord = lp_build_mul(coord_bld, coord, length_f); + } + /* clamp to length max */ + coord = lp_build_min(coord_bld, coord, length_f); + /* subtract 0.5 */ coord = lp_build_sub(coord_bld, coord, half); + /* clamp to [0, length - 0.5] */ + coord = lp_build_max(coord_bld, coord, coord_bld->zero); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight); + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + /* coord1 = min(coord1, length-1) */ + coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one); + break; } - else { - LLVMValueRef min, max; - /* clamp to [0.5, length - 0.5] */ - min = half; - max = lp_build_sub(coord_bld, length_f, min); - coord = lp_build_clamp(coord_bld, coord, min, max); - } - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); - /* coord0 = floor(coord); */ - coord0 = lp_build_ifloor(coord_bld, coord); - coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); - /* coord0 = max(coord0, 0) */ - coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero); - /* coord1 = min(coord1, length-1) */ - coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one); - break; case PIPE_TEX_WRAP_CLAMP_TO_BORDER: { - LLVMValueRef min, max; + LLVMValueRef min; if (bld->static_state->normalized_coords) { /* scale coord to length */ coord = lp_build_mul(coord_bld, coord, length_f); } - /* clamp to [-0.5, length + 0.5] */ - min = lp_build_const_vec(coord_bld->type, -0.5F); - max = lp_build_sub(coord_bld, length_f, min); - coord = lp_build_clamp(coord_bld, coord, min, max); + /* was: clamp to [-0.5, length + 0.5], then sub 0.5 */ coord = lp_build_sub(coord_bld, coord, half); - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); - /* convert to int */ - coord0 = lp_build_ifloor(coord_bld, coord); + min = lp_build_const_vec(coord_bld->type, -1.0F); + coord = lp_build_clamp(coord_bld, coord, min, length_f); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); } break; @@ -343,11 +334,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_mul(coord_bld, coord, length_f); coord = lp_build_sub(coord_bld, coord, half); - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); - - /* convert to int coords */ - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); /* coord0 = max(coord0, 0) */ @@ -369,15 +357,16 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_sub(coord_bld, coord, half); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); break; case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: { LLVMValueRef min, max; - + struct lp_build_context abs_coord_bld = bld->coord_bld; + abs_coord_bld.type.sign = FALSE; coord = 
lp_build_abs(coord_bld, coord); if (bld->static_state->normalized_coords) { @@ -392,16 +381,14 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_sub(coord_bld, coord, half); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); } break; case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: { - LLVMValueRef min, max; - coord = lp_build_abs(coord_bld, coord); if (bld->static_state->normalized_coords) { @@ -409,15 +396,13 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_mul(coord_bld, coord, length_f); } - /* clamp to [-0.5, length + 0.5] */ - min = lp_build_negate(coord_bld, half); - max = lp_build_sub(coord_bld, length_f, min); - coord = lp_build_clamp(coord_bld, coord, min, max); - + /* was: clamp to [-0.5, length + 0.5] then sub 0.5 */ + /* skip -0.5 clamp (always positive), do sub first */ coord = lp_build_sub(coord_bld, coord, half); + coord = lp_build_min(coord_bld, coord, length_f); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); } break; @@ -446,14 +431,13 @@ static LLVMValueRef lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, LLVMValueRef coord, LLVMValueRef length, + LLVMValueRef length_f, boolean is_pot, unsigned wrap_mode) { struct lp_build_context *coord_bld = &bld->coord_bld; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; - struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; - LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length); - LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one); + LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); LLVMValueRef icoord; switch(wrap_mode) { @@ -464,7 +448,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, icoord = LLVMBuildAnd(bld->builder, icoord, length_minus_one, ""); else { /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024); + LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); icoord = LLVMBuildAdd(bld->builder, icoord, bias, ""); icoord = LLVMBuildURem(bld->builder, icoord, length, ""); } @@ -478,7 +462,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, } /* floor */ - icoord = lp_build_ifloor(coord_bld, coord); + /* use itrunc instead since we clamp to 0 anyway */ + icoord = lp_build_itrunc(coord_bld, coord); /* clamp to [0, length - 1]. 
*/ icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero, @@ -512,7 +497,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, assert(bld->static_state->normalized_coords); coord = lp_build_mul(coord_bld, coord, length_f); - icoord = lp_build_ifloor(coord_bld, coord); + /* itrunc == ifloor here */ + icoord = lp_build_itrunc(coord_bld, coord); /* clamp to [0, length - 1] */ icoord = lp_build_min(int_coord_bld, icoord, length_minus_one); @@ -527,7 +513,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, coord = lp_build_mul(coord_bld, coord, length_f); } - icoord = lp_build_ifloor(coord_bld, coord); + /* itrunc == ifloor here */ + icoord = lp_build_itrunc(coord_bld, coord); /* clamp to [0, length - 1] */ icoord = lp_build_min(int_coord_bld, icoord, length_minus_one); @@ -541,7 +528,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, coord = lp_build_mul(coord_bld, coord, length_f); } - icoord = lp_build_ifloor(coord_bld, coord); + /* itrunc == ifloor here */ + icoord = lp_build_itrunc(coord_bld, coord); /* clamp to [0, length] */ icoord = lp_build_min(int_coord_bld, icoord, length); @@ -563,9 +551,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, static void lp_build_sample_image_nearest(struct lp_build_sample_context *bld, unsigned unit, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, + LLVMValueRef size, LLVMValueRef row_stride_vec, LLVMValueRef img_stride_vec, LLVMValueRef data_ptr, @@ -574,25 +560,46 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef r, LLVMValueRef colors_out[4]) { - const int dims = texture_dims(bld->static_state->target); + const unsigned dims = bld->dims; + LLVMValueRef width_vec; + LLVMValueRef height_vec; + LLVMValueRef depth_vec; + LLVMValueRef flt_size; + LLVMValueRef flt_width_vec; + LLVMValueRef flt_height_vec; + LLVMValueRef flt_depth_vec; LLVMValueRef x, y, z; + lp_build_extract_image_sizes(bld, + bld->int_size_type, + bld->int_coord_type, + size, + &width_vec, &height_vec, &depth_vec); + + flt_size = lp_build_int_to_float(&bld->float_size_bld, size); + + lp_build_extract_image_sizes(bld, + bld->float_size_type, + bld->coord_type, + flt_size, + &flt_width_vec, &flt_height_vec, &flt_depth_vec); + /* * Compute integer texcoords. 
*/ - x = lp_build_sample_wrap_nearest(bld, s, width_vec, + x = lp_build_sample_wrap_nearest(bld, s, width_vec, flt_width_vec, bld->static_state->pot_width, bld->static_state->wrap_s); lp_build_name(x, "tex.x.wrapped"); if (dims >= 2) { - y = lp_build_sample_wrap_nearest(bld, t, height_vec, + y = lp_build_sample_wrap_nearest(bld, t, height_vec, flt_height_vec, bld->static_state->pot_height, bld->static_state->wrap_t); lp_build_name(y, "tex.y.wrapped"); if (dims == 3) { - z = lp_build_sample_wrap_nearest(bld, r, depth_vec, + z = lp_build_sample_wrap_nearest(bld, r, depth_vec, flt_depth_vec, bld->static_state->pot_depth, bld->static_state->wrap_r); lp_build_name(z, "tex.z.wrapped"); @@ -626,9 +633,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, static void lp_build_sample_image_linear(struct lp_build_sample_context *bld, unsigned unit, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, + LLVMValueRef size, LLVMValueRef row_stride_vec, LLVMValueRef img_stride_vec, LLVMValueRef data_ptr, @@ -637,16 +642,37 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, LLVMValueRef r, LLVMValueRef colors_out[4]) { - const int dims = texture_dims(bld->static_state->target); + const unsigned dims = bld->dims; + LLVMValueRef width_vec; + LLVMValueRef height_vec; + LLVMValueRef depth_vec; + LLVMValueRef flt_size; + LLVMValueRef flt_width_vec; + LLVMValueRef flt_height_vec; + LLVMValueRef flt_depth_vec; LLVMValueRef x0, y0, z0, x1, y1, z1; LLVMValueRef s_fpart, t_fpart, r_fpart; LLVMValueRef neighbors[2][2][4]; int chan; + lp_build_extract_image_sizes(bld, + bld->int_size_type, + bld->int_coord_type, + size, + &width_vec, &height_vec, &depth_vec); + + flt_size = lp_build_int_to_float(&bld->float_size_bld, size); + + lp_build_extract_image_sizes(bld, + bld->float_size_type, + bld->coord_type, + flt_size, + &flt_width_vec, &flt_height_vec, &flt_depth_vec); + /* * Compute integer texcoords. 
*/ - lp_build_sample_wrap_linear(bld, s, width_vec, + lp_build_sample_wrap_linear(bld, s, width_vec, flt_width_vec, bld->static_state->pot_width, bld->static_state->wrap_s, &x0, &x1, &s_fpart); @@ -654,7 +680,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, lp_build_name(x1, "tex.x1.wrapped"); if (dims >= 2) { - lp_build_sample_wrap_linear(bld, t, height_vec, + lp_build_sample_wrap_linear(bld, t, height_vec, flt_height_vec, bld->static_state->pot_height, bld->static_state->wrap_t, &y0, &y1, &t_fpart); @@ -662,7 +688,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, lp_build_name(y1, "tex.y1.wrapped"); if (dims == 3) { - lp_build_sample_wrap_linear(bld, r, depth_vec, + lp_build_sample_wrap_linear(bld, r, depth_vec, flt_depth_vec, bld->static_state->pot_depth, bld->static_state->wrap_r, &z0, &z1, &r_fpart); @@ -799,69 +825,92 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, LLVMValueRef s, LLVMValueRef t, LLVMValueRef r, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, LLVMValueRef lod_fpart, - LLVMValueRef width0_vec, - LLVMValueRef width1_vec, - LLVMValueRef height0_vec, - LLVMValueRef height1_vec, - LLVMValueRef depth0_vec, - LLVMValueRef depth1_vec, - LLVMValueRef row_stride0_vec, - LLVMValueRef row_stride1_vec, - LLVMValueRef img_stride0_vec, - LLVMValueRef img_stride1_vec, - LLVMValueRef data_ptr0, - LLVMValueRef data_ptr1, LLVMValueRef *colors_out) { + LLVMBuilderRef builder = bld->builder; + LLVMValueRef size0; + LLVMValueRef size1; + LLVMValueRef row_stride0_vec; + LLVMValueRef row_stride1_vec; + LLVMValueRef img_stride0_vec; + LLVMValueRef img_stride1_vec; + LLVMValueRef data_ptr0; + LLVMValueRef data_ptr1; LLVMValueRef colors0[4], colors1[4]; - int chan; + unsigned chan; + /* sample the first mipmap level */ + lp_build_mipmap_level_sizes(bld, ilevel0, + &size0, + &row_stride0_vec, &img_stride0_vec); + data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0); if (img_filter == PIPE_TEX_FILTER_NEAREST) { - /* sample the first mipmap level */ lp_build_sample_image_nearest(bld, unit, - width0_vec, height0_vec, depth0_vec, + size0, row_stride0_vec, img_stride0_vec, - data_ptr0, s, t, r, colors0); - - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* sample the second mipmap level */ - lp_build_sample_image_nearest(bld, unit, - width1_vec, height1_vec, depth1_vec, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, colors1); - } + data_ptr0, s, t, r, + colors0); } else { assert(img_filter == PIPE_TEX_FILTER_LINEAR); - - /* sample the first mipmap level */ lp_build_sample_image_linear(bld, unit, - width0_vec, height0_vec, depth0_vec, + size0, row_stride0_vec, img_stride0_vec, - data_ptr0, s, t, r, colors0); + data_ptr0, s, t, r, + colors0); + } - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* sample the second mipmap level */ - lp_build_sample_image_linear(bld, unit, - width1_vec, height1_vec, depth1_vec, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, colors1); - } + /* Store the first level's colors in the output variables */ + for (chan = 0; chan < 4; chan++) { + LLVMBuildStore(builder, colors0[chan], colors_out[chan]); } if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* interpolate samples from the two mipmap levels */ - for (chan = 0; chan < 4; chan++) { - colors_out[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart, + struct lp_build_if_state if_ctx; + LLVMValueRef need_lerp; + + /* need_lerp = lod_fpart > 0 */ + need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT, + lod_fpart, + bld->float_bld.zero, + 
"need_lerp"); + + lp_build_if(&if_ctx, builder, need_lerp); + { + /* sample the second mipmap level */ + lp_build_mipmap_level_sizes(bld, ilevel1, + &size1, + &row_stride1_vec, &img_stride1_vec); + data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1); + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest(bld, unit, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + colors1); + } + else { + lp_build_sample_image_linear(bld, unit, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + colors1); + } + + /* interpolate samples from the two mipmap levels */ + + lod_fpart = lp_build_broadcast_scalar(&bld->texel_bld, lod_fpart); + + for (chan = 0; chan < 4; chan++) { + colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart, colors0[chan], colors1[chan]); + LLVMBuildStore(builder, colors0[chan], colors_out[chan]); + } } - } - else { - /* use first/only level's colors */ - for (chan = 0; chan < 4; chan++) { - colors_out[chan] = colors0[chan]; - } + lp_build_endif(&if_ctx); } } @@ -882,30 +931,20 @@ lp_build_sample_general(struct lp_build_sample_context *bld, const LLVMValueRef *ddy, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ - LLVMValueRef width, - LLVMValueRef height, - LLVMValueRef depth, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, - LLVMValueRef row_stride_array, - LLVMValueRef img_stride_array, - LLVMValueRef data_array, LLVMValueRef *colors_out) { - struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *int_bld = &bld->int_bld; + LLVMBuilderRef builder = bld->builder; const unsigned mip_filter = bld->static_state->min_mip_filter; const unsigned min_filter = bld->static_state->min_img_filter; const unsigned mag_filter = bld->static_state->mag_img_filter; - const int dims = texture_dims(bld->static_state->target); - LLVMValueRef lod = NULL, lod_fpart = NULL; + LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; LLVMValueRef ilevel0, ilevel1 = NULL; - LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL; - LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL; - LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL; - LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL; - LLVMValueRef data_ptr0, data_ptr1 = NULL; LLVMValueRef face_ddx[4], face_ddy[4]; + LLVMValueRef texels[4]; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef i32t_zero = LLVMConstInt(i32t, 0, 0); + unsigned chan; /* printf("%s mip %d min %d mag %d\n", __FUNCTION__, @@ -924,12 +963,12 @@ lp_build_sample_general(struct lp_build_sample_context *bld, r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ /* recompute ddx, ddy using the new (s,t) face texcoords */ - face_ddx[0] = lp_build_ddx(&bld->coord_bld, s); - face_ddx[1] = lp_build_ddx(&bld->coord_bld, t); + face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s); + face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t); face_ddx[2] = NULL; face_ddx[3] = NULL; - face_ddy[0] = lp_build_ddy(&bld->coord_bld, s); - face_ddy[1] = lp_build_ddy(&bld->coord_bld, t); + face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s); + face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t); face_ddy[2] = NULL; face_ddy[3] = NULL; ddx = face_ddx; @@ -944,126 +983,100 @@ lp_build_sample_general(struct lp_build_sample_context *bld, /* Need to compute lod either to choose mipmap levels or to * distinguish between minification/magnification with one mipmap level. 
*/ - lod = lp_build_lod_selector(bld, unit, ddx, ddy, - lod_bias, explicit_lod, - width, height, depth); + lp_build_lod_selector(bld, unit, ddx, ddy, + lod_bias, explicit_lod, + mip_filter, + &lod_ipart, &lod_fpart); + } else { + lod_ipart = i32t_zero; } /* - * Compute integer mipmap level(s) to fetch texels from. + * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1 */ - if (mip_filter == PIPE_TEX_MIPFILTER_NONE) { + switch (mip_filter) { + default: + assert(0 && "bad mip_filter value in lp_build_sample_soa()"); + /* fall-through */ + case PIPE_TEX_MIPFILTER_NONE: /* always use mip level 0 */ if (bld->static_state->target == PIPE_TEXTURE_CUBE) { /* XXX this is a work-around for an apparent bug in LLVM 2.7. * We should be able to set ilevel0 = const(0) but that causes * bad x86 code to be emitted. */ - lod = lp_build_const_elem(bld->coord_bld.type, 0.0); - lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); } else { - ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0); - } - } - else { - assert(lod); - if (mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { - lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); - } - else { - assert(mip_filter == PIPE_TEX_MIPFILTER_LINEAR); - lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1, - &lod_fpart); - lod_fpart = lp_build_broadcast_scalar(&bld->coord_bld, lod_fpart); + ilevel0 = i32t_zero; } + break; + case PIPE_TEX_MIPFILTER_NEAREST: + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); + break; + case PIPE_TEX_MIPFILTER_LINEAR: + assert(lod_ipart); + assert(lod_fpart); + lp_build_linear_mip_levels(bld, unit, + lod_ipart, &lod_fpart, + &ilevel0, &ilevel1); + break; } - /* compute image size(s) of source mipmap level(s) */ - lp_build_mipmap_level_sizes(bld, dims, width_vec, height_vec, depth_vec, - ilevel0, ilevel1, - row_stride_array, img_stride_array, - &width0_vec, &width1_vec, - &height0_vec, &height1_vec, - &depth0_vec, &depth1_vec, - &row_stride0_vec, &row_stride1_vec, - &img_stride0_vec, &img_stride1_vec); - /* - * Get pointer(s) to image data for mipmap level(s). + * Get/interpolate texture colors. */ - data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0); - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - data_ptr1 = lp_build_get_mipmap_level(bld, data_array, ilevel1); + + for (chan = 0; chan < 4; ++chan) { + texels[chan] = lp_build_alloca(builder, bld->texel_bld.vec_type, ""); + lp_build_name(texels[chan], "sampler%u_texel_%c_var", unit, "xyzw"[chan]); } - /* - * Get/interpolate texture colors. - */ if (min_filter == mag_filter) { /* no need to distinquish between minification and magnification */ lp_build_sample_mipmap(bld, unit, - min_filter, mip_filter, s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - colors_out); + min_filter, mip_filter, + s, t, r, + ilevel0, ilevel1, lod_fpart, + texels); } else { /* Emit conditional to choose min image filter or mag image filter - * depending on the lod being >0 or <= 0, respectively. + * depending on the lod being > 0 or <= 0, respectively. 
*/ - struct lp_build_flow_context *flow_ctx; struct lp_build_if_state if_ctx; LLVMValueRef minify; - flow_ctx = lp_build_flow_create(bld->builder); - lp_build_flow_scope_begin(flow_ctx); + /* minify = lod >= 0.0 */ + minify = LLVMBuildICmp(builder, LLVMIntSGE, + lod_ipart, int_bld->zero, ""); - lp_build_flow_scope_declare(flow_ctx, &colors_out[0]); - lp_build_flow_scope_declare(flow_ctx, &colors_out[1]); - lp_build_flow_scope_declare(flow_ctx, &colors_out[2]); - lp_build_flow_scope_declare(flow_ctx, &colors_out[3]); - - /* minify = lod > 0.0 */ - minify = LLVMBuildFCmp(bld->builder, LLVMRealUGE, - lod, float_bld->zero, ""); - - lp_build_if(&if_ctx, flow_ctx, bld->builder, minify); + lp_build_if(&if_ctx, builder, minify); { /* Use the minification filter */ lp_build_sample_mipmap(bld, unit, min_filter, mip_filter, - s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - colors_out); + s, t, r, + ilevel0, ilevel1, lod_fpart, + texels); } lp_build_else(&if_ctx); { /* Use the magnification filter */ lp_build_sample_mipmap(bld, unit, - mag_filter, mip_filter, - s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - colors_out); + mag_filter, PIPE_TEX_MIPFILTER_NONE, + s, t, r, + i32t_zero, NULL, NULL, + texels); } lp_build_endif(&if_ctx); + } - lp_build_flow_scope_end(flow_ctx); - lp_build_flow_destroy(flow_ctx); + for (chan = 0; chan < 4; ++chan) { + colors_out[chan] = LLVMBuildLoad(builder, texels[chan], ""); + lp_build_name(colors_out[chan], "sampler%u_texel_%c", unit, "xyzw"[chan]); } } @@ -1147,12 +1160,10 @@ lp_build_sample_soa(LLVMBuilderRef builder, LLVMValueRef explicit_lod, /* optional */ LLVMValueRef texel_out[4]) { + unsigned dims = texture_dims(static_state->target); struct lp_build_sample_context bld; - LLVMValueRef width, width_vec; - LLVMValueRef height, height_vec; - LLVMValueRef depth, depth_vec; - LLVMValueRef row_stride_array, img_stride_array; - LLVMValueRef data_array; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef s; LLVMValueRef t; LLVMValueRef r; @@ -1171,12 +1182,15 @@ lp_build_sample_soa(LLVMBuilderRef builder, bld.static_state = static_state; bld.dynamic_state = dynamic_state; bld.format_desc = util_format_description(static_state->format); + bld.dims = dims; bld.float_type = lp_type_float(32); bld.int_type = lp_type_int(32); bld.coord_type = type; - bld.uint_coord_type = lp_uint_type(type); bld.int_coord_type = lp_int_type(type); + bld.float_size_type = lp_type_float(32); + bld.float_size_type.length = dims > 1 ? 
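The minification/magnification decision now keys off the sign of the integer LOD part rather than a float compare. Conceptually (scalar sketch):

/* lod_ipart >= 0 means the texture is being minified, so the min image
 * filter applies; a negative lod magnifies, and magnification never
 * needs a second mipmap level (hence PIPE_TEX_MIPFILTER_NONE in the
 * else branch of the generated code). */
unsigned img_filter = (lod_ipart >= 0) ? min_filter : mag_filter;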
4 : 1; + bld.int_size_type = lp_int_type(bld.float_size_type); bld.texel_type = type; float_vec_type = lp_type_float_vec(32); @@ -1185,27 +1199,40 @@ lp_build_sample_soa(LLVMBuilderRef builder, lp_build_context_init(&bld.float_vec_bld, builder, float_vec_type); lp_build_context_init(&bld.int_bld, builder, bld.int_type); lp_build_context_init(&bld.coord_bld, builder, bld.coord_type); - lp_build_context_init(&bld.uint_coord_bld, builder, bld.uint_coord_type); lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type); + lp_build_context_init(&bld.int_size_bld, builder, bld.int_size_type); + lp_build_context_init(&bld.float_size_bld, builder, bld.float_size_type); lp_build_context_init(&bld.texel_bld, builder, bld.texel_type); /* Get the dynamic state */ - width = dynamic_state->width(dynamic_state, builder, unit); - height = dynamic_state->height(dynamic_state, builder, unit); - depth = dynamic_state->depth(dynamic_state, builder, unit); - row_stride_array = dynamic_state->row_stride(dynamic_state, builder, unit); - img_stride_array = dynamic_state->img_stride(dynamic_state, builder, unit); - data_array = dynamic_state->data_ptr(dynamic_state, builder, unit); + bld.width = dynamic_state->width(dynamic_state, builder, unit); + bld.height = dynamic_state->height(dynamic_state, builder, unit); + bld.depth = dynamic_state->depth(dynamic_state, builder, unit); + bld.row_stride_array = dynamic_state->row_stride(dynamic_state, builder, unit); + bld.img_stride_array = dynamic_state->img_stride(dynamic_state, builder, unit); + bld.data_array = dynamic_state->data_ptr(dynamic_state, builder, unit); /* Note that data_array is an array[level] of pointers to texture images */ s = coords[0]; t = coords[1]; r = coords[2]; - /* width, height, depth as uint vectors */ - width_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, width); - height_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, height); - depth_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, depth); + /* width, height, depth as single int vector */ + if (dims <= 1) { + bld.int_size = bld.width; + } + else { + bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_bld.undef, + bld.width, LLVMConstInt(i32t, 0, 0), ""); + if (dims >= 2) { + bld.int_size = LLVMBuildInsertElement(builder, bld.int_size, + bld.height, LLVMConstInt(i32t, 1, 0), ""); + if (dims >= 3) { + bld.int_size = LLVMBuildInsertElement(builder, bld.int_size, + bld.depth, LLVMConstInt(i32t, 2, 0), ""); + } + } + } if (0) { /* For debug: no-op texture sampling */ @@ -1217,10 +1244,7 @@ lp_build_sample_soa(LLVMBuilderRef builder, /* do sampling/filtering with fixed pt arithmetic */ lp_build_sample_aos(&bld, unit, s, t, r, ddx, ddy, lod_bias, explicit_lod, - width, height, depth, - width_vec, height_vec, depth_vec, - row_stride_array, img_stride_array, - data_array, texel_out); + texel_out); } else { @@ -1238,10 +1262,6 @@ lp_build_sample_soa(LLVMBuilderRef builder, lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy, lod_bias, explicit_lod, - width, height, depth, - width_vec, height_vec, depth_vec, - row_stride_array, img_stride_array, - data_array, texel_out); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c index 2e9e8386de0..4685a90e418 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c @@ -101,6 +101,83 @@ lp_build_broadcast_scalar(struct lp_build_context *bld, /** + * Combined extract and broadcast (or a mere shuffle when the 
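The texture size is now carried as one small integer vector instead of separate broadcast width/height/depth vectors. A plain C sketch of the layout built by the insertelement chain above (hypothetical struct, for illustration only):

#include <stdint.h>

struct int_size_sketch { int32_t v[4]; };

static struct int_size_sketch
pack_int_size(unsigned dims, int32_t w, int32_t h, int32_t d)
{
   struct int_size_sketch s = {{ 0, 0, 0, 0 }};
   s.v[0] = w;                  /* 1D and up */
   if (dims >= 2) s.v[1] = h;   /* 2D, rect, cube */
   if (dims >= 3) s.v[2] = d;   /* 3D */
   return s;
}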
two types match) + */ +LLVMValueRef +lp_build_extract_broadcast(LLVMBuilderRef builder, + struct lp_type src_type, + struct lp_type dst_type, + LLVMValueRef vector, + LLVMValueRef index) +{ + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef res; + + assert(src_type.floating == dst_type.floating); + assert(src_type.width == dst_type.width); + + assert(lp_check_value(src_type, vector)); + assert(LLVMTypeOf(index) == i32t); + + if (src_type.length == 1) { + if (dst_type.length == 1) { + /* + * Trivial scalar -> scalar. + */ + + res = vector; + } + else { + /* + * Broadcast scalar -> vector. + */ + + res = lp_build_broadcast(builder, + lp_build_vec_type(dst_type), + vector); + } + } + else { + if (dst_type.length == src_type.length) { + /* + * Special shuffle of the same size. + */ + + LLVMValueRef shuffle; + shuffle = lp_build_broadcast(builder, + LLVMVectorType(i32t, dst_type.length), + index); + res = LLVMBuildShuffleVector(builder, vector, + LLVMGetUndef(lp_build_vec_type(dst_type)), + shuffle, ""); + } + else { + LLVMValueRef scalar; + scalar = LLVMBuildExtractElement(builder, vector, index, ""); + if (dst_type.length == 1) { + /* + * Trivial extract scalar from vector. + */ + + res = scalar; + } + else { + /* + * General case of different sized vectors. + */ + + res = lp_build_broadcast(builder, + lp_build_vec_type(dst_type), + vector); + } + } + } + + return res; +} + + +/** * Swizzle one channel into all other three channels. */ LLVMValueRef diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h index f9b6a5e7258..fdea8442aef 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h @@ -55,6 +55,14 @@ lp_build_broadcast_scalar(struct lp_build_context *bld, LLVMValueRef scalar); +LLVMValueRef +lp_build_extract_broadcast(LLVMBuilderRef builder, + struct lp_type src_type, + struct lp_type dst_type, + LLVMValueRef vector, + LLVMValueRef index); + + /** * Broadcast one channel of a vector composed of arrays of XYZW structures into * all four channel. diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h index 97318b3456c..a4d3b750c3c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h @@ -36,6 +36,9 @@ #define LP_BLD_TGSI_H #include "gallivm/lp_bld.h" +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" struct tgsi_token; @@ -55,6 +58,75 @@ enum lp_build_tex_modifier { /** + * Describe a channel of a register. + * + * The value can be a: + * - immediate value (i.e. derived from a IMM register) + * - CONST[n].x/y/z/w + * - IN[n].x/y/z/w + * - undetermined (when .file == TGSI_FILE_NULL) + * + * This is one of the analysis results, and is used to described + * the output color in terms of inputs. + */ +struct lp_tgsi_channel_info +{ + unsigned file:4; /* TGSI_FILE_* */ + unsigned swizzle:3; /* PIPE_SWIZZLE_x */ + union { + uint32_t index; + float value; /* for TGSI_FILE_IMMEDIATE */ + } u; +}; + + +/** + * Describe a texture sampler interpolator. + * + * The interpolation is described in terms of regular inputs. 
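A typical use of the new lp_build_extract_broadcast() helper is pulling one channel out of that packed size vector and splatting it across a coordinate-wide vector. A hedged sketch using the fields this patch adds to lp_build_sample_context:

/* Broadcast element `chan` of bld->int_size (length 1 or 4) into a
 * vector of int_coord_type length, e.g. the per-pixel texture width. */
LLVMValueRef size_chan =
   lp_build_extract_broadcast(bld->builder,
                              bld->int_size_type,    /* source type */
                              bld->int_coord_type,   /* result type */
                              bld->int_size,
                              LLVMConstInt(LLVMInt32Type(), chan, 0));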
+ */ +struct lp_tgsi_texture_info +{ + struct lp_tgsi_channel_info coord[4]; + unsigned target:8; /* TGSI_TEXTURE_* */ + unsigned unit:8; /* Sampler unit */ + unsigned modifier:8; /* LP_BLD_TEX_MODIFIER_* */ +}; + + +struct lp_tgsi_info +{ + struct tgsi_shader_info base; + + /* + * Whether any of the texture opcodes access a register file other than + * TGSI_FILE_INPUT. + * + * We could also handle TGSI_FILE_CONST/IMMEDIATE here, but there is little + * benefit. + */ + unsigned indirect_textures:1; + + /* + * Texture opcode description. Aimed at detecting and described direct + * texture opcodes. + */ + unsigned num_texs; + struct lp_tgsi_texture_info tex[PIPE_MAX_SAMPLERS]; + + /* + * Output description. Aimed at detecting and describing simple blit + * shaders. + */ + struct lp_tgsi_channel_info output[PIPE_MAX_SHADER_OUTPUTS][4]; + + /* + * Shortcut pointers into the above (for fragment shaders). + */ + const struct lp_tgsi_channel_info *cbuf[PIPE_MAX_COLOR_BUFS]; +}; + +/** * Sampler code generation interface. * * Although texture sampling is a requirement for TGSI translation, it is @@ -97,6 +169,11 @@ struct lp_build_sampler_aos void +lp_build_tgsi_info(const struct tgsi_token *tokens, + struct lp_tgsi_info *info); + + +void lp_build_tgsi_soa(LLVMBuilderRef builder, const struct tgsi_token *tokens, struct lp_type type, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c index d5f963be58d..c3c082b2b95 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c @@ -513,7 +513,7 @@ emit_instruction( { LLVMValueRef src0, src1, src2; LLVMValueRef tmp0, tmp1; - LLVMValueRef dst0; + LLVMValueRef dst0 = NULL; /* * Stores and write masks are handled in a general fashion after the long diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c new file mode 100644 index 00000000000..ad514463de0 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c @@ -0,0 +1,479 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
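Roughly what these structures end up holding for a trivial texturing shader (illustrative values, not normative):

/* Shader body:   TEX OUT[0], IN[0], SAMP[0], 2D
 *
 * lp_build_tgsi_info() then reports approximately:
 *   info.num_texs          == 1
 *   info.indirect_textures == FALSE
 *   info.tex[0].unit       == 0
 *   info.tex[0].target     == TGSI_TEXTURE_2D
 *   info.tex[0].coord[0]   == IN[0].x   (as an lp_tgsi_channel_info)
 *   info.tex[0].coord[1]   == IN[0].y
 *   info.cbuf[0]           points at info.output[0][0..3]
 */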
+ * + **************************************************************************/ + + +#include "util/u_memory.h" +#include "util/u_math.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_dump.h" +#include "lp_bld_debug.h" +#include "lp_bld_tgsi.h" + + +/** + * Analysis context. + * + * This is where we keep store the value of each channel of the IMM/TEMP/OUT + * register values, as we walk the shader. + */ +struct analysis_context +{ + struct lp_tgsi_info *info; + + unsigned num_imms; + float imm[32][4]; + + struct lp_tgsi_channel_info temp[32][4]; +}; + + +/** + * Describe the specified channel of the src register. + */ +static void +analyse_src(struct analysis_context *ctx, + struct lp_tgsi_channel_info *chan_info, + const struct tgsi_src_register *src, + unsigned chan) +{ + chan_info->file = TGSI_FILE_NULL; + if (!src->Indirect && !src->Absolute && !src->Negate) { + unsigned swizzle = tgsi_util_get_src_register_swizzle(src, chan); + if (src->File == TGSI_FILE_TEMPORARY) { + if (src->Index < Elements(ctx->temp)) { + *chan_info = ctx->temp[src->Index][swizzle]; + } + } else { + chan_info->file = src->File; + if (src->File == TGSI_FILE_IMMEDIATE) { + assert(src->Index < Elements(ctx->imm)); + if (src->Index < Elements(ctx->imm)) { + chan_info->u.value = ctx->imm[src->Index][swizzle]; + } + } else { + chan_info->u.index = src->Index; + chan_info->swizzle = swizzle; + } + } + } +} + + +/** + * Whether this register channel refers to a specific immediate value. + */ +static boolean +is_immediate(const struct lp_tgsi_channel_info *chan_info, float value) +{ + return chan_info->file == TGSI_FILE_IMMEDIATE && + chan_info->u.value == value; +} + + +static void +analyse_tex(struct analysis_context *ctx, + const struct tgsi_full_instruction *inst, + enum lp_build_tex_modifier modifier) +{ + struct lp_tgsi_info *info = ctx->info; + unsigned chan; + + if (info->num_texs < Elements(info->tex)) { + struct lp_tgsi_texture_info *tex_info = &info->tex[info->num_texs]; + bool indirect = FALSE; + unsigned readmask = 0; + + tex_info->target = inst->Texture.Texture; + switch (inst->Texture.Texture) { + case TGSI_TEXTURE_1D: + readmask = TGSI_WRITEMASK_X; + break; + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + readmask = TGSI_WRITEMASK_XY; + break; + case TGSI_TEXTURE_SHADOW1D: + case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + readmask = TGSI_WRITEMASK_XYZ; + break; + default: + assert(0); + return; + } + + if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { + /* We don't track explicit derivatives, although we could */ + indirect = TRUE; + tex_info->unit = inst->Src[3].Register.Index; + } else { + if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED || + modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS || + modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) { + readmask |= TGSI_WRITEMASK_W; + } + tex_info->unit = inst->Src[1].Register.Index; + } + + for (chan = 0; chan < 4; ++chan) { + struct lp_tgsi_channel_info *chan_info = &tex_info->coord[chan]; + if (readmask & (1 << chan)) { + analyse_src(ctx, chan_info, &inst->Src[0].Register, chan); + if (chan_info->file != TGSI_FILE_INPUT) { + indirect = TRUE; + } + } else { + memset(chan_info, 0, sizeof *chan_info); + } + } + + if (indirect) { + info->indirect_textures = TRUE; + } + + ++info->num_texs; + } else { + info->indirect_textures = TRUE; + } +} + + +/** + * Process an instruction, and update the register values accordingly. 
+ */ +static void +analyse_instruction(struct analysis_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct lp_tgsi_info *info = ctx->info; + struct lp_tgsi_channel_info (*regs)[4]; + unsigned max_regs; + unsigned i; + unsigned index; + unsigned chan; + + for (i = 0; i < inst->Instruction.NumDstRegs; ++i) { + const struct tgsi_dst_register *dst = &inst->Dst[i].Register; + + /* + * Get the lp_tgsi_channel_info array corresponding to the destination + * register file. + */ + + if (dst->File == TGSI_FILE_TEMPORARY) { + regs = ctx->temp; + max_regs = Elements(ctx->temp); + } else if (dst->File == TGSI_FILE_OUTPUT) { + regs = info->output; + max_regs = Elements(info->output); + } else if (dst->File == TGSI_FILE_ADDRESS || + dst->File == TGSI_FILE_PREDICATE) { + continue; + } else { + assert(0); + continue; + } + + /* + * Detect direct TEX instructions + */ + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_TEX: + analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_NONE); + break; + case TGSI_OPCODE_TXD: + analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV); + break; + case TGSI_OPCODE_TXB: + analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS); + break; + case TGSI_OPCODE_TXL: + analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD); + break; + case TGSI_OPCODE_TXP: + analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_PROJECTED); + break; + default: + break; + } + + /* + * Keep track of assignments and writes + */ + + if (dst->Indirect) { + /* + * It could be any register index so clear all register indices. + */ + + for (chan = 0; chan < 4; ++chan) { + if (dst->WriteMask & (1 << chan)) { + for (index = 0; index < max_regs; ++index) { + regs[index][chan].file = TGSI_FILE_NULL; + } + } + } + } else if (dst->Index < max_regs) { + /* + * Update this destination register value. + */ + + struct lp_tgsi_channel_info res[4]; + + memset(res, 0, sizeof res); + + if (!inst->Instruction.Predicate && + !inst->Instruction.Saturate) { + for (chan = 0; chan < 4; ++chan) { + if (dst->WriteMask & (1 << chan)) { + if (inst->Instruction.Opcode == TGSI_OPCODE_MOV) { + analyse_src(ctx, &res[chan], + &inst->Src[0].Register, chan); + } else if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) { + /* + * Propagate values across 1.0 and 0.0 multiplications. + */ + + struct lp_tgsi_channel_info src0; + struct lp_tgsi_channel_info src1; + + analyse_src(ctx, &src0, &inst->Src[0].Register, chan); + analyse_src(ctx, &src1, &inst->Src[1].Register, chan); + + if (is_immediate(&src0, 0.0f)) { + res[chan] = src0; + } else if (is_immediate(&src1, 0.0f)) { + res[chan] = src1; + } else if (is_immediate(&src0, 1.0f)) { + res[chan] = src1; + } else if (is_immediate(&src1, 1.0f)) { + res[chan] = src0; + } + } + } + } + } + + for (chan = 0; chan < 4; ++chan) { + if (dst->WriteMask & (1 << chan)) { + regs[dst->Index][chan] = res[chan]; + } + } + } + } + + /* + * Clear all temporaries information in presence of a control flow opcode. + */ + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_IF: + case TGSI_OPCODE_IFC: + case TGSI_OPCODE_ELSE: + case TGSI_OPCODE_ENDIF: + case TGSI_OPCODE_BGNLOOP: + case TGSI_OPCODE_BRK: + case TGSI_OPCODE_BREAKC: + case TGSI_OPCODE_CONT: + case TGSI_OPCODE_ENDLOOP: + case TGSI_OPCODE_CALLNZ: + case TGSI_OPCODE_CAL: + case TGSI_OPCODE_BGNSUB: + case TGSI_OPCODE_ENDSUB: + case TGSI_OPCODE_SWITCH: + case TGSI_OPCODE_CASE: + case TGSI_OPCODE_DEFAULT: + case TGSI_OPCODE_ENDSWITCH: + case TGSI_OPCODE_RET: + case TGSI_OPCODE_END: + /* XXX: Are there more cases? 
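The MOV/MUL propagation above is what lets the analysis see through common scale-by-one and clear-to-zero patterns. An illustrative trace:

/* IMM[0] = { 1.0, 0.0, 0.0, 0.0 }
 *
 *   MUL TEMP[0], IN[0], IMM[0].xxxx   ->  temp[0].xyzw described as IN[0].xyzw
 *   MUL TEMP[1], IN[0], IMM[0].yyyy   ->  temp[1].xyzw described as immediate 0.0
 *   MOV OUT[0], TEMP[0]               ->  output[0].xyzw described as IN[0].xyzw
 *
 * Saturation, predication, indirect writes and the control-flow opcodes
 * listed above reset the affected entries back to TGSI_FILE_NULL. */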
*/ + memset(&ctx->temp, 0, sizeof ctx->temp); + memset(&info->output, 0, sizeof info->output); + default: + break; + } +} + + +static INLINE void +dump_info(const struct tgsi_token *tokens, + struct lp_tgsi_info *info) +{ + unsigned index; + unsigned chan; + + tgsi_dump(tokens, 0); + + for (index = 0; index < info->num_texs; ++index) { + const struct lp_tgsi_texture_info *tex_info = &info->tex[index]; + debug_printf("TEX[%u] =", index); + for (chan = 0; chan < 4; ++chan) { + const struct lp_tgsi_channel_info *chan_info = + &tex_info->coord[chan]; + if (chan_info->file != TGSI_FILE_NULL) { + debug_printf(" %s[%u].%c", + tgsi_file_names[chan_info->file], + chan_info->u.index, + "xyzw01"[chan_info->swizzle]); + } else { + debug_printf(" _"); + } + } + debug_printf(", SAMP[%u], %s\n", + tex_info->unit, + tgsi_texture_names[tex_info->target]); + } + + for (index = 0; index < PIPE_MAX_SHADER_OUTPUTS; ++index) { + for (chan = 0; chan < 4; ++chan) { + const struct lp_tgsi_channel_info *chan_info = + &info->output[index][chan]; + if (chan_info->file != TGSI_FILE_NULL) { + debug_printf("OUT[%u].%c = ", index, "xyzw"[chan]); + if (chan_info->file == TGSI_FILE_IMMEDIATE) { + debug_printf("%f", chan_info->u.value); + } else { + const char *file_name; + switch (chan_info->file) { + case TGSI_FILE_CONSTANT: + file_name = "CONST"; + break; + case TGSI_FILE_INPUT: + file_name = "IN"; + break; + default: + file_name = "???"; + break; + } + debug_printf("%s[%u].%c", + file_name, + chan_info->u.index, + "xyzw01"[chan_info->swizzle]); + } + debug_printf("\n"); + } + } + } +} + + +/** + * Detect any direct relationship between the output color + */ +void +lp_build_tgsi_info(const struct tgsi_token *tokens, + struct lp_tgsi_info *info) +{ + struct tgsi_parse_context parse; + struct analysis_context ctx; + unsigned index; + unsigned chan; + + memset(info, 0, sizeof *info); + + tgsi_scan_shader(tokens, &info->base); + + memset(&ctx, 0, sizeof ctx); + ctx.info = info; + + tgsi_parse_init(&parse, tokens); + + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + struct tgsi_full_instruction *inst = + &parse.FullToken.FullInstruction; + + if (inst->Instruction.Opcode == TGSI_OPCODE_END || + inst->Instruction.Opcode == TGSI_OPCODE_BGNSUB) { + /* We reached the end of main function body. */ + goto finished; + } + + analyse_instruction(&ctx, inst); + } + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + const unsigned size = + parse.FullToken.FullImmediate.Immediate.NrTokens - 1; + assert(size <= 4); + if (ctx.num_imms < Elements(ctx.imm)) { + for (chan = 0; chan < size; ++chan) { + ctx.imm[ctx.num_imms][chan] = + parse.FullToken.FullImmediate.u[chan].Float; + } + ++ctx.num_imms; + } + } + break; + + case TGSI_TOKEN_TYPE_PROPERTY: + break; + + default: + assert(0); + } + } +finished: + + tgsi_parse_free(&parse); + + + /* + * Link the output color values. 
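Expected usage is a one-shot analysis at shader-create time, with fast paths keyed off the result. A minimal sketch (hypothetical caller around the entry point added here):

struct lp_tgsi_info info;

lp_build_tgsi_info(tokens, &info);

if (info.num_texs == 1 && !info.indirect_textures) {
   /* every TEX coordinate comes straight from an input register:
    * candidate for a specialised blit/texturing fast path */
}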
+ */ + + for (index = 0; index < PIPE_MAX_COLOR_BUFS; ++index) { + const struct lp_tgsi_channel_info null_output[4]; + info->cbuf[index] = null_output; + } + + for (index = 0; index < info->base.num_outputs; ++index) { + unsigned semantic_name = info->base.output_semantic_name[index]; + unsigned semantic_index = info->base.output_semantic_index[index]; + if (semantic_name == TGSI_SEMANTIC_COLOR && + semantic_index < PIPE_MAX_COLOR_BUFS) { + info->cbuf[semantic_index] = info->output[index]; + } + } + + if (gallivm_debug & GALLIVM_DEBUG_TGSI) { + dump_info(tokens, info); + } +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index 441aebae298..3c318cc8c80 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -887,21 +887,25 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, } if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); for (i = 0; i < num_coords; i++) { - ddx[i] = emit_fetch( bld, inst, 1, i ); - ddy[i] = emit_fetch( bld, inst, 2, i ); + LLVMValueRef src1 = emit_fetch( bld, inst, 1, i ); + LLVMValueRef src2 = emit_fetch( bld, inst, 2, i ); + ddx[i] = LLVMBuildExtractElement(bld->base.builder, src1, index0, ""); + ddy[i] = LLVMBuildExtractElement(bld->base.builder, src2, index0, ""); } unit = inst->Src[3].Register.Index; } else { for (i = 0; i < num_coords; i++) { - ddx[i] = lp_build_ddx( &bld->base, coords[i] ); - ddy[i] = lp_build_ddy( &bld->base, coords[i] ); + ddx[i] = lp_build_scalar_ddx( &bld->base, coords[i] ); + ddy[i] = lp_build_scalar_ddy( &bld->base, coords[i] ); } unit = inst->Src[1].Register.Index; } for (i = num_coords; i < 3; i++) { - ddx[i] = bld->base.undef; - ddy[i] = bld->base.undef; + ddx[i] = LLVMGetUndef(bld->base.elem_type); + ddy[i] = LLVMGetUndef(bld->base.elem_type); } bld->sampler->emit_fetch_texel(bld->sampler, @@ -913,6 +917,43 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, texel); } +static boolean +near_end_of_shader(struct lp_build_tgsi_soa_context *bld, + int pc) +{ + int i; + + for (i = 0; i < 5; i++) { + unsigned opcode; + + if (pc + i >= bld->info->num_instructions) + return TRUE; + + opcode = bld->instructions[pc + i].Instruction.Opcode; + + if (opcode == TGSI_OPCODE_END) + return TRUE; + + if (opcode == TGSI_OPCODE_TEX || + opcode == TGSI_OPCODE_TXP || + opcode == TGSI_OPCODE_TXD || + opcode == TGSI_OPCODE_TXB || + opcode == TGSI_OPCODE_TXL || + opcode == TGSI_OPCODE_TXF || + opcode == TGSI_OPCODE_TXQ || + opcode == TGSI_OPCODE_CAL || + opcode == TGSI_OPCODE_CALLNZ || + opcode == TGSI_OPCODE_IF || + opcode == TGSI_OPCODE_IFC || + opcode == TGSI_OPCODE_BGNLOOP || + opcode == TGSI_OPCODE_SWITCH) + return FALSE; + } + + return TRUE; +} + + /** * Kill fragment if any of the src register values are negative. 
@@ -920,7 +961,8 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, static void emit_kil( struct lp_build_tgsi_soa_context *bld, - const struct tgsi_full_instruction *inst ) + const struct tgsi_full_instruction *inst, + int pc) { const struct tgsi_full_src_register *reg = &inst->Src[0]; LLVMValueRef terms[NUM_CHANNELS]; @@ -959,8 +1001,12 @@ emit_kil( } } - if(mask) + if(mask) { lp_build_mask_update(bld->mask, mask); + + if (!near_end_of_shader(bld, pc)) + lp_build_mask_check(bld->mask); + } } @@ -972,7 +1018,8 @@ emit_kil( */ static void emit_kilp(struct lp_build_tgsi_soa_context *bld, - const struct tgsi_full_instruction *inst) + const struct tgsi_full_instruction *inst, + int pc) { LLVMValueRef mask; @@ -983,10 +1030,14 @@ emit_kilp(struct lp_build_tgsi_soa_context *bld, mask = LLVMBuildNot(bld->base.builder, bld->exec_mask.exec_mask, "kilp"); } else { - mask = bld->base.zero; + LLVMValueRef zero = LLVMConstNull(bld->base.int_vec_type); + mask = zero; } lp_build_mask_update(bld->mask, mask); + + if (!near_end_of_shader(bld, pc)) + lp_build_mask_check(bld->mask); } static void @@ -1535,12 +1586,12 @@ emit_instruction( case TGSI_OPCODE_KILP: /* predicated kill */ - emit_kilp( bld, inst ); + emit_kilp( bld, inst, (*pc)-1 ); break; case TGSI_OPCODE_KIL: /* conditional kill */ - emit_kil( bld, inst ); + emit_kil( bld, inst, (*pc)-1 ); break; case TGSI_OPCODE_PK2H: diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c index ef4b306cb67..330838d23cf 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c @@ -97,7 +97,7 @@ void (*ppc_get_func(struct ppc_function *p))(void) return (void (*)(void)) NULL; else #endif - return (void (*)(void)) p->store; + return (void (*)(void)) pointer_to_func(p->store); } diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h index 036c1ee48a8..34bfa527db0 100644 --- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h +++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h @@ -23,26 +23,13 @@ #include "cell/ppu/cell_public.h" #endif + static INLINE struct pipe_screen * -sw_screen_create(struct sw_winsys *winsys) +sw_screen_create_named(struct sw_winsys *winsys, const char *driver) { - const char *default_driver; - const char *driver; struct pipe_screen *screen = NULL; #if defined(GALLIUM_CELL) - default_driver = "cell"; -#elif defined(GALLIUM_LLVMPIPE) - default_driver = "llvmpipe"; -#elif defined(GALLIUM_SOFTPIPE) - default_driver = "softpipe"; -#else - default_driver = ""; -#endif - - driver = debug_get_option("GALLIUM_DRIVER", default_driver); - -#if defined(GALLIUM_CELL) if (screen == NULL && strcmp(driver, "cell") == 0) screen = cell_create_screen(winsys); #endif @@ -60,4 +47,26 @@ sw_screen_create(struct sw_winsys *winsys) return screen; } + +static INLINE struct pipe_screen * +sw_screen_create(struct sw_winsys *winsys) +{ + const char *default_driver; + const char *driver; + +#if defined(GALLIUM_CELL) + default_driver = "cell"; +#elif defined(GALLIUM_LLVMPIPE) + default_driver = "llvmpipe"; +#elif defined(GALLIUM_SOFTPIPE) + default_driver = "softpipe"; +#else + default_driver = ""; +#endif + + driver = debug_get_option("GALLIUM_DRIVER", default_driver); + return sw_screen_create_named(winsys, driver); +} + + #endif diff --git a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h index 0b4e7404034..e4effa713e9 
100644 --- a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h +++ b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h @@ -13,22 +13,28 @@ static INLINE struct pipe_screen * sw_screen_wrap(struct pipe_screen *screen) { struct sw_winsys *sws; - struct pipe_screen *sw_screen; + struct pipe_screen *sw_screen = NULL; + const char *driver; - sws = wrapper_sw_winsys_warp_pipe_screen(screen); + driver = debug_get_option("GALLIUM_DRIVER", "native"); + if (strcmp(driver, "native") == 0) + return screen; + + sws = wrapper_sw_winsys_wrap_pipe_screen(screen); if (!sws) goto err; - sw_screen = sw_screen_create(sws); - if (sw_screen == screen) + sw_screen = sw_screen_create_named(sws, driver); + + if (!sw_screen) goto err_winsys; return sw_screen; err_winsys: - sws->destroy(sws); + return wrapper_sw_winsys_dewrap_pipe_screen(sws); err: - return screen; + return screen; } #endif diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c index f71ffb70308..77bde86684e 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump.c +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c @@ -90,7 +90,8 @@ static const char *processor_type_names[] = "GEOM" }; -static const char *file_names[TGSI_FILE_COUNT] = +const char * +tgsi_file_names[TGSI_FILE_COUNT] = { "NULL", "CONST", @@ -125,7 +126,8 @@ static const char *semantic_names[] = "FACE", "EDGEFLAG", "PRIM_ID", - "INSTANCEID" + "INSTANCEID", + "STENCIL" }; static const char *immediate_type_names[] = @@ -135,7 +137,8 @@ static const char *immediate_type_names[] = "INT32" }; -static const char *swizzle_names[] = +const char * +tgsi_swizzle_names[] = { "x", "y", @@ -143,7 +146,8 @@ static const char *swizzle_names[] = "w" }; -static const char *texture_names[] = +const char * +tgsi_texture_names[] = { "UNKNOWN", "1D", @@ -201,15 +205,15 @@ _dump_register_src( struct dump_ctx *ctx, const struct tgsi_full_src_register *src ) { - ENM(src->Register.File, file_names); + ENM(src->Register.File, tgsi_file_names); if (src->Register.Dimension) { if (src->Dimension.Indirect) { CHR( '[' ); - ENM( src->DimIndirect.File, file_names ); + ENM( src->DimIndirect.File, tgsi_file_names ); CHR( '[' ); SID( src->DimIndirect.Index ); TXT( "]." ); - ENM( src->DimIndirect.SwizzleX, swizzle_names ); + ENM( src->DimIndirect.SwizzleX, tgsi_swizzle_names ); if (src->Dimension.Index != 0) { if (src->Dimension.Index > 0) CHR( '+' ); @@ -224,11 +228,11 @@ _dump_register_src( } if (src->Register.Indirect) { CHR( '[' ); - ENM( src->Indirect.File, file_names ); + ENM( src->Indirect.File, tgsi_file_names ); CHR( '[' ); SID( src->Indirect.Index ); TXT( "]." ); - ENM( src->Indirect.SwizzleX, swizzle_names ); + ENM( src->Indirect.SwizzleX, tgsi_swizzle_names ); if (src->Register.Index != 0) { if (src->Register.Index > 0) CHR( '+' ); @@ -248,15 +252,15 @@ _dump_register_dst( struct dump_ctx *ctx, const struct tgsi_full_dst_register *dst ) { - ENM(dst->Register.File, file_names); + ENM(dst->Register.File, tgsi_file_names); if (dst->Register.Dimension) { if (dst->Dimension.Indirect) { CHR( '[' ); - ENM( dst->DimIndirect.File, file_names ); + ENM( dst->DimIndirect.File, tgsi_file_names ); CHR( '[' ); SID( dst->DimIndirect.Index ); TXT( "]." 
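Taken together, the two helpers above let GALLIUM_DRIVER select a software rasterizer at run time. A minimal sketch of the intended call pattern (creation of the underlying hardware screen is not shown and is assumed to exist):

/* GALLIUM_DRIVER=native    -> sw_screen_wrap() returns the screen unchanged
 * GALLIUM_DRIVER=llvmpipe  -> the hardware screen is wrapped by llvmpipe
 * GALLIUM_DRIVER=softpipe  -> likewise, wrapped by softpipe */
struct pipe_screen *screen;

screen = hw_driver_create_screen();   /* hypothetical: the real driver's screen */
screen = sw_screen_wrap(screen);      /* helper from this patch */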
); - ENM( dst->DimIndirect.SwizzleX, swizzle_names ); + ENM( dst->DimIndirect.SwizzleX, tgsi_swizzle_names ); if (dst->Dimension.Index != 0) { if (dst->Dimension.Index > 0) CHR( '+' ); @@ -271,11 +275,11 @@ _dump_register_dst( } if (dst->Register.Indirect) { CHR( '[' ); - ENM( dst->Indirect.File, file_names ); + ENM( dst->Indirect.File, tgsi_file_names ); CHR( '[' ); SID( dst->Indirect.Index ); TXT( "]." ); - ENM( dst->Indirect.SwizzleX, swizzle_names ); + ENM( dst->Indirect.SwizzleX, tgsi_swizzle_names ); if (dst->Register.Index != 0) { if (dst->Register.Index > 0) CHR( '+' ); @@ -351,7 +355,7 @@ iter_declaration( TXT( "DCL " ); - ENM(decl->Declaration.File, file_names); + ENM(decl->Declaration.File, tgsi_file_names); /* all geometry shader inputs are two dimensional */ if (decl->Declaration.File == TGSI_FILE_INPUT && @@ -585,10 +589,10 @@ iter_instruction( inst->Predicate.SwizzleZ != TGSI_SWIZZLE_Z || inst->Predicate.SwizzleW != TGSI_SWIZZLE_W) { CHR( '.' ); - ENM( inst->Predicate.SwizzleX, swizzle_names ); - ENM( inst->Predicate.SwizzleY, swizzle_names ); - ENM( inst->Predicate.SwizzleZ, swizzle_names ); - ENM( inst->Predicate.SwizzleW, swizzle_names ); + ENM( inst->Predicate.SwizzleX, tgsi_swizzle_names ); + ENM( inst->Predicate.SwizzleY, tgsi_swizzle_names ); + ENM( inst->Predicate.SwizzleZ, tgsi_swizzle_names ); + ENM( inst->Predicate.SwizzleW, tgsi_swizzle_names ); } TXT( ") " ); @@ -641,10 +645,10 @@ iter_instruction( src->Register.SwizzleZ != TGSI_SWIZZLE_Z || src->Register.SwizzleW != TGSI_SWIZZLE_W) { CHR( '.' ); - ENM( src->Register.SwizzleX, swizzle_names ); - ENM( src->Register.SwizzleY, swizzle_names ); - ENM( src->Register.SwizzleZ, swizzle_names ); - ENM( src->Register.SwizzleW, swizzle_names ); + ENM( src->Register.SwizzleX, tgsi_swizzle_names ); + ENM( src->Register.SwizzleY, tgsi_swizzle_names ); + ENM( src->Register.SwizzleZ, tgsi_swizzle_names ); + ENM( src->Register.SwizzleW, tgsi_swizzle_names ); } if (src->Register.Absolute) @@ -655,7 +659,7 @@ iter_instruction( if (inst->Instruction.Texture) { TXT( ", " ); - ENM( inst->Texture.Texture, texture_names ); + ENM( inst->Texture.Texture, tgsi_texture_names ); } switch (inst->Instruction.Opcode) { diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h index dd78b361007..fc0429ad8d9 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump.h +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h @@ -35,6 +35,15 @@ extern "C" { #endif +extern const char * +tgsi_file_names[TGSI_FILE_COUNT]; + +extern const char * +tgsi_swizzle_names[]; + +extern const char * +tgsi_texture_names[]; + void tgsi_dump_str( const struct tgsi_token *tokens, diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c index 90198a4f604..6585da3e838 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.c +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c @@ -147,6 +147,7 @@ tgsi_scan_shader(const struct tgsi_token *tokens, info->input_semantic_name[reg] = (ubyte)fulldecl->Semantic.Name; info->input_semantic_index[reg] = (ubyte)fulldecl->Semantic.Index; info->input_interpolate[reg] = (ubyte)fulldecl->Declaration.Interpolate; + info->input_centroid[reg] = (ubyte)fulldecl->Declaration.Centroid; info->input_cylindrical_wrap[reg] = (ubyte)fulldecl->Declaration.CylindricalWrap; info->num_inputs++; } @@ -157,9 +158,11 @@ tgsi_scan_shader(const struct tgsi_token *tokens, /* extra info for special outputs */ if (procType == TGSI_PROCESSOR_FRAGMENT && - fulldecl->Semantic.Name == TGSI_SEMANTIC_POSITION) { - 
info->writes_z = TRUE; - } + fulldecl->Semantic.Name == TGSI_SEMANTIC_POSITION) + info->writes_z = TRUE; + if (procType == TGSI_PROCESSOR_FRAGMENT && + fulldecl->Semantic.Name == TGSI_SEMANTIC_STENCIL) + info->writes_stencil = TRUE; if (procType == TGSI_PROCESSOR_VERTEX && fulldecl->Semantic.Name == TGSI_SEMANTIC_EDGEFLAG) { info->writes_edgeflag = TRUE; diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h index f8aa90cf065..104097fbc03 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.h +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h @@ -45,6 +45,7 @@ struct tgsi_shader_info ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */ ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS]; ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS]; + ubyte input_centroid[PIPE_MAX_SHADER_INPUTS]; ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS]; ubyte input_cylindrical_wrap[PIPE_MAX_SHADER_INPUTS]; ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */ @@ -60,6 +61,7 @@ struct tgsi_shader_info uint opcode_count[TGSI_OPCODE_LAST]; /**< opcode histogram */ boolean writes_z; /**< does fragment shader write Z value? */ + boolean writes_stencil; /**< does fragment shader write stencil value? */ boolean writes_edgeflag; /**< vertex shader outputs edgeflag */ boolean uses_kill; /**< KIL or KILP instruction used? */ diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h index a75380228b1..850ef39ef21 100644 --- a/src/gallium/auxiliary/translate/translate.h +++ b/src/gallium/auxiliary/translate/translate.h @@ -68,6 +68,33 @@ struct translate_key { }; +struct translate; + + +typedef void (PIPE_CDECL *run_elts_func)(struct translate *, + const unsigned *elts, + unsigned count, + unsigned instance_id, + void *output_buffer); + +typedef void (PIPE_CDECL *run_elts16_func)(struct translate *, + const uint16_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer); + +typedef void (PIPE_CDECL *run_elts8_func)(struct translate *, + const uint8_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer); + +typedef void (PIPE_CDECL *run_func)(struct translate *, + unsigned start, + unsigned count, + unsigned instance_id, + void *output_buffer); + struct translate { struct translate_key key; @@ -79,42 +106,14 @@ struct translate { unsigned stride, unsigned max_index ); - void (PIPE_CDECL *run_elts)( struct translate *, - const unsigned *elts, - unsigned count, - unsigned instance_id, - void *output_buffer); - - void (PIPE_CDECL *run_elts16)( struct translate *, - const uint16_t *elts, - unsigned count, - unsigned instance_id, - void *output_buffer); - - void (PIPE_CDECL *run_elts8)( struct translate *, - const uint8_t *elts, - unsigned count, - unsigned instance_id, - void *output_buffer); - - void (PIPE_CDECL *run)( struct translate *, - unsigned start, - unsigned count, - unsigned instance_id, - void *output_buffer); + run_elts_func run_elts; + run_elts16_func run_elts16; + run_elts8_func run_elts8; + run_func run; }; -#if 0 -struct translate_context *translate_context_create( void ); -void translate_context_destroy( struct translate_context * ); - -struct translate *translate_lookup_or_create( struct translate_context *tctx, - const struct translate_key *key ); -#endif - - struct translate *translate_create( const struct translate_key *key ); boolean translate_is_output_format_supported(enum pipe_format format); diff --git a/src/gallium/auxiliary/translate/translate_sse.c 
b/src/gallium/auxiliary/translate/translate_sse.c index f8bf5b46692..ef7f4be4c3e 100644 --- a/src/gallium/auxiliary/translate/translate_sse.c +++ b/src/gallium/auxiliary/translate/translate_sse.c @@ -1495,19 +1495,19 @@ struct translate *translate_sse2_create( const struct translate_key *key ) if (!build_vertex_emit(p, &p->elt8_func, 1)) goto fail; - p->translate.run = (void*)x86_get_func(&p->linear_func); + p->translate.run = (run_func) x86_get_func(&p->linear_func); if (p->translate.run == NULL) goto fail; - p->translate.run_elts = (void*)x86_get_func(&p->elt_func); + p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func); if (p->translate.run_elts == NULL) goto fail; - p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func); + p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func); if (p->translate.run_elts16 == NULL) goto fail; - p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func); + p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func); if (p->translate.run_elts8 == NULL) goto fail; diff --git a/src/gallium/auxiliary/util/u_dl.c b/src/gallium/auxiliary/util/u_dl.c index 220860ebf4b..aca435d6cad 100644 --- a/src/gallium/auxiliary/util/u_dl.c +++ b/src/gallium/auxiliary/util/u_dl.c @@ -38,6 +38,7 @@ #endif #include "u_dl.h" +#include "u_pointer.h" struct util_dl_library * @@ -58,7 +59,7 @@ util_dl_get_proc_address(struct util_dl_library *library, const char *procname) { #if defined(PIPE_OS_UNIX) - return (util_dl_proc)dlsym((void *)library, procname); + return (util_dl_proc) pointer_to_func(dlsym((void *)library, procname)); #elif defined(PIPE_OS_WINDOWS) return (util_dl_proc)GetProcAddress((HMODULE)library, procname); #else diff --git a/src/gallium/auxiliary/util/u_format.csv b/src/gallium/auxiliary/util/u_format.csv index 0811280b97b..8e5d4487a67 100644 --- a/src/gallium/auxiliary/util/u_format.csv +++ b/src/gallium/auxiliary/util/u_format.csv @@ -109,9 +109,12 @@ PIPE_FORMAT_Z32_UNORM , plain, 1, 1, un32, , , , x___, PIPE_FORMAT_Z32_FLOAT , plain, 1, 1, f32 , , , , x___, zs PIPE_FORMAT_Z24_UNORM_S8_USCALED , plain, 1, 1, un24, u8 , , , xy__, zs PIPE_FORMAT_S8_USCALED_Z24_UNORM , plain, 1, 1, u8 , un24, , , yx__, zs +PIPE_FORMAT_X24S8_USCALED , plain, 1, 1, x24, u8 , , , _y__, zs +PIPE_FORMAT_S8X24_USCALED , plain, 1, 1, u8 , x24 , , , _x__, zs PIPE_FORMAT_Z24X8_UNORM , plain, 1, 1, un24, x8 , , , x___, zs PIPE_FORMAT_X8Z24_UNORM , plain, 1, 1, x8 , un24, , , y___, zs PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED , plain, 1, 1, f32, u8 , x24 , , xy__, zs +PIPE_FORMAT_X32_S8X24_USCALED , plain, 1, 1, x32, u8 , x24 , , _y__, zs # YUV formats # http://www.fourcc.org/yuv.php#UYVY diff --git a/src/gallium/auxiliary/util/u_format_zs.c b/src/gallium/auxiliary/util/u_format_zs.c index 792d69c214c..80081e22f7c 100644 --- a/src/gallium/auxiliary/util/u_format_zs.c +++ b/src/gallium/auxiliary/util/u_format_zs.c @@ -918,3 +918,56 @@ util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned d } } + +void +util_format_x24s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) +{ + util_format_z24_unorm_s8_uscaled_unpack_s_8uscaled(dst_row, dst_stride, + src_row, src_stride, + width, height); +} + +void +util_format_x24s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) +{ + util_format_z24_unorm_s8_uscaled_pack_s_8uscaled(dst_row, dst_stride, + 
src_row, src_stride, + width, height); +} + +void +util_format_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) +{ + util_format_s8_uscaled_z24_unorm_unpack_s_8uscaled(dst_row, dst_stride, + src_row, src_stride, + width, height); +} + +void +util_format_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) +{ + util_format_s8_uscaled_z24_unorm_pack_s_8uscaled(dst_row, dst_stride, + src_row, src_stride, + width, height); +} + +void +util_format_x32_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, + const uint8_t *src_row, unsigned src_stride, + unsigned width, unsigned height) +{ + util_format_z32_float_s8x24_uscaled_unpack_s_8uscaled(dst_row, dst_stride, + src_row, src_stride, + width, height); + +} + +void +util_format_x32_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, + const uint8_t *src_row, unsigned src_stride, + unsigned width, unsigned height) +{ + util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(dst_row, dst_stride, + src_row, src_stride, + width, height); +} diff --git a/src/gallium/auxiliary/util/u_format_zs.h b/src/gallium/auxiliary/util/u_format_zs.h index 650db4b95fd..1604cc3eee2 100644 --- a/src/gallium/auxiliary/util/u_format_zs.h +++ b/src/gallium/auxiliary/util/u_format_zs.h @@ -192,5 +192,21 @@ util_format_z32_float_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned void util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); +void +util_format_x24s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); + +void +util_format_x24s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); + +void +util_format_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); +void +util_format_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); + +void +util_format_x32_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); + +void +util_format_x32_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_sride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); #endif /* U_FORMAT_ZS_H_ */ diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h index 69a76814945..37294b7203f 100644 --- a/src/gallium/auxiliary/util/u_math.h +++ b/src/gallium/auxiliary/util/u_math.h @@ -118,6 +118,11 @@ __inline double __cdecl atan2(double val) #endif +#ifndef M_SQRT2 +#define M_SQRT2 1.41421356237309504880 +#endif + + #if defined(_MSC_VER) #if _MSC_VER < 1400 && !defined(__cplusplus) || defined(PIPE_SUBSYSTEM_WINDOWS_CE) diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h index c90b0fdbc3f..5378f2d782f 100644 --- a/src/gallium/auxiliary/util/u_pack_color.h +++ b/src/gallium/auxiliary/util/u_pack_color.h @@ -434,8 +434,8 @@ util_pack_color(const float rgba[4], enum pipe_format format, union 
util_color * /* Integer versions of util_pack_z and util_pack_z_stencil - useful for * constructing clear masks. */ -static INLINE uint -util_pack_uint_z(enum pipe_format format, unsigned z) +static INLINE uint32_t +util_pack_mask_z(enum pipe_format format, uint32_t z) { switch (format) { case PIPE_FORMAT_Z16_UNORM: @@ -452,29 +452,32 @@ util_pack_uint_z(enum pipe_format format, unsigned z) case PIPE_FORMAT_S8_USCALED: return 0; default: - debug_print_format("gallium: unhandled format in util_pack_z()", format); + debug_print_format("gallium: unhandled format in util_pack_mask_z()", format); assert(0); return 0; } } -static INLINE uint -util_pack_uint_z_stencil(enum pipe_format format, double z, uint s) +static INLINE uint32_t +util_pack_mask_z_stencil(enum pipe_format format, uint32_t z, uint8_t s) { - unsigned packed = util_pack_uint_z(format, z); - - s &= 0xff; + uint32_t packed = util_pack_mask_z(format, z); switch (format) { case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - return packed | (s << 24); + packed |= (uint32_t)s << 24; + break; case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - return packed | s; + packed |= s; + break; case PIPE_FORMAT_S8_USCALED: - return packed | s; + packed |= s; + break; default: - return packed; + break; } + + return packed; } @@ -482,9 +485,11 @@ util_pack_uint_z_stencil(enum pipe_format format, double z, uint s) /** * Note: it's assumed that z is in [0,1] */ -static INLINE uint +static INLINE uint32_t util_pack_z(enum pipe_format format, double z) { + union fi fui; + if (z == 0.0) return 0; @@ -492,24 +497,25 @@ util_pack_z(enum pipe_format format, double z) case PIPE_FORMAT_Z16_UNORM: if (z == 1.0) return 0xffff; - return (uint) (z * 0xffff); + return (uint32_t) (z * 0xffff); case PIPE_FORMAT_Z32_UNORM: /* special-case to avoid overflow */ if (z == 1.0) return 0xffffffff; - return (uint) (z * 0xffffffff); + return (uint32_t) (z * 0xffffffff); case PIPE_FORMAT_Z32_FLOAT: - return (uint)z; + fui.f = (float)z; + return fui.ui; case PIPE_FORMAT_Z24_UNORM_S8_USCALED: case PIPE_FORMAT_Z24X8_UNORM: if (z == 1.0) return 0xffffff; - return (uint) (z * 0xffffff); + return (uint32_t) (z * 0xffffff); case PIPE_FORMAT_S8_USCALED_Z24_UNORM: case PIPE_FORMAT_X8Z24_UNORM: if (z == 1.0) return 0xffffff00; - return ((uint) (z * 0xffffff)) << 8; + return ((uint32_t) (z * 0xffffff)) << 8; case PIPE_FORMAT_S8_USCALED: /* this case can get it via util_pack_z_stencil() */ return 0; @@ -525,14 +531,14 @@ util_pack_z(enum pipe_format format, double z) * Pack Z and/or stencil values into a 32-bit value described by format. 
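Hand-checked values for the packing helpers above, for PIPE_FORMAT_Z24_UNORM_S8_USCALED (depth in the low 24 bits):

/*   util_pack_z(format, 1.0)                          == 0x00ffffff
 *   util_pack_z(format, 0.5)                          == 0x007fffff
 *   util_pack_mask_z_stencil(format, 0xffffff, 0xff)  == 0xffffffff
 *
 * For PIPE_FORMAT_S8_USCALED_Z24_UNORM the layout is flipped: depth is
 * shifted up by 8 bits and the stencil byte occupies bits 0..7. */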
* Note: it's assumed that z is in [0,1] and s in [0,255] */ -static INLINE uint -util_pack_z_stencil(enum pipe_format format, double z, uint s) +static INLINE uint32_t +util_pack_z_stencil(enum pipe_format format, double z, uint8_t s) { - unsigned packed = util_pack_z(format, z); + uint32_t packed = util_pack_z(format, z); switch (format) { case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - packed |= s << 24; + packed |= (uint32_t)s << 24; break; case PIPE_FORMAT_S8_USCALED_Z24_UNORM: packed |= s; diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h index 03198c91da4..1df6c872677 100644 --- a/src/gallium/auxiliary/util/u_sse.h +++ b/src/gallium/auxiliary/util/u_sse.h @@ -71,6 +71,96 @@ _mm_castps_si128(__m128 a) #endif /* defined(_MSC_VER) && _MSC_VER < 1500 */ +union m128i { + __m128i m; + ubyte ub[16]; + ushort us[8]; + uint ui[4]; +}; + +static INLINE void u_print_epi8(const char *name, __m128i r) +{ + union { __m128i m; ubyte ub[16]; } u; + u.m = r; + + debug_printf("%s: " + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x\n", + name, + u.ub[0], u.ub[1], u.ub[2], u.ub[3], + u.ub[4], u.ub[5], u.ub[6], u.ub[7], + u.ub[8], u.ub[9], u.ub[10], u.ub[11], + u.ub[12], u.ub[13], u.ub[14], u.ub[15]); +} + +static INLINE void u_print_epi16(const char *name, __m128i r) +{ + union { __m128i m; ushort us[8]; } u; + u.m = r; + + debug_printf("%s: " + "%04x/" + "%04x/" + "%04x/" + "%04x/" + "%04x/" + "%04x/" + "%04x/" + "%04x\n", + name, + u.us[0], u.us[1], u.us[2], u.us[3], + u.us[4], u.us[5], u.us[6], u.us[7]); +} + +static INLINE void u_print_epi32(const char *name, __m128i r) +{ + union { __m128i m; uint ui[4]; } u; + u.m = r; + + debug_printf("%s: " + "%08x/" + "%08x/" + "%08x/" + "%08x\n", + name, + u.ui[0], u.ui[1], u.ui[2], u.ui[3]); +} + +static INLINE void u_print_ps(const char *name, __m128 r) +{ + union { __m128 m; float f[4]; } u; + u.m = r; + + debug_printf("%s: " + "%f/" + "%f/" + "%f/" + "%f\n", + name, + u.f[0], u.f[1], u.f[2], u.f[3]); +} + + +#define U_DUMP_EPI32(a) u_print_epi32(#a, a) +#define U_DUMP_EPI16(a) u_print_epi16(#a, a) +#define U_DUMP_EPI8(a) u_print_epi8(#a, a) +#define U_DUMP_PS(a) u_print_ps(#a, a) + + #if defined(PIPE_ARCH_SSSE3) @@ -98,6 +188,68 @@ _mm_shuffle_epi8(__m128i a, __m128i mask) #endif /* !PIPE_ARCH_SSSE3 */ -#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */ + + +/* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of + * _mm_mul_epu32(). + * + * I suspect this works fine for us because one of our operands is + * always positive, but not sure that this can be used for general + * signed integer multiplication. + * + * This seems close enough to the speed of SSE4 and the real + * _mm_mullo_epi32() intrinsic as to not justify adding an sse4 + * dependency at this point. 
+ */ +static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b) +{ + __m128i a4 = _mm_srli_epi64(a, 32); /* shift by one dword */ + __m128i b4 = _mm_srli_epi64(b, 32); /* shift by one dword */ + __m128i ba = _mm_mul_epu32(b, a); /* multply dwords 0, 2 */ + __m128i b4a4 = _mm_mul_epu32(b4, a4); /* multiply dwords 1, 3 */ + + /* Interleave the results, either with shuffles or (slightly + * faster) direct bit operations: + */ +#if 0 + __m128i ba8 = _mm_shuffle_epi32(ba, 8); + __m128i b4a48 = _mm_shuffle_epi32(b4a4, 8); + __m128i result = _mm_unpacklo_epi32(ba8, b4a48); +#else + __m128i mask = _mm_setr_epi32(~0,0,~0,0); + __m128i ba_mask = _mm_and_si128(ba, mask); + __m128i b4a4_mask_shift = _mm_slli_epi64(b4a4, 32); + __m128i result = _mm_or_si128(ba_mask, b4a4_mask_shift); +#endif + + return result; +} + + +static INLINE void +transpose4_epi32(const __m128i * restrict a, + const __m128i * restrict b, + const __m128i * restrict c, + const __m128i * restrict d, + __m128i * restrict o, + __m128i * restrict p, + __m128i * restrict q, + __m128i * restrict r) +{ + __m128i t0 = _mm_unpacklo_epi32(*a, *b); + __m128i t1 = _mm_unpacklo_epi32(*c, *d); + __m128i t2 = _mm_unpackhi_epi32(*a, *b); + __m128i t3 = _mm_unpackhi_epi32(*c, *d); + + *o = _mm_unpacklo_epi64(t0, t1); + *p = _mm_unpackhi_epi64(t0, t1); + *q = _mm_unpacklo_epi64(t2, t3); + *r = _mm_unpackhi_epi64(t2, t3); +} + +#define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i)) + + +#endif /* PIPE_ARCH_SSE */ #endif /* U_SSE_H_ */ diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c index f7aa1403d08..44cadbfcdd0 100644 --- a/src/gallium/auxiliary/util/u_tile.c +++ b/src/gallium/auxiliary/util/u_tile.c @@ -217,6 +217,81 @@ z24s8_get_tile_rgba(const unsigned *src, } } +/*** PIPE_FORMAT_S8X24_USCALED ***/ + +/** + * Return S component as four uint32_t in [0..255]. Z part ignored. + */ +static void +s8x24_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = (float)((*src++ >> 24) & 0xff); + } + + p += dst_stride; + } +} + +/*** PIPE_FORMAT_X24S8_USCALED ***/ + +/** + * Return S component as four uint32_t in [0..255]. Z part ignored. + */ +static void +x24s8_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = (float)(*src++ & 0xff); + } + p += dst_stride; + } +} + + +/** + * Return S component as four uint32_t in [0..255]. Z part ignored. 
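Per 32-bit lane, the SSE2 fallback above amounts to keeping the low dword of a full 64-bit product; in two's complement the low 32 bits do not depend on the operands' signedness, so the result matches a true signed mullo. A scalar check (illustrative):

#include <stdint.h>

/* What mm_mullo_epi32() yields for one lane: _mm_mul_epu32() forms the
 * full 64-bit unsigned product and the interleave step keeps only its
 * low 32 bits. */
static uint32_t
mullo_lane(uint32_t a, uint32_t b)
{
   return (uint32_t)((uint64_t)a * (uint64_t)b);
}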
+ */ +static void +s8_get_tile_rgba(const unsigned char *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = (float)(*src++ & 0xff); + } + p += dst_stride; + } +} /*** PIPE_FORMAT_Z32_FLOAT ***/ @@ -261,10 +336,19 @@ pipe_tile_raw_to_rgba(enum pipe_format format, case PIPE_FORMAT_Z24X8_UNORM: s8z24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); break; + case PIPE_FORMAT_S8_USCALED: + s8_get_tile_rgba((unsigned char *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_X24S8_USCALED: + s8x24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; case PIPE_FORMAT_S8_USCALED_Z24_UNORM: case PIPE_FORMAT_X8Z24_UNORM: z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); break; + case PIPE_FORMAT_S8X24_USCALED: + x24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; case PIPE_FORMAT_Z32_FLOAT: z32f_get_tile_rgba((float *) src, w, h, dst, dst_stride); break; |
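With the new cases above, stencil-only readback goes through the same tile path as depth. A hedged usage sketch (the raw pointer and sizes are assumed to come from an existing transfer map):

/* Expand one tile of raw combined depth/stencil data as stencil: every
 * channel of every pixel receives the stencil byte as a float in
 * [0.0, 255.0]. */
pipe_tile_raw_to_rgba(PIPE_FORMAT_X24S8_USCALED,
                      raw,                   /* mapped source data */
                      width, height,
                      rgba, dst_stride);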