diff options
Diffstat (limited to 'src/gallium/auxiliary/translate/translate_sse.c')
-rw-r--r-- | src/gallium/auxiliary/translate/translate_sse.c | 1378 |
1 files changed, 1046 insertions, 332 deletions
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c index ef3aa674a34..f8bf5b46692 100644 --- a/src/gallium/auxiliary/translate/translate_sse.c +++ b/src/gallium/auxiliary/translate/translate_sse.c @@ -30,11 +30,12 @@ #include "pipe/p_compiler.h" #include "util/u_memory.h" #include "util/u_math.h" +#include "util/u_format.h" #include "translate.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) #include "rtasm/rtasm_cpu.h" #include "rtasm/rtasm_x86sse.h" @@ -46,21 +47,9 @@ #define W 3 -typedef void (PIPE_CDECL *run_func)( struct translate *translate, - unsigned start, - unsigned count, - unsigned instance_id, - void *output_buffer); - -typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, - const unsigned *elts, - unsigned count, - unsigned instance_id, - void *output_buffer); - struct translate_buffer { const void *base_ptr; - unsigned stride; + uintptr_t stride; unsigned max_index; }; @@ -73,21 +62,43 @@ struct translate_buffer_varient { #define ELEMENT_BUFFER_INSTANCE_ID 1001 +#define NUM_CONSTS 7 + +enum +{ + CONST_IDENTITY, + CONST_INV_127, + CONST_INV_255, + CONST_INV_32767, + CONST_INV_65535, + CONST_INV_2147483647, + CONST_255 +}; + +#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)} +static float consts[NUM_CONSTS][4] = { + {0, 0, 0, 1}, + C(1.0 / 127.0), + C(1.0 / 255.0), + C(1.0 / 32767.0), + C(1.0 / 65535.0), + C(1.0 / 2147483647.0), + C(255.0) +}; +#undef C struct translate_sse { struct translate translate; struct x86_function linear_func; struct x86_function elt_func; + struct x86_function elt16_func; + struct x86_function elt8_func; struct x86_function *func; - boolean loaded_identity; - boolean loaded_255; - boolean loaded_inv_255; - - float identity[4]; - float float_255[4]; - float inv_255[4]; + PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4]; + int8_t reg_to_const[16]; + int8_t const_to_reg[NUM_CONSTS]; struct translate_buffer buffer[PIPE_MAX_ATTRIBS]; unsigned nr_buffers; @@ -102,17 +113,16 @@ struct translate_sse { boolean use_instancing; unsigned instance_id; - run_func gen_run; - run_elts_func gen_run_elts; - /* these are actually known values, but putting them in a struct * like this is helpful to keep them in sync across the file. */ struct x86_reg tmp_EAX; - struct x86_reg idx_EBX; /* either start+i or &elt[i] */ - struct x86_reg outbuf_ECX; - struct x86_reg machine_EDX; - struct x86_reg count_ESI; /* decrements to zero */ + struct x86_reg tmp2_EDX; + struct x86_reg src_ECX; + struct x86_reg idx_ESI; /* either start+i or &elt[i] */ + struct x86_reg machine_EDI; + struct x86_reg outbuf_EBX; + struct x86_reg count_EBP; /* decrements to zero */ }; static int get_offset( const void *a, const void *b ) @@ -120,281 +130,950 @@ static int get_offset( const void *a, const void *b ) return (const char *)b - (const char *)a; } +static struct x86_reg get_const( struct translate_sse *p, unsigned id) +{ + struct x86_reg reg; + unsigned i; + if(p->const_to_reg[id] >= 0) + return x86_make_reg(file_XMM, p->const_to_reg[id]); -static struct x86_reg get_identity( struct translate_sse *p ) -{ - struct x86_reg reg = x86_make_reg(file_XMM, 6); - - if (!p->loaded_identity) { - p->loaded_identity = TRUE; - p->identity[0] = 0; - p->identity[1] = 0; - p->identity[2] = 0; - p->identity[3] = 1; - - sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, - get_offset(p, &p->identity[0]))); + for(i = 2; i < 8; ++i) + { + if(p->reg_to_const[i] < 0) + break; } + /* TODO: be smarter here */ + if(i == 8) + --i; + + reg = x86_make_reg(file_XMM, i); + + if(p->reg_to_const[i] >= 0) + p->const_to_reg[p->reg_to_const[i]] = -1; + + p->reg_to_const[i] = id; + p->const_to_reg[id] = i; + + /* TODO: this should happen outside the loop, if possible */ + sse_movaps(p->func, reg, + x86_make_disp(p->machine_EDI, + get_offset(p, &p->consts[id][0]))); + return reg; } -static struct x86_reg get_255( struct translate_sse *p ) +/* load the data in a SSE2 register, padding with zeros */ +static boolean emit_load_sse2( struct translate_sse *p, + struct x86_reg data, + struct x86_reg src, + unsigned size) { - struct x86_reg reg = x86_make_reg(file_XMM, 7); - - if (!p->loaded_255) { - p->loaded_255 = TRUE; - p->float_255[0] = - p->float_255[1] = - p->float_255[2] = - p->float_255[3] = 255.0f; - - sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, - get_offset(p, &p->float_255[0]))); + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + struct x86_reg tmp = p->tmp_EAX; + switch(size) + { + case 1: + x86_movzx8(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 2: + x86_movzx16(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 3: + x86_movzx8(p->func, tmp, x86_make_disp(src, 2)); + x86_shl_imm(p->func, tmp, 16); + x86_mov16(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 4: + sse2_movd(p->func, data, src); + break; + case 6: + sse2_movd(p->func, data, src); + x86_movzx16(p->func, tmp, x86_make_disp(src, 4)); + sse2_movd(p->func, tmpXMM, tmp); + sse2_punpckldq(p->func, data, tmpXMM); + break; + case 8: + sse2_movq(p->func, data, src); + break; + case 12: + sse2_movq(p->func, data, src); + sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8)); + sse2_punpcklqdq(p->func, data, tmpXMM); + break; + case 16: + sse2_movdqu(p->func, data, src); + break; + default: + return FALSE; } - - return reg; + return TRUE; } -static struct x86_reg get_inv_255( struct translate_sse *p ) +/* this value can be passed for the out_chans argument */ +#define CHANNELS_0001 5 + +/* this function will load #chans float values, and will + * pad the register with zeroes at least up to out_chans. + * + * If out_chans is set to CHANNELS_0001, then the fourth + * value will be padded with 1. Only pass this value if + * chans < 4 or results are undefined. + */ +static void emit_load_float32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0, + unsigned out_chans, + unsigned chans) { - struct x86_reg reg = x86_make_reg(file_XMM, 5); - - if (!p->loaded_inv_255) { - p->loaded_inv_255 = TRUE; - p->inv_255[0] = - p->inv_255[1] = - p->inv_255[2] = - p->inv_255[3] = 1.0f / 255.0f; - - sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, - get_offset(p, &p->inv_255[0]))); + switch(chans) + { + case 1: + /* a 0 0 0 + * a 0 0 1 + */ + sse_movss(p->func, data, arg0); + if(out_chans == CHANNELS_0001) + sse_orps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 2: + /* 0 0 0 1 + * a b 0 1 + */ + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + else if(out_chans > 2) + sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) ); + sse_movlps(p->func, data, arg0); + break; + case 3: + /* Have to jump through some hoops: + * + * c 0 0 0 + * c 0 0 1 if out_chans == CHANNELS_0001 + * 0 0 c 0/1 + * a b c 0/1 + */ + sse_movss(p->func, data, x86_make_disp(arg0, 8)); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) ); + sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); + sse_movlps(p->func, data, arg0); + break; + case 4: + sse_movups(p->func, data, arg0); + break; } - - return reg; } +/* this function behaves like emit_load_float32, but loads + 64-bit floating point numbers, converting them to 32-bit + ones */ +static void emit_load_float64to32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0, + unsigned out_chans, + unsigned chans) +{ + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + switch(chans) + { + case 1: + sse2_movsd(p->func, data, arg0); + if(out_chans > 1) + sse2_cvtpd2ps(p->func, data, data); + else + sse2_cvtsd2ss(p->func, data, data); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + break; + case 2: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + else if(out_chans > 2) + sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 3: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16)); + if(out_chans > 3) + sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); + else + sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM); + sse_movlhps(p->func, data, tmpXMM); + if(out_chans == CHANNELS_0001) + sse_orps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 4: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16)); + sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); + sse_movlhps(p->func, data, tmpXMM); + break; + } +} -static void emit_load_R32G32B32A32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm) { - sse_movups(p->func, data, arg0); + if(x86_target(p->func) != X86_32) + x64_mov64(p->func, dst_gpr, src_gpr); + else + { + /* TODO: when/on which CPUs is SSE2 actually better than SSE? */ + if(x86_target_caps(p->func) & X86_SSE2) + sse2_movq(p->func, dst_xmm, src_xmm); + else + sse_movlps(p->func, dst_xmm, src_xmm); + } } -static void emit_load_R32G32B32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src) { - /* Have to jump through some hoops: - * - * c 0 0 0 - * c 0 0 1 - * 0 0 c 1 - * a b c 1 - */ - sse_movss(p->func, data, x86_make_disp(arg0, 8)); - sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) ); - sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); - sse_movlps(p->func, data, arg0); + emit_mov64(p, dst_gpr, dst_xmm, src, src); } -static void emit_load_R32G32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm) { - /* 0 0 0 1 - * a b 0 1 - */ - sse_movups(p->func, data, get_identity(p) ); - sse_movlps(p->func, data, arg0); + emit_mov64(p, dst, dst, src_gpr, src_xmm); } +static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) +{ + if(x86_target_caps(p->func) & X86_SSE2) + sse2_movdqu(p->func, dst, src); + else + sse_movups(p->func, dst, src); +} -static void emit_load_R32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +/* TODO: this uses unaligned accesses liberally, which is great on Nehalem, + * but may or may not be good on older processors + * TODO: may perhaps want to use non-temporal stores here if possible + */ +static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size) { - /* a 0 0 0 - * a 0 0 1 - */ - sse_movss(p->func, data, arg0); - sse_orps(p->func, data, get_identity(p) ); + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1); + struct x86_reg dataGPR = p->tmp_EAX; + struct x86_reg dataGPR2 = p->tmp2_EDX; + + if(size < 8) + { + switch (size) + { + case 1: + x86_mov8(p->func, dataGPR, src); + x86_mov8(p->func, dst, dataGPR); + break; + case 2: + x86_mov16(p->func, dataGPR, src); + x86_mov16(p->func, dst, dataGPR); + break; + case 3: + x86_mov16(p->func, dataGPR, src); + x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2)); + x86_mov16(p->func, dst, dataGPR); + x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2); + break; + case 4: + x86_mov(p->func, dataGPR, src); + x86_mov(p->func, dst, dataGPR); + break; + case 6: + x86_mov(p->func, dataGPR, src); + x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4)); + x86_mov(p->func, dst, dataGPR); + x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2); + break; + } + } + else if(!(x86_target_caps(p->func) & X86_SSE)) + { + unsigned i = 0; + assert((size & 3) == 0); + for(i = 0; i < size; i += 4) + { + x86_mov(p->func, dataGPR, x86_make_disp(src, i)); + x86_mov(p->func, x86_make_disp(dst, i), dataGPR); + } + } + else + { + switch(size) + { + case 8: + emit_load64(p, dataGPR, dataXMM, src); + emit_store64(p, dst, dataGPR, dataXMM); + break; + case 12: + emit_load64(p, dataGPR2, dataXMM, src); + x86_mov(p->func, dataGPR, x86_make_disp(src, 8)); + emit_store64(p, dst, dataGPR2, dataXMM); + x86_mov(p->func, x86_make_disp(dst, 8), dataGPR); + break; + case 16: + emit_mov128(p, dataXMM, src); + emit_mov128(p, dst, dataXMM); + break; + case 24: + emit_mov128(p, dataXMM, src); + emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16)); + emit_mov128(p, dst, dataXMM); + emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2); + break; + case 32: + emit_mov128(p, dataXMM, src); + emit_mov128(p, dataXMM2, x86_make_disp(src, 16)); + emit_mov128(p, dst, dataXMM); + emit_mov128(p, x86_make_disp(dst, 16), dataXMM2); + break; + default: + assert(0); + } + } } +static boolean translate_attr_convert( struct translate_sse *p, + const struct translate_element *a, + struct x86_reg src, + struct x86_reg dst) -static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p, - struct x86_reg data, - struct x86_reg src ) { + const struct util_format_description* input_desc = util_format_description(a->input_format); + const struct util_format_description* output_desc = util_format_description(a->output_format); + unsigned i; + boolean id_swizzle = TRUE; + unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE}; + unsigned needed_chans = 0; + unsigned imms[2] = {0, 0x3f800000}; - /* Load and unpack twice: - */ - sse_movss(p->func, data, src); - sse2_punpcklbw(p->func, data, get_identity(p)); - sse2_punpcklbw(p->func, data, get_identity(p)); + if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE) + return FALSE; - /* Convert to float: - */ - sse2_cvtdq2ps(p->func, data, data); + if(input_desc->channel[0].size & 7) + return FALSE; + if(input_desc->colorspace != output_desc->colorspace) + return FALSE; - /* Scale by 1/255.0 - */ - sse_mulps(p->func, data, get_inv_255(p)); -} + for(i = 1; i < input_desc->nr_channels; ++i) + { + if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0]))) + return FALSE; + } + for(i = 1; i < output_desc->nr_channels; ++i) + { + if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0]))) + return FALSE; + } + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(output_desc->swizzle[i] < 4) + swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i]; + } + if((x86_target_caps(p->func) & X86_SSE) && (0 + || a->output_format == PIPE_FORMAT_R32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); -static void emit_store_R32G32B32A32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - sse_movups(p->func, dest, dataXMM); -} + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) + swizzle[i] = i; + } -static void emit_store_R32G32B32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - /* Emit two, shuffle, emit one. - */ - sse_movlps(p->func, dest, dataXMM); - sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */ - sse_movss(p->func, x86_make_disp(dest,8), dataXMM); -} + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] < 4) + needed_chans = MAX2(needed_chans, swizzle[i] + 1); + if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) + id_swizzle = FALSE; + } -static void emit_store_R32G32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - sse_movlps(p->func, dest, dataXMM); -} + if(needed_chans > 0) + { + switch(input_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + /* TODO: add support for SSE4.1 pmovzx */ + switch(input_desc->channel[0].size) + { + case 8: + /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */ + sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + break; + case 16: + sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + break; + case 32: /* we lose precision here */ + sse2_psrld_imm(p->func, dataXMM, 1); + break; + default: + return FALSE; + } + sse2_cvtdq2ps(p->func, dataXMM, dataXMM); + if(input_desc->channel[0].normalized) + { + struct x86_reg factor; + switch(input_desc->channel[0].size) + { + case 8: + factor = get_const(p, CONST_INV_255); + break; + case 16: + factor = get_const(p, CONST_INV_65535); + break; + case 32: + factor = get_const(p, CONST_INV_2147483647); + break; + default: + assert(0); + factor.disp = 0; + factor.file = 0; + factor.idx = 0; + factor.mod = 0; + break; + } + sse_mulps(p->func, dataXMM, factor); + } + else if(input_desc->channel[0].size == 32) + sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */ + break; + case UTIL_FORMAT_TYPE_SIGNED: + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + /* TODO: add support for SSE4.1 pmovsx */ + switch(input_desc->channel[0].size) + { + case 8: + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_psrad_imm(p->func, dataXMM, 24); + break; + case 16: + sse2_punpcklwd(p->func, dataXMM, dataXMM); + sse2_psrad_imm(p->func, dataXMM, 16); + break; + case 32: /* we lose precision here */ + break; + default: + return FALSE; + } + sse2_cvtdq2ps(p->func, dataXMM, dataXMM); + if(input_desc->channel[0].normalized) + { + struct x86_reg factor; + switch(input_desc->channel[0].size) + { + case 8: + factor = get_const(p, CONST_INV_127); + break; + case 16: + factor = get_const(p, CONST_INV_32767); + break; + case 32: + factor = get_const(p, CONST_INV_2147483647); + break; + default: + assert(0); + factor.disp = 0; + factor.file = 0; + factor.idx = 0; + factor.mod = 0; + break; + } + sse_mulps(p->func, dataXMM, factor); + } + break; + + break; + case UTIL_FORMAT_TYPE_FLOAT: + if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64) + return FALSE; + if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3) + { + swizzle[3] = UTIL_FORMAT_SWIZZLE_W; + needed_chans = CHANNELS_0001; + } + switch(input_desc->channel[0].size) + { + case 32: + emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels); + break; + case 64: /* we lose precision here */ + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels); + break; + default: + return FALSE; + } + break; + default: + return FALSE; + } -static void emit_store_R32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - sse_movss(p->func, dest, dataXMM); -} + if(!id_swizzle) + sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) ); + } + if(output_desc->nr_channels >= 4 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 + ) + sse_movups(p->func, dst, dataXMM); + else + { + if(output_desc->nr_channels >= 2 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + sse_movlps(p->func, dst, dataXMM); + else + { + if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) + sse_movss(p->func, dst, dataXMM); + else + x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 2) + { + if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3)); + sse_movss(p->func, x86_make_disp(dst, 4), dataXMM); + } + else + x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); + } + } + if(output_desc->nr_channels >= 3) + { + if(output_desc->nr_channels >= 4 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM); + else + { + if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3)); + sse_movss(p->func, x86_make_disp(dst, 8), dataXMM); + } + else + x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 4) + { + if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3)); + sse_movss(p->func, x86_make_disp(dst, 12), dataXMM); + } + else + x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); + } + } + } + } + return TRUE; + } + else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16 + && output_desc->channel[0].normalized == input_desc->channel[0].normalized + && (0 + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + )) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + struct x86_reg tmp = p->tmp_EAX; + unsigned imms[2] = {0, 1}; + + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) + swizzle[i] = i; + } -static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - /* Scale by 255.0 - */ - sse_mulps(p->func, dataXMM, get_255(p)); + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] < 4) + needed_chans = MAX2(needed_chans, swizzle[i] + 1); + if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) + id_swizzle = FALSE; + } - /* Pack and emit: - */ - sse2_cvtps2dq(p->func, dataXMM, dataXMM); - sse2_packssdw(p->func, dataXMM, dataXMM); - sse2_packuswb(p->func, dataXMM, dataXMM); - sse_movss(p->func, dest, dataXMM); -} + if(needed_chans > 0) + { + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + switch(input_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + if(input_desc->channel[0].normalized) + { + sse2_punpcklbw(p->func, dataXMM, dataXMM); + if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + sse2_psrlw_imm(p->func, dataXMM, 1); + } + else + sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + break; + case UTIL_FORMAT_TYPE_SIGNED: + if(input_desc->channel[0].normalized) + { + sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY)); + sse2_punpcklbw(p->func, tmpXMM, dataXMM); + sse2_psllw_imm(p->func, dataXMM, 9); + sse2_psrlw_imm(p->func, dataXMM, 8); + sse2_por(p->func, tmpXMM, dataXMM); + sse2_psrlw_imm(p->func, dataXMM, 7); + sse2_por(p->func, tmpXMM, dataXMM); + { + struct x86_reg t = dataXMM; + dataXMM = tmpXMM; + tmpXMM = t; + } + } + else + { + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_psraw_imm(p->func, dataXMM, 8); + } + break; + default: + assert(0); + } + if(output_desc->channel[0].normalized) + imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff; + if(!id_swizzle) + sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6)); + } + if(output_desc->nr_channels >= 4 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 + ) + sse2_movq(p->func, dst, dataXMM); + else + { + if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) + { + if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + sse2_movd(p->func, dst, dataXMM); + else + { + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, dst, tmp); + if(output_desc->nr_channels >= 2) + x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); + } + } + else + { + if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0) + x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + else + { + x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + if(output_desc->nr_channels >= 2) + { + sse2_movd(p->func, tmp, dataXMM); + x86_shr_imm(p->func, tmp, 16); + x86_mov16(p->func, x86_make_disp(dst, 2), tmp); + } + } + } + if(output_desc->nr_channels >= 3) + { + if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) + { + if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + { + sse2_psrlq_imm(p->func, dataXMM, 32); + sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM); + } + else + { + sse2_psrlq_imm(p->func, dataXMM, 32); + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, x86_make_disp(dst, 4), tmp); + if(output_desc->nr_channels >= 4) + { + x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); + } + } + } + else + { + if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0) + x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + else + { + x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 4) + { + sse2_psrlq_imm(p->func, dataXMM, 48); + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, x86_make_disp(dst, 6), tmp); + } + } + } + } + } + return TRUE; + } + else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0]))) + { + struct x86_reg tmp = p->tmp_EAX; + unsigned i; + if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4 + && swizzle[0] == UTIL_FORMAT_SWIZZLE_W + && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z + && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y + && swizzle[3] == UTIL_FORMAT_SWIZZLE_X) + { + /* TODO: support movbe */ + x86_mov(p->func, tmp, src); + x86_bswap(p->func, tmp); + x86_mov(p->func, dst, tmp); + return TRUE; + } -/* Extended swizzles? Maybe later. - */ -static void emit_swizzle( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg src, - unsigned char shuffle ) -{ - sse_shufps(p->func, dest, src, shuffle); -} + for(i = 0; i < output_desc->nr_channels; ++i) + { + switch(output_desc->channel[0].size) + { + case 8: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[0].normalized ? 0xff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[0].normalized ? 0x7f : 1; + break; + default: + return FALSE; + } + } + x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v); + } + else + { + x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1)); + x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp); + } + break; + case 16: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[1].normalized ? 0xffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[1].normalized ? 0x7fff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + v = 0x3c00; + break; + default: + return FALSE; + } + } + x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v); + } + else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0) + x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0); + else + { + x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2)); + x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp); + } + break; + case 32: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[1].normalized ? 0x7fffffff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + v = 0x3f800000; + break; + default: + return FALSE; + } + } + x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v); + } + else + { + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4)); + x86_mov(p->func, x86_make_disp(dst, i * 4), tmp); + } + break; + case 64: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned l = 0; + unsigned h = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + h = output_desc->channel[1].normalized ? 0xffffffff : 0; + l = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + h = output_desc->channel[1].normalized ? 0x7fffffff : 0; + l = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + h = 0x3ff00000; + l = 0; + break; + default: + return FALSE; + } + } + x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l); + x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h); + } + else + { + if(x86_target_caps(p->func) & X86_SSE) + { + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0); + emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8)); + emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM); + } + else + { + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8)); + x86_mov(p->func, x86_make_disp(dst, i * 8), tmp); + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4)); + x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp); + } + } + break; + default: + return FALSE; + } + } + return TRUE; + } + /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */ + else if((x86_target_caps(p->func) & X86_SSE2) && + a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0 + || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM + || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM + )) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + /* load */ + sse_movups(p->func, dataXMM, src); -static boolean translate_attr( struct translate_sse *p, - const struct translate_element *a, - struct x86_reg srcECX, - struct x86_reg dstEAX) -{ - struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) + sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3)); - switch (a->input_format) { - case PIPE_FORMAT_R32_FLOAT: - emit_load_R32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32_FLOAT: - emit_load_R32G32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32B32_FLOAT: - emit_load_R32G32B32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - emit_load_R32G32B32A32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); - emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); - break; - default: - return FALSE; - } + /* scale by 255.0 */ + sse_mulps(p->func, dataXMM, get_const(p, CONST_255)); - switch (a->output_format) { - case PIPE_FORMAT_R32_FLOAT: - emit_store_R32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32_FLOAT: - emit_store_R32G32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32B32_FLOAT: - emit_store_R32G32B32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - emit_store_R32G32B32A32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); - break; - default: - return FALSE; + /* pack and emit */ + sse2_cvtps2dq(p->func, dataXMM, dataXMM); + sse2_packssdw(p->func, dataXMM, dataXMM); + sse2_packuswb(p->func, dataXMM, dataXMM); + sse2_movd(p->func, dst, dataXMM); + + return TRUE; } - return TRUE; + return FALSE; } +static boolean translate_attr( struct translate_sse *p, + const struct translate_element *a, + struct x86_reg src, + struct x86_reg dst) +{ + if(a->input_format == a->output_format) + { + emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1)); + return TRUE; + } + + return translate_attr_convert(p, a, src, dst); +} static boolean init_inputs( struct translate_sse *p, - boolean linear ) + unsigned index_size ) { unsigned i; - struct x86_reg instance_id = x86_make_disp(p->machine_EDX, + struct x86_reg instance_id = x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); for (i = 0; i < p->nr_buffer_varients; i++) { struct translate_buffer_varient *varient = &p->buffer_varient[i]; struct translate_buffer *buffer = &p->buffer[varient->buffer_index]; - if (linear || varient->instance_divisor) { - struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, + if (!index_size || varient->instance_divisor) { + struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride)); - struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &varient->ptr)); - struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr)); - struct x86_reg elt = p->idx_EBX; + struct x86_reg elt = p->idx_ESI; struct x86_reg tmp_EAX = p->tmp_EAX; /* Calculate pointer to first attrib: @@ -406,20 +1085,16 @@ static boolean init_inputs( struct translate_sse *p, x86_mov(p->func, tmp_EAX, instance_id); if (varient->instance_divisor != 1) { - struct x86_reg tmp_EDX = p->machine_EDX; - struct x86_reg tmp_ECX = p->outbuf_ECX; + struct x86_reg tmp_EDX = p->tmp2_EDX; + struct x86_reg tmp_ECX = p->src_ECX; /* TODO: Add x86_shr() to rtasm and use it whenever * instance divisor is power of two. */ - x86_push(p->func, tmp_EDX); - x86_push(p->func, tmp_ECX); x86_xor(p->func, tmp_EDX, tmp_EDX); x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor); x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ - x86_pop(p->func, tmp_ECX); - x86_pop(p->func, tmp_EDX); } } else { x86_mov(p->func, tmp_EAX, elt); @@ -430,16 +1105,23 @@ static boolean init_inputs( struct translate_sse *p, */ x86_imul(p->func, tmp_EAX, buf_stride); + x64_rexw(p->func); x86_add(p->func, tmp_EAX, buf_base_ptr); /* In the linear case, keep the buffer pointer instead of the * index number. */ - if (linear && p->nr_buffer_varients == 1) + if (!index_size && p->nr_buffer_varients == 1) + { + x64_rexw(p->func); x86_mov(p->func, elt, tmp_EAX); + } else + { + x64_rexw(p->func); x86_mov(p->func, buf_ptr, tmp_EAX); + } } } @@ -448,44 +1130,57 @@ static boolean init_inputs( struct translate_sse *p, static struct x86_reg get_buffer_ptr( struct translate_sse *p, - boolean linear, + unsigned index_size, unsigned var_idx, struct x86_reg elt ) { if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) { - return x86_make_disp(p->machine_EDX, + return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); } - if (linear && p->nr_buffer_varients == 1) { - return p->idx_EBX; + if (!index_size && p->nr_buffer_varients == 1) { + return p->idx_ESI; } - else if (linear || p->buffer_varient[var_idx].instance_divisor) { - struct x86_reg ptr = p->tmp_EAX; + else if (!index_size || p->buffer_varient[var_idx].instance_divisor) { + struct x86_reg ptr = p->src_ECX; struct x86_reg buf_ptr = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer_varient[var_idx].ptr)); + x64_rexw(p->func); x86_mov(p->func, ptr, buf_ptr); return ptr; } else { - struct x86_reg ptr = p->tmp_EAX; + struct x86_reg ptr = p->src_ECX; const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx]; struct x86_reg buf_stride = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[varient->buffer_index].stride)); struct x86_reg buf_base_ptr = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[varient->buffer_index].base_ptr)); /* Calculate pointer to current attrib: */ - x86_mov(p->func, ptr, buf_stride); - x86_imul(p->func, ptr, elt); + switch(index_size) + { + case 1: + x86_movzx8(p->func, ptr, elt); + break; + case 2: + x86_movzx16(p->func, ptr, elt); + break; + case 4: + x86_mov(p->func, ptr, elt); + break; + } + x86_imul(p->func, ptr, buf_stride); + x64_rexw(p->func); x86_add(p->func, ptr, buf_base_ptr); return ptr; } @@ -494,39 +1189,43 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p, static boolean incr_inputs( struct translate_sse *p, - boolean linear ) + unsigned index_size ) { - if (linear && p->nr_buffer_varients == 1) { - struct x86_reg stride = x86_make_disp(p->machine_EDX, + if (!index_size && p->nr_buffer_varients == 1) { + struct x86_reg stride = x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[0].stride)); if (p->buffer_varient[0].instance_divisor == 0) { - x86_add(p->func, p->idx_EBX, stride); - sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192)); + x64_rexw(p->func); + x86_add(p->func, p->idx_ESI, stride); + sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192)); } } - else if (linear) { + else if (!index_size) { unsigned i; /* Is this worthwhile?? */ for (i = 0; i < p->nr_buffer_varients; i++) { struct translate_buffer_varient *varient = &p->buffer_varient[i]; - struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &varient->ptr)); - struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, + struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[varient->buffer_index].stride)); if (varient->instance_divisor == 0) { - x86_mov(p->func, p->tmp_EAX, buf_ptr); - x86_add(p->func, p->tmp_EAX, buf_stride); + x86_mov(p->func, p->tmp_EAX, buf_stride); + x64_rexw(p->func); + x86_add(p->func, p->tmp_EAX, buf_ptr); if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192)); + x64_rexw(p->func); x86_mov(p->func, buf_ptr, p->tmp_EAX); } } } else { - x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4)); + x64_rexw(p->func); + x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size)); } return TRUE; @@ -551,35 +1250,52 @@ static boolean incr_inputs( struct translate_sse *p, */ static boolean build_vertex_emit( struct translate_sse *p, struct x86_function *func, - boolean linear ) + unsigned index_size ) { int fixup, label; unsigned j; + memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const)); + memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg)); + p->tmp_EAX = x86_make_reg(file_REG32, reg_AX); - p->idx_EBX = x86_make_reg(file_REG32, reg_BX); - p->outbuf_ECX = x86_make_reg(file_REG32, reg_CX); - p->machine_EDX = x86_make_reg(file_REG32, reg_DX); - p->count_ESI = x86_make_reg(file_REG32, reg_SI); + p->idx_ESI = x86_make_reg(file_REG32, reg_SI); + p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX); + p->machine_EDI = x86_make_reg(file_REG32, reg_DI); + p->count_EBP = x86_make_reg(file_REG32, reg_BP); + p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX); + p->src_ECX = x86_make_reg(file_REG32, reg_CX); p->func = func; - p->loaded_inv_255 = FALSE; - p->loaded_255 = FALSE; - p->loaded_identity = FALSE; x86_init_func(p->func); - /* Push a few regs? - */ - x86_push(p->func, p->idx_EBX); - x86_push(p->func, p->count_ESI); + if(x86_target(p->func) == X86_64_WIN64_ABI) + { + /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */ + sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6)); + sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7)); + } - /* Load arguments into regs: - */ - x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1)); - x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2)); - x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3)); - x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5)); + x86_push(p->func, p->outbuf_EBX); + x86_push(p->func, p->count_EBP); + +/* on non-Win64 x86-64, these are already in the right registers */ + if(x86_target(p->func) != X86_64_STD_ABI) + { + x86_push(p->func, p->machine_EDI); + x86_push(p->func, p->idx_ESI); + + x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); + x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); + } + + x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3)); + + if(x86_target(p->func) != X86_32) + x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5)); + else + x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5)); /* Load instance ID. */ @@ -588,25 +1304,25 @@ static boolean build_vertex_emit( struct translate_sse *p, p->tmp_EAX, x86_fn_arg(p->func, 4)); x86_mov(p->func, - x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)), + x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)), p->tmp_EAX); } /* Get vertex count, compare to zero */ x86_xor(p->func, p->tmp_EAX, p->tmp_EAX); - x86_cmp(p->func, p->count_ESI, p->tmp_EAX); + x86_cmp(p->func, p->count_EBP, p->tmp_EAX); fixup = x86_jcc_forward(p->func, cc_E); /* always load, needed or not: */ - init_inputs(p, linear); + init_inputs(p, index_size); /* Note address for loop jump */ label = x86_get_label(p->func); { - struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX); + struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI); int last_varient = -1; struct x86_reg vb; @@ -618,30 +1334,31 @@ static boolean build_vertex_emit( struct translate_sse *p, */ if (varient != last_varient) { last_varient = varient; - vb = get_buffer_ptr(p, linear, varient, elt); + vb = get_buffer_ptr(p, index_size, varient, elt); } if (!translate_attr( p, a, x86_make_disp(vb, a->input_offset), - x86_make_disp(p->outbuf_ECX, a->output_offset))) + x86_make_disp(p->outbuf_EBX, a->output_offset))) return FALSE; } /* Next output vertex: */ + x64_rexw(p->func); x86_lea(p->func, - p->outbuf_ECX, - x86_make_disp(p->outbuf_ECX, + p->outbuf_EBX, + x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride)); /* Incr index */ - incr_inputs( p, linear ); + incr_inputs( p, index_size ); } /* decr count, loop if not zero */ - x86_dec(p->func, p->count_ESI); + x86_dec(p->func, p->count_EBP); x86_jcc(p->func, cc_NZ, label); /* Exit mmx state? @@ -656,8 +1373,20 @@ static boolean build_vertex_emit( struct translate_sse *p, /* Pop regs and return */ - x86_pop(p->func, p->count_ESI); - x86_pop(p->func, p->idx_EBX); + if(x86_target(p->func) != X86_64_STD_ABI) + { + x86_pop(p->func, p->idx_ESI); + x86_pop(p->func, p->machine_EDI); + } + + x86_pop(p->func, p->count_EBP); + x86_pop(p->func, p->outbuf_EBX); + + if(x86_target(p->func) == X86_64_WIN64_ABI) + { + sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8)); + sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24)); + } x86_ret(p->func); return TRUE; @@ -697,37 +1426,7 @@ static void translate_sse_release( struct translate *translate ) x86_release_func( &p->linear_func ); x86_release_func( &p->elt_func ); - FREE(p); -} - -static void PIPE_CDECL translate_sse_run_elts( struct translate *translate, - const unsigned *elts, - unsigned count, - unsigned instance_id, - void *output_buffer ) -{ - struct translate_sse *p = (struct translate_sse *)translate; - - p->gen_run_elts( translate, - elts, - count, - instance_id, - output_buffer); -} - -static void PIPE_CDECL translate_sse_run( struct translate *translate, - unsigned start, - unsigned count, - unsigned instance_id, - void *output_buffer ) -{ - struct translate_sse *p = (struct translate_sse *)translate; - - p->gen_run( translate, - start, - count, - instance_id, - output_buffer); + os_free_aligned(p); } @@ -736,18 +1435,19 @@ struct translate *translate_sse2_create( const struct translate_key *key ) struct translate_sse *p = NULL; unsigned i; - if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2()) + /* this is misnamed, it actually refers to whether rtasm is enabled or not */ + if (!rtasm_cpu_has_sse()) goto fail; - p = CALLOC_STRUCT( translate_sse ); + p = os_malloc_aligned(sizeof(struct translate_sse), 16); if (p == NULL) goto fail; + memset(p, 0, sizeof(*p)); + memcpy(p->consts, consts, sizeof(consts)); p->translate.key = *key; p->translate.release = translate_sse_release; p->translate.set_buffer = translate_sse_set_buffer; - p->translate.run_elts = translate_sse_run_elts; - p->translate.run = translate_sse_run; for (i = 0; i < key->nr_elements; i++) { if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) { @@ -783,18 +1483,32 @@ struct translate *translate_sse2_create( const struct translate_key *key ) if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers); - if (!build_vertex_emit(p, &p->linear_func, TRUE)) + if (!build_vertex_emit(p, &p->linear_func, 0)) + goto fail; + + if (!build_vertex_emit(p, &p->elt_func, 4)) + goto fail; + + if (!build_vertex_emit(p, &p->elt16_func, 2)) + goto fail; + + if (!build_vertex_emit(p, &p->elt8_func, 1)) + goto fail; + + p->translate.run = (void*)x86_get_func(&p->linear_func); + if (p->translate.run == NULL) goto fail; - if (!build_vertex_emit(p, &p->elt_func, FALSE)) + p->translate.run_elts = (void*)x86_get_func(&p->elt_func); + if (p->translate.run_elts == NULL) goto fail; - p->gen_run = (run_func)x86_get_func(&p->linear_func); - if (p->gen_run == NULL) + p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func); + if (p->translate.run_elts16 == NULL) goto fail; - p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func); - if (p->gen_run_elts == NULL) + p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func); + if (p->translate.run_elts8 == NULL) goto fail; return &p->translate; |