diff options
-rw-r--r-- | src/gallium/auxiliary/translate/translate_sse.c | 946 |
1 files changed, 474 insertions, 472 deletions
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c index a72454a808a..b6bc22227d6 100644 --- a/src/gallium/auxiliary/translate/translate_sse.c +++ b/src/gallium/auxiliary/translate/translate_sse.c @@ -47,16 +47,18 @@ #define W 3 -struct translate_buffer { +struct translate_buffer +{ const void *base_ptr; uintptr_t stride; unsigned max_index; }; -struct translate_buffer_variant { +struct translate_buffer_variant +{ unsigned buffer_index; unsigned instance_divisor; - void *ptr; /* updated either per vertex or per instance */ + void *ptr; /* updated either per vertex or per instance */ }; @@ -77,17 +79,19 @@ enum #define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)} static float consts[NUM_CONSTS][4] = { - {0, 0, 0, 1}, - C(1.0 / 127.0), - C(1.0 / 255.0), - C(1.0 / 32767.0), - C(1.0 / 65535.0), - C(1.0 / 2147483647.0), - C(255.0) + {0, 0, 0, 1}, + C(1.0 / 127.0), + C(1.0 / 255.0), + C(1.0 / 32767.0), + C(1.0 / 65535.0), + C(1.0 / 2147483647.0), + C(255.0) }; + #undef C -struct translate_sse { +struct translate_sse +{ struct translate translate; struct x86_function linear_func; @@ -96,7 +100,7 @@ struct translate_sse { struct x86_function elt8_func; struct x86_function *func; - PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4]; + PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4]; int8_t reg_to_const[16]; int8_t const_to_reg[NUM_CONSTS]; @@ -120,38 +124,41 @@ struct translate_sse { struct x86_reg tmp_EAX; struct x86_reg tmp2_EDX; struct x86_reg src_ECX; - struct x86_reg idx_ESI; /* either start+i or &elt[i] */ + struct x86_reg idx_ESI; /* either start+i or &elt[i] */ struct x86_reg machine_EDI; struct x86_reg outbuf_EBX; struct x86_reg count_EBP; /* decrements to zero */ }; -static int get_offset( const void *a, const void *b ) + +static int +get_offset(const void *a, const void *b) { - return (const char *)b - (const char *)a; + return (const char *) b - (const char *) a; } -static struct x86_reg get_const( struct translate_sse *p, unsigned id) + +static struct x86_reg +get_const(struct translate_sse *p, unsigned id) { struct x86_reg reg; unsigned i; - if(p->const_to_reg[id] >= 0) + if (p->const_to_reg[id] >= 0) return x86_make_reg(file_XMM, p->const_to_reg[id]); - for(i = 2; i < 8; ++i) - { - if(p->reg_to_const[i] < 0) + for (i = 2; i < 8; ++i) { + if (p->reg_to_const[i] < 0) break; } /* TODO: be smarter here */ - if(i == 8) + if (i == 8) --i; reg = x86_make_reg(file_XMM, i); - if(p->reg_to_const[i] >= 0) + if (p->reg_to_const[i] >= 0) p->const_to_reg[p->reg_to_const[i]] = -1; p->reg_to_const[i] = id; @@ -159,22 +166,21 @@ static struct x86_reg get_const( struct translate_sse *p, unsigned id) /* TODO: this should happen outside the loop, if possible */ sse_movaps(p->func, reg, - x86_make_disp(p->machine_EDI, - get_offset(p, &p->consts[id][0]))); + x86_make_disp(p->machine_EDI, + get_offset(p, &p->consts[id][0]))); return reg; } + /* load the data in a SSE2 register, padding with zeros */ -static boolean emit_load_sse2( struct translate_sse *p, - struct x86_reg data, - struct x86_reg src, - unsigned size) +static boolean +emit_load_sse2(struct translate_sse *p, + struct x86_reg data, struct x86_reg src, unsigned size) { struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); struct x86_reg tmp = p->tmp_EAX; - switch(size) - { + switch (size) { case 1: x86_movzx8(p->func, tmp, src); sse2_movd(p->func, data, tmp); @@ -215,9 +221,11 @@ static boolean emit_load_sse2( struct translate_sse *p, return TRUE; } + /* this value can be passed for the out_chans argument */ #define CHANNELS_0001 5 + /* this function will load #chans float values, and will * pad the register with zeroes at least up to out_chans. * @@ -225,30 +233,28 @@ static boolean emit_load_sse2( struct translate_sse *p, * value will be padded with 1. Only pass this value if * chans < 4 or results are undefined. */ -static void emit_load_float32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0, - unsigned out_chans, - unsigned chans) +static void +emit_load_float32(struct translate_sse *p, struct x86_reg data, + struct x86_reg arg0, unsigned out_chans, unsigned chans) { - switch(chans) - { + switch (chans) { case 1: /* a 0 0 0 * a 0 0 1 */ sse_movss(p->func, data, arg0); - if(out_chans == CHANNELS_0001) - sse_orps(p->func, data, get_const(p, CONST_IDENTITY) ); + if (out_chans == CHANNELS_0001) + sse_orps(p->func, data, get_const(p, CONST_IDENTITY)); break; case 2: /* 0 0 0 1 * a b 0 1 */ - if(out_chans == CHANNELS_0001) - sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); - else if(out_chans > 2) - sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) ); + if (out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), + SHUF(X, Y, Z, W)); + else if (out_chans > 2) + sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY)); sse_movlps(p->func, data, arg0); break; case 3: @@ -260,9 +266,10 @@ static void emit_load_float32( struct translate_sse *p, * a b c 0/1 */ sse_movss(p->func, data, x86_make_disp(arg0, 8)); - if(out_chans == CHANNELS_0001) - sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) ); - sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); + if (out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), + SHUF(X, Y, Z, W)); + sse_shufps(p->func, data, data, SHUF(Y, Z, X, W)); sse_movlps(p->func, data, arg0); break; case 4: @@ -274,43 +281,42 @@ static void emit_load_float32( struct translate_sse *p, /* this function behaves like emit_load_float32, but loads 64-bit floating point numbers, converting them to 32-bit ones */ -static void emit_load_float64to32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0, - unsigned out_chans, - unsigned chans) +static void +emit_load_float64to32(struct translate_sse *p, struct x86_reg data, + struct x86_reg arg0, unsigned out_chans, unsigned chans) { struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); - switch(chans) - { + switch (chans) { case 1: sse2_movsd(p->func, data, arg0); - if(out_chans > 1) + if (out_chans > 1) sse2_cvtpd2ps(p->func, data, data); else sse2_cvtsd2ss(p->func, data, data); - if(out_chans == CHANNELS_0001) - sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + if (out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), + SHUF(X, Y, Z, W)); break; case 2: sse2_movupd(p->func, data, arg0); sse2_cvtpd2ps(p->func, data, data); - if(out_chans == CHANNELS_0001) - sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); - else if(out_chans > 2) - sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) ); - break; + if (out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), + SHUF(X, Y, Z, W)); + else if (out_chans > 2) + sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY)); + break; case 3: sse2_movupd(p->func, data, arg0); sse2_cvtpd2ps(p->func, data, data); sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16)); - if(out_chans > 3) + if (out_chans > 3) sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); else sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM); sse_movlhps(p->func, data, tmpXMM); - if(out_chans == CHANNELS_0001) - sse_orps(p->func, data, get_const(p, CONST_IDENTITY) ); + if (out_chans == CHANNELS_0001) + sse_orps(p->func, data, get_const(p, CONST_IDENTITY)); break; case 4: sse2_movupd(p->func, data, arg0); @@ -322,53 +328,65 @@ static void emit_load_float64to32( struct translate_sse *p, } } -static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm) + +static void +emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, + struct x86_reg dst_xmm, struct x86_reg src_gpr, + struct x86_reg src_xmm) { - if(x86_target(p->func) != X86_32) + if (x86_target(p->func) != X86_32) x64_mov64(p->func, dst_gpr, src_gpr); - else - { + else { /* TODO: when/on which CPUs is SSE2 actually better than SSE? */ - if(x86_target_caps(p->func) & X86_SSE2) + if (x86_target_caps(p->func) & X86_SSE2) sse2_movq(p->func, dst_xmm, src_xmm); else sse_movlps(p->func, dst_xmm, src_xmm); } } -static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src) + +static void +emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, + struct x86_reg dst_xmm, struct x86_reg src) { emit_mov64(p, dst_gpr, dst_xmm, src, src); } -static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm) + +static void +emit_store64(struct translate_sse *p, struct x86_reg dst, + struct x86_reg src_gpr, struct x86_reg src_xmm) { emit_mov64(p, dst, dst, src_gpr, src_xmm); } -static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) + +static void +emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) { - if(x86_target_caps(p->func) & X86_SSE2) + if (x86_target_caps(p->func) & X86_SSE2) sse2_movdqu(p->func, dst, src); else sse_movups(p->func, dst, src); } + /* TODO: this uses unaligned accesses liberally, which is great on Nehalem, * but may or may not be good on older processors * TODO: may perhaps want to use non-temporal stores here if possible */ -static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size) +static void +emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, + unsigned size) { struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1); struct x86_reg dataGPR = p->tmp_EAX; struct x86_reg dataGPR2 = p->tmp2_EDX; - if(size < 8) - { - switch (size) - { + if (size < 8) { + switch (size) { case 1: x86_mov8(p->func, dataGPR, src); x86_mov8(p->func, dst, dataGPR); @@ -395,20 +413,16 @@ static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_ break; } } - else if(!(x86_target_caps(p->func) & X86_SSE)) - { + else if (!(x86_target_caps(p->func) & X86_SSE)) { unsigned i = 0; assert((size & 3) == 0); - for(i = 0; i < size; i += 4) - { + for (i = 0; i < size; i += 4) { x86_mov(p->func, dataGPR, x86_make_disp(src, i)); x86_mov(p->func, x86_make_disp(dst, i), dataGPR); } } - else - { - switch(size) - { + else { + switch (size) { case 8: emit_load64(p, dataGPR, dataXMM, src); emit_store64(p, dst, dataGPR, dataXMM); @@ -441,101 +455,104 @@ static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_ } } -static boolean translate_attr_convert( struct translate_sse *p, - const struct translate_element *a, - struct x86_reg src, - struct x86_reg dst) - +static boolean +translate_attr_convert(struct translate_sse *p, + const struct translate_element *a, + struct x86_reg src, struct x86_reg dst) { - const struct util_format_description* input_desc = util_format_description(a->input_format); - const struct util_format_description* output_desc = util_format_description(a->output_format); + const struct util_format_description *input_desc = + util_format_description(a->input_format); + const struct util_format_description *output_desc = + util_format_description(a->output_format); unsigned i; boolean id_swizzle = TRUE; - unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE}; + unsigned swizzle[4] = + { UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, + UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE }; unsigned needed_chans = 0; - unsigned imms[2] = {0, 0x3f800000}; + unsigned imms[2] = { 0, 0x3f800000 }; - if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE) + if (a->output_format == PIPE_FORMAT_NONE + || a->input_format == PIPE_FORMAT_NONE) return FALSE; - if(input_desc->channel[0].size & 7) + if (input_desc->channel[0].size & 7) return FALSE; - if(input_desc->colorspace != output_desc->colorspace) + if (input_desc->colorspace != output_desc->colorspace) return FALSE; - for(i = 1; i < input_desc->nr_channels; ++i) - { - if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0]))) + for (i = 1; i < input_desc->nr_channels; ++i) { + if (memcmp + (&input_desc->channel[i], &input_desc->channel[0], + sizeof(input_desc->channel[0]))) return FALSE; } - for(i = 1; i < output_desc->nr_channels; ++i) - { - if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0]))) + for (i = 1; i < output_desc->nr_channels; ++i) { + if (memcmp + (&output_desc->channel[i], &output_desc->channel[0], + sizeof(output_desc->channel[0]))) { return FALSE; + } } - for(i = 0; i < output_desc->nr_channels; ++i) - { - if(output_desc->swizzle[i] < 4) + for (i = 0; i < output_desc->nr_channels; ++i) { + if (output_desc->swizzle[i] < 4) swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i]; } - if((x86_target_caps(p->func) & X86_SSE) && (0 - || a->output_format == PIPE_FORMAT_R32_FLOAT - || a->output_format == PIPE_FORMAT_R32G32_FLOAT - || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT - || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) - { + if ((x86_target_caps(p->func) & X86_SSE) && + (0 || a->output_format == PIPE_FORMAT_R32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) { struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); - for(i = 0; i < output_desc->nr_channels; ++i) - { - if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) + for (i = 0; i < output_desc->nr_channels; ++i) { + if (swizzle[i] == UTIL_FORMAT_SWIZZLE_0 + && i >= input_desc->nr_channels) swizzle[i] = i; } - for(i = 0; i < output_desc->nr_channels; ++i) - { - if(swizzle[i] < 4) + for (i = 0; i < output_desc->nr_channels; ++i) { + if (swizzle[i] < 4) needed_chans = MAX2(needed_chans, swizzle[i] + 1); - if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) + if (swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) id_swizzle = FALSE; } - if(needed_chans > 0) - { - switch(input_desc->channel[0].type) - { + if (needed_chans > 0) { + switch (input_desc->channel[0].type) { case UTIL_FORMAT_TYPE_UNSIGNED: - if(!(x86_target_caps(p->func) & X86_SSE2)) + if (!(x86_target_caps(p->func) & X86_SSE2)) return FALSE; - emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + emit_load_sse2(p, dataXMM, src, + input_desc->channel[0].size * + input_desc->nr_channels >> 3); /* TODO: add support for SSE4.1 pmovzx */ - switch(input_desc->channel[0].size) - { + switch (input_desc->channel[0].size) { case 8: - /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */ + /* TODO: this may be inefficient due to get_identity() being + * used both as a float and integer register. + */ sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); break; case 16: sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY)); break; - case 32: /* we lose precision here */ + case 32: /* we lose precision here */ sse2_psrld_imm(p->func, dataXMM, 1); break; default: return FALSE; } sse2_cvtdq2ps(p->func, dataXMM, dataXMM); - if(input_desc->channel[0].normalized) - { + if (input_desc->channel[0].normalized) { struct x86_reg factor; - switch(input_desc->channel[0].size) - { + switch (input_desc->channel[0].size) { case 8: factor = get_const(p, CONST_INV_255); break; @@ -555,17 +572,19 @@ static boolean translate_attr_convert( struct translate_sse *p, } sse_mulps(p->func, dataXMM, factor); } - else if(input_desc->channel[0].size == 32) - sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */ + else if (input_desc->channel[0].size == 32) + /* compensate for the bit we threw away to fit u32 into s32 */ + sse_addps(p->func, dataXMM, dataXMM); break; case UTIL_FORMAT_TYPE_SIGNED: - if(!(x86_target_caps(p->func) & X86_SSE2)) + if (!(x86_target_caps(p->func) & X86_SSE2)) return FALSE; - emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + emit_load_sse2(p, dataXMM, src, + input_desc->channel[0].size * + input_desc->nr_channels >> 3); /* TODO: add support for SSE4.1 pmovsx */ - switch(input_desc->channel[0].size) - { + switch (input_desc->channel[0].size) { case 8: sse2_punpcklbw(p->func, dataXMM, dataXMM); sse2_punpcklbw(p->func, dataXMM, dataXMM); @@ -575,17 +594,15 @@ static boolean translate_attr_convert( struct translate_sse *p, sse2_punpcklwd(p->func, dataXMM, dataXMM); sse2_psrad_imm(p->func, dataXMM, 16); break; - case 32: /* we lose precision here */ + case 32: /* we lose precision here */ break; default: return FALSE; } sse2_cvtdq2ps(p->func, dataXMM, dataXMM); - if(input_desc->channel[0].normalized) - { + if (input_desc->channel[0].normalized) { struct x86_reg factor; - switch(input_desc->channel[0].size) - { + switch (input_desc->channel[0].size) { case 8: factor = get_const(p, CONST_INV_127); break; @@ -609,22 +626,25 @@ static boolean translate_attr_convert( struct translate_sse *p, break; case UTIL_FORMAT_TYPE_FLOAT: - if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64) + if (input_desc->channel[0].size != 32 + && input_desc->channel[0].size != 64) { return FALSE; - if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3) - { + } + if (swizzle[3] == UTIL_FORMAT_SWIZZLE_1 + && input_desc->nr_channels <= 3) { swizzle[3] = UTIL_FORMAT_SWIZZLE_W; needed_chans = CHANNELS_0001; } - switch(input_desc->channel[0].size) - { + switch (input_desc->channel[0].size) { case 32: - emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels); + emit_load_float32(p, dataXMM, src, needed_chans, + input_desc->nr_channels); break; - case 64: /* we lose precision here */ - if(!(x86_target_caps(p->func) & X86_SSE2)) + case 64: /* we lose precision here */ + if (!(x86_target_caps(p->func) & X86_SSE2)) return FALSE; - emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels); + emit_load_float64to32(p, dataXMM, src, needed_chans, + input_desc->nr_channels); break; default: return FALSE; @@ -634,119 +654,124 @@ static boolean translate_attr_convert( struct translate_sse *p, return FALSE; } - if(!id_swizzle) - sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) ); + if (!id_swizzle) { + sse_shufps(p->func, dataXMM, dataXMM, + SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3])); + } } - if(output_desc->nr_channels >= 4 - && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 - && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 - && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 - && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 - ) + if (output_desc->nr_channels >= 4 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) { sse_movups(p->func, dst, dataXMM); - else - { - if(output_desc->nr_channels >= 2 - && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 - && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + } + else { + if (output_desc->nr_channels >= 2 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) { sse_movlps(p->func, dst, dataXMM); - else - { - if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) + } + else { + if (swizzle[0] < UTIL_FORMAT_SWIZZLE_0) { sse_movss(p->func, dst, dataXMM); - else - x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + } + else { + x86_mov_imm(p->func, dst, + imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + } - if(output_desc->nr_channels >= 2) - { - if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0) - { + if (output_desc->nr_channels >= 2) { + if (swizzle[1] < UTIL_FORMAT_SWIZZLE_0) { sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3)); sse_movss(p->func, x86_make_disp(dst, 4), dataXMM); } - else - x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); + else { + x86_mov_imm(p->func, x86_make_disp(dst, 4), + imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); + } } } - if(output_desc->nr_channels >= 3) - { - if(output_desc->nr_channels >= 4 - && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 - && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + if (output_desc->nr_channels >= 3) { + if (output_desc->nr_channels >= 4 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) { sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM); - else - { - if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) - { + } + else { + if (swizzle[2] < UTIL_FORMAT_SWIZZLE_0) { sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3)); sse_movss(p->func, x86_make_disp(dst, 8), dataXMM); } - else - x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + else { + x86_mov_imm(p->func, x86_make_disp(dst, 8), + imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + } - if(output_desc->nr_channels >= 4) - { - if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0) - { + if (output_desc->nr_channels >= 4) { + if (swizzle[3] < UTIL_FORMAT_SWIZZLE_0) { sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3)); sse_movss(p->func, x86_make_disp(dst, 12), dataXMM); } - else - x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); + else { + x86_mov_imm(p->func, x86_make_disp(dst, 12), + imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); + } } } } } return TRUE; } - else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16 - && output_desc->channel[0].normalized == input_desc->channel[0].normalized - && (0 - || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) - || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) - || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) - )) - { + else if ((x86_target_caps(p->func) & X86_SSE2) + && input_desc->channel[0].size == 8 + && output_desc->channel[0].size == 16 + && output_desc->channel[0].normalized == + input_desc->channel[0].normalized && + (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED + && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED + && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED + && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) { struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); struct x86_reg tmp = p->tmp_EAX; - unsigned imms[2] = {0, 1}; + unsigned imms[2] = { 0, 1 }; - for(i = 0; i < output_desc->nr_channels; ++i) - { - if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) + for (i = 0; i < output_desc->nr_channels; ++i) { + if (swizzle[i] == UTIL_FORMAT_SWIZZLE_0 + && i >= input_desc->nr_channels) { swizzle[i] = i; + } } - for(i = 0; i < output_desc->nr_channels; ++i) - { - if(swizzle[i] < 4) + for (i = 0; i < output_desc->nr_channels; ++i) { + if (swizzle[i] < 4) needed_chans = MAX2(needed_chans, swizzle[i] + 1); - if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) + if (swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) id_swizzle = FALSE; } - if(needed_chans > 0) - { - emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + if (needed_chans > 0) { + emit_load_sse2(p, dataXMM, src, + input_desc->channel[0].size * + input_desc->nr_channels >> 3); - switch(input_desc->channel[0].type) - { + switch (input_desc->channel[0].type) { case UTIL_FORMAT_TYPE_UNSIGNED: - if(input_desc->channel[0].normalized) - { + if (input_desc->channel[0].normalized) { sse2_punpcklbw(p->func, dataXMM, dataXMM); - if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) - sse2_psrlw_imm(p->func, dataXMM, 1); + if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + sse2_psrlw_imm(p->func, dataXMM, 1); } else sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); break; case UTIL_FORMAT_TYPE_SIGNED: - if(input_desc->channel[0].normalized) - { + if (input_desc->channel[0].normalized) { sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY)); sse2_punpcklbw(p->func, tmpXMM, dataXMM); sse2_psllw_imm(p->func, dataXMM, 9); @@ -760,8 +785,7 @@ static boolean translate_attr_convert( struct translate_sse *p, tmpXMM = t; } } - else - { + else { sse2_punpcklbw(p->func, dataXMM, dataXMM); sse2_psraw_imm(p->func, dataXMM, 8); } @@ -770,43 +794,49 @@ static boolean translate_attr_convert( struct translate_sse *p, assert(0); } - if(output_desc->channel[0].normalized) - imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff; + if (output_desc->channel[0].normalized) + imms[1] = + (output_desc->channel[0].type == + UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff; - if(!id_swizzle) - sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6)); + if (!id_swizzle) + sse2_pshuflw(p->func, dataXMM, dataXMM, + (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | + ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6)); } - if(output_desc->nr_channels >= 4 - && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 - && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 - && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 - && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 - ) + if (output_desc->nr_channels >= 4 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) { sse2_movq(p->func, dst, dataXMM); - else - { - if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) - { - if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + } + else { + if (swizzle[0] < UTIL_FORMAT_SWIZZLE_0) { + if (output_desc->nr_channels >= 2 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) { sse2_movd(p->func, dst, dataXMM); - else - { + } + else { sse2_movd(p->func, tmp, dataXMM); x86_mov16(p->func, dst, tmp); - if(output_desc->nr_channels >= 2) - x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); + if (output_desc->nr_channels >= 2) + x86_mov16_imm(p->func, x86_make_disp(dst, 2), + imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); } } - else - { - if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0) - x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); - else - { - x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); - if(output_desc->nr_channels >= 2) - { + else { + if (output_desc->nr_channels >= 2 + && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0) { + x86_mov_imm(p->func, dst, + (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | + imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + } + else { + x86_mov16_imm(p->func, dst, + imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + if (output_desc->nr_channels >= 2) { sse2_movd(p->func, tmp, dataXMM); x86_shr_imm(p->func, tmp, 16); x86_mov16(p->func, x86_make_disp(dst, 2), tmp); @@ -814,36 +844,35 @@ static boolean translate_attr_convert( struct translate_sse *p, } } - if(output_desc->nr_channels >= 3) - { - if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) - { - if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) - { + if (output_desc->nr_channels >= 3) { + if (swizzle[2] < UTIL_FORMAT_SWIZZLE_0) { + if (output_desc->nr_channels >= 4 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) { sse2_psrlq_imm(p->func, dataXMM, 32); sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM); } - else - { + else { sse2_psrlq_imm(p->func, dataXMM, 32); sse2_movd(p->func, tmp, dataXMM); x86_mov16(p->func, x86_make_disp(dst, 4), tmp); - if(output_desc->nr_channels >= 4) - { - x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); + if (output_desc->nr_channels >= 4) { + x86_mov16_imm(p->func, x86_make_disp(dst, 6), + imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); } } } - else - { - if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0) - x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); - else - { - x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + else { + if (output_desc->nr_channels >= 4 + && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0) { + x86_mov_imm(p->func, x86_make_disp(dst, 4), + (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) + | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + } + else { + x86_mov16_imm(p->func, x86_make_disp(dst, 4), + imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); - if(output_desc->nr_channels >= 4) - { + if (output_desc->nr_channels >= 4) { sse2_psrlq_imm(p->func, dataXMM, 48); sse2_movd(p->func, tmp, dataXMM); x86_mov16(p->func, x86_make_disp(dst, 6), tmp); @@ -854,16 +883,17 @@ static boolean translate_attr_convert( struct translate_sse *p, } return TRUE; } - else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0]))) - { + else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0], + sizeof(output_desc->channel[0]))) { struct x86_reg tmp = p->tmp_EAX; unsigned i; - if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4 - && swizzle[0] == UTIL_FORMAT_SWIZZLE_W - && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z - && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y - && swizzle[3] == UTIL_FORMAT_SWIZZLE_X) - { + + if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 + && output_desc->nr_channels == 4 + && swizzle[0] == UTIL_FORMAT_SWIZZLE_W + && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z + && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y + && swizzle[3] == UTIL_FORMAT_SWIZZLE_X) { /* TODO: support movbe */ x86_mov(p->func, tmp, src); x86_bswap(p->func, tmp); @@ -871,18 +901,13 @@ static boolean translate_attr_convert( struct translate_sse *p, return TRUE; } - for(i = 0; i < output_desc->nr_channels; ++i) - { - switch(output_desc->channel[0].size) - { + for (i = 0; i < output_desc->nr_channels; ++i) { + switch (output_desc->channel[0].size) { case 8: - if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) - { + if (swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) { unsigned v = 0; - if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) - { - switch(output_desc->channel[0].type) - { + if (swizzle[i] == UTIL_FORMAT_SWIZZLE_1) { + switch (output_desc->channel[0].type) { case UTIL_FORMAT_TYPE_UNSIGNED: v = output_desc->channel[0].normalized ? 0xff : 1; break; @@ -895,20 +920,16 @@ static boolean translate_attr_convert( struct translate_sse *p, } x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v); } - else - { + else { x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1)); x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp); } break; case 16: - if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) - { + if (swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) { unsigned v = 0; - if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) - { - switch(output_desc->channel[1].type) - { + if (swizzle[i] == UTIL_FORMAT_SWIZZLE_1) { + switch (output_desc->channel[1].type) { case UTIL_FORMAT_TYPE_UNSIGNED: v = output_desc->channel[1].normalized ? 0xffff : 1; break; @@ -924,22 +945,19 @@ static boolean translate_attr_convert( struct translate_sse *p, } x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v); } - else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0) + else if (swizzle[i] == UTIL_FORMAT_SWIZZLE_0) { x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0); - else - { + } + else { x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2)); x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp); } break; case 32: - if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) - { + if (swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) { unsigned v = 0; - if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) - { - switch(output_desc->channel[1].type) - { + if (swizzle[i] == UTIL_FORMAT_SWIZZLE_1) { + switch (output_desc->channel[1].type) { case UTIL_FORMAT_TYPE_UNSIGNED: v = output_desc->channel[1].normalized ? 0xffffffff : 1; break; @@ -955,21 +973,17 @@ static boolean translate_attr_convert( struct translate_sse *p, } x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v); } - else - { + else { x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4)); x86_mov(p->func, x86_make_disp(dst, i * 4), tmp); } break; case 64: - if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) - { + if (swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) { unsigned l = 0; unsigned h = 0; - if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) - { - switch(output_desc->channel[1].type) - { + if (swizzle[i] == UTIL_FORMAT_SWIZZLE_1) { + switch (output_desc->channel[1].type) { case UTIL_FORMAT_TYPE_UNSIGNED: h = output_desc->channel[1].normalized ? 0xffffffff : 0; l = output_desc->channel[1].normalized ? 0xffffffff : 1; @@ -989,19 +1003,18 @@ static boolean translate_attr_convert( struct translate_sse *p, x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l); x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h); } - else - { - if(x86_target_caps(p->func) & X86_SSE) - { + else { + if (x86_target_caps(p->func) & X86_SSE) { struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0); - emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8)); + emit_load64(p, tmp, tmpXMM, + x86_make_disp(src, swizzle[i] * 8)); emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM); } - else - { + else { x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8)); x86_mov(p->func, x86_make_disp(dst, i * 8), tmp); - x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4)); + x86_mov(p->func, tmp, + x86_make_disp(src, swizzle[i] * 8 + 4)); x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp); } } @@ -1013,19 +1026,18 @@ static boolean translate_attr_convert( struct translate_sse *p, return TRUE; } /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */ - else if((x86_target_caps(p->func) & X86_SSE2) && - a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0 - || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM - || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM - )) - { + else if ((x86_target_caps(p->func) & X86_SSE2) && + a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && + (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM + || a-> output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) { struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); /* load */ sse_movups(p->func, dataXMM, src); - if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) - sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3)); + if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3)); + } /* scale by 255.0 */ sse_mulps(p->func, dataXMM, get_const(p, CONST_255)); @@ -1042,13 +1054,13 @@ static boolean translate_attr_convert( struct translate_sse *p, return FALSE; } -static boolean translate_attr( struct translate_sse *p, - const struct translate_element *a, - struct x86_reg src, - struct x86_reg dst) + +static boolean +translate_attr(struct translate_sse *p, + const struct translate_element *a, + struct x86_reg src, struct x86_reg dst) { - if(a->input_format == a->output_format) - { + if (a->input_format == a->output_format) { emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1)); return TRUE; } @@ -1056,28 +1068,29 @@ static boolean translate_attr( struct translate_sse *p, return translate_attr_convert(p, a, src, dst); } -static boolean init_inputs( struct translate_sse *p, - unsigned index_size ) + +static boolean +init_inputs(struct translate_sse *p, unsigned index_size) { unsigned i; - struct x86_reg instance_id = x86_make_disp(p->machine_EDI, - get_offset(p, &p->instance_id)); - struct x86_reg start_instance = x86_make_disp(p->machine_EDI, - get_offset(p, &p->start_instance)); + struct x86_reg instance_id = + x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); + struct x86_reg start_instance = + x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance)); for (i = 0; i < p->nr_buffer_variants; i++) { struct translate_buffer_variant *variant = &p->buffer_variant[i]; struct translate_buffer *buffer = &p->buffer[variant->buffer_index]; if (!index_size || variant->instance_divisor) { - struct x86_reg buf_max_index = x86_make_disp(p->machine_EDI, - get_offset(p, &buffer->max_index)); - struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, - get_offset(p, &buffer->stride)); - struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, - get_offset(p, &variant->ptr)); - struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI, - get_offset(p, &buffer->base_ptr)); + struct x86_reg buf_max_index = + x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index)); + struct x86_reg buf_stride = + x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride)); + struct x86_reg buf_ptr = + x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr)); + struct x86_reg buf_base_ptr = + x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr)); struct x86_reg elt = p->idx_ESI; struct x86_reg tmp_EAX = p->tmp_EAX; @@ -1099,7 +1112,7 @@ static boolean init_inputs( struct translate_sse *p, */ x86_xor(p->func, tmp_EDX, tmp_EDX); x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor); - x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ + x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ /* instance = (instance_id - start_instance) / divisor + * start_instance @@ -1112,7 +1125,8 @@ static boolean init_inputs( struct translate_sse *p, * per-array max value, not the draw->pt.max_index value * that's being given to us via translate->set_buffer(). */ - } else { + } + else { x86_mov(p->func, tmp_EAX, elt); /* Clamp to max_index @@ -1132,13 +1146,11 @@ static boolean init_inputs( struct translate_sse *p, /* In the linear case, keep the buffer pointer instead of the * index number. */ - if (!index_size && p->nr_buffer_variants == 1) - { + if (!index_size && p->nr_buffer_variants == 1) { x64_rexw(p->func); x86_mov(p->func, elt, tmp_EAX); } - else - { + else { x64_rexw(p->func); x86_mov(p->func, buf_ptr, tmp_EAX); } @@ -1149,50 +1161,43 @@ static boolean init_inputs( struct translate_sse *p, } -static struct x86_reg get_buffer_ptr( struct translate_sse *p, - unsigned index_size, - unsigned var_idx, - struct x86_reg elt ) +static struct x86_reg +get_buffer_ptr(struct translate_sse *p, + unsigned index_size, unsigned var_idx, struct x86_reg elt) { if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) { - return x86_make_disp(p->machine_EDI, - get_offset(p, &p->instance_id)); + return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); } if (!index_size && p->nr_buffer_variants == 1) { return p->idx_ESI; } else if (!index_size || p->buffer_variant[var_idx].instance_divisor) { struct x86_reg ptr = p->src_ECX; - struct x86_reg buf_ptr = + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer_variant[var_idx].ptr)); - + x64_rexw(p->func); x86_mov(p->func, ptr, buf_ptr); return ptr; } else { struct x86_reg ptr = p->src_ECX; - const struct translate_buffer_variant *variant = &p->buffer_variant[var_idx]; - - struct x86_reg buf_stride = + const struct translate_buffer_variant *variant = + &p->buffer_variant[var_idx]; + struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[variant->buffer_index].stride)); - - struct x86_reg buf_base_ptr = + struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI, - get_offset(p, &p->buffer[variant->buffer_index].base_ptr)); - + get_offset(p, &p->buffer[variant->buffer_index].base_ptr)); struct x86_reg buf_max_index = x86_make_disp(p->machine_EDI, - get_offset(p, &p->buffer[variant->buffer_index].max_index)); - - + get_offset(p, &p->buffer[variant->buffer_index].max_index)); /* Calculate pointer to current attrib: */ - switch(index_size) - { + switch (index_size) { case 1: x86_movzx8(p->func, ptr, elt); break; @@ -1219,13 +1224,12 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p, } - -static boolean incr_inputs( struct translate_sse *p, - unsigned index_size ) +static boolean +incr_inputs(struct translate_sse *p, unsigned index_size) { if (!index_size && p->nr_buffer_variants == 1) { - struct x86_reg stride = x86_make_disp(p->machine_EDI, - get_offset(p, &p->buffer[0].stride)); + struct x86_reg stride = + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[0].stride)); if (p->buffer_variant[0].instance_divisor == 0) { x64_rexw(p->func); @@ -1242,24 +1246,26 @@ static boolean incr_inputs( struct translate_sse *p, struct translate_buffer_variant *variant = &p->buffer_variant[i]; struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr)); - struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, - get_offset(p, &p->buffer[variant->buffer_index].stride)); + struct x86_reg buf_stride = + x86_make_disp(p->machine_EDI, + get_offset(p, &p->buffer[variant->buffer_index].stride)); if (variant->instance_divisor == 0) { x86_mov(p->func, p->tmp_EAX, buf_stride); x64_rexw(p->func); x86_add(p->func, p->tmp_EAX, buf_ptr); - if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192)); + if (i == 0) + sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192)); x64_rexw(p->func); x86_mov(p->func, buf_ptr, p->tmp_EAX); } } - } + } else { x64_rexw(p->func); x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size)); } - + return TRUE; } @@ -1280,9 +1286,9 @@ static boolean incr_inputs( struct translate_sse *p, * ECX -- pointer to current attribute * */ -static boolean build_vertex_emit( struct translate_sse *p, - struct x86_function *func, - unsigned index_size ) +static boolean +build_vertex_emit(struct translate_sse *p, + struct x86_function *func, unsigned index_size) { int fixup, label; unsigned j; @@ -1290,66 +1296,63 @@ static boolean build_vertex_emit( struct translate_sse *p, memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const)); memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg)); - p->tmp_EAX = x86_make_reg(file_REG32, reg_AX); - p->idx_ESI = x86_make_reg(file_REG32, reg_SI); - p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX); - p->machine_EDI = x86_make_reg(file_REG32, reg_DI); - p->count_EBP = x86_make_reg(file_REG32, reg_BP); - p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX); - p->src_ECX = x86_make_reg(file_REG32, reg_CX); + p->tmp_EAX = x86_make_reg(file_REG32, reg_AX); + p->idx_ESI = x86_make_reg(file_REG32, reg_SI); + p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX); + p->machine_EDI = x86_make_reg(file_REG32, reg_DI); + p->count_EBP = x86_make_reg(file_REG32, reg_BP); + p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX); + p->src_ECX = x86_make_reg(file_REG32, reg_CX); p->func = func; x86_init_func(p->func); - if(x86_target(p->func) == X86_64_WIN64_ABI) - { - /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */ - sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6)); - sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7)); + if (x86_target(p->func) == X86_64_WIN64_ABI) { + /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" + * above the return address + */ + sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), + x86_make_reg(file_XMM, 6)); + sse2_movdqa(p->func, + x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), + x86_make_reg(file_XMM, 7)); } x86_push(p->func, p->outbuf_EBX); x86_push(p->func, p->count_EBP); /* on non-Win64 x86-64, these are already in the right registers */ - if(x86_target(p->func) != X86_64_STD_ABI) - { + if (x86_target(p->func) != X86_64_STD_ABI) { x86_push(p->func, p->machine_EDI); x86_push(p->func, p->idx_ESI); - if(x86_target(p->func) != X86_32) - { - x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); - x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); + if (x86_target(p->func) != X86_32) { + x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); + x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); } - else - { - x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); - x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); + else { + x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); + x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); } } x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3)); - if(x86_target(p->func) != X86_32) + if (x86_target(p->func) != X86_32) x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6)); else x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6)); /* Load instance ID. */ - if (p->use_instancing) { - x86_mov(p->func, - p->tmp2_EDX, - x86_fn_arg(p->func, 4)); + if (p->use_instancing) { + x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4)); x86_mov(p->func, - x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance)), - p->tmp2_EDX); + x86_make_disp(p->machine_EDI, + get_offset(p, &p->start_instance)), p->tmp2_EDX); - x86_mov(p->func, - p->tmp_EAX, - x86_fn_arg(p->func, 5)); + x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5)); x86_mov(p->func, x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)), p->tmp_EAX); @@ -1383,24 +1386,22 @@ static boolean build_vertex_emit( struct translate_sse *p, last_variant = variant; vb = get_buffer_ptr(p, index_size, variant, elt); } - - if (!translate_attr( p, a, - x86_make_disp(vb, a->input_offset), - x86_make_disp(p->outbuf_EBX, a->output_offset))) + + if (!translate_attr(p, a, + x86_make_disp(vb, a->input_offset), + x86_make_disp(p->outbuf_EBX, a->output_offset))) return FALSE; } /* Next output vertex: */ x64_rexw(p->func); - x86_lea(p->func, - p->outbuf_EBX, - x86_make_disp(p->outbuf_EBX, - p->translate.key.output_stride)); + x86_lea(p->func, p->outbuf_EBX, + x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride)); /* Incr index - */ - incr_inputs( p, index_size ); + */ + incr_inputs(p, index_size); } /* decr count, loop if not zero @@ -1419,9 +1420,7 @@ static boolean build_vertex_emit( struct translate_sse *p, /* Pop regs and return */ - - if(x86_target(p->func) != X86_64_STD_ABI) - { + if (x86_target(p->func) != X86_64_STD_ABI) { x86_pop(p->func, p->idx_ESI); x86_pop(p->func, p->machine_EDI); } @@ -1429,10 +1428,11 @@ static boolean build_vertex_emit( struct translate_sse *p, x86_pop(p->func, p->count_EBP); x86_pop(p->func, p->outbuf_EBX); - if(x86_target(p->func) == X86_64_WIN64_ABI) - { - sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8)); - sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24)); + if (x86_target(p->func) == X86_64_WIN64_ABI) { + sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), + x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8)); + sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), + x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24)); } x86_ret(p->func); @@ -1440,46 +1440,41 @@ static boolean build_vertex_emit( struct translate_sse *p, } - - - - - -static void translate_sse_set_buffer( struct translate *translate, - unsigned buf, - const void *ptr, - unsigned stride, - unsigned max_index ) +static void +translate_sse_set_buffer(struct translate *translate, + unsigned buf, + const void *ptr, unsigned stride, unsigned max_index) { - struct translate_sse *p = (struct translate_sse *)translate; + struct translate_sse *p = (struct translate_sse *) translate; if (buf < p->nr_buffers) { - p->buffer[buf].base_ptr = (char *)ptr; + p->buffer[buf].base_ptr = (char *) ptr; p->buffer[buf].stride = stride; p->buffer[buf].max_index = max_index; } - if (0) debug_printf("%s %d/%d: %p %d\n", - __FUNCTION__, buf, - p->nr_buffers, - ptr, stride); + if (0) + debug_printf("%s %d/%d: %p %d\n", + __FUNCTION__, buf, p->nr_buffers, ptr, stride); } -static void translate_sse_release( struct translate *translate ) +static void +translate_sse_release(struct translate *translate) { - struct translate_sse *p = (struct translate_sse *)translate; + struct translate_sse *p = (struct translate_sse *) translate; - x86_release_func( &p->elt8_func ); - x86_release_func( &p->elt16_func ); - x86_release_func( &p->elt_func ); - x86_release_func( &p->linear_func ); + x86_release_func(&p->elt8_func); + x86_release_func(&p->elt16_func); + x86_release_func(&p->elt_func); + x86_release_func(&p->linear_func); os_free_aligned(p); } -struct translate *translate_sse2_create( const struct translate_key *key ) +struct translate * +translate_sse2_create(const struct translate_key *key) { struct translate_sse *p = NULL; unsigned i; @@ -1489,8 +1484,9 @@ struct translate *translate_sse2_create( const struct translate_key *key ) goto fail; p = os_malloc_aligned(sizeof(struct translate_sse), 16); - if (p == NULL) + if (p == NULL) goto fail; + memset(p, 0, sizeof(*p)); memcpy(p->consts, consts, sizeof(consts)); @@ -1502,7 +1498,8 @@ struct translate *translate_sse2_create( const struct translate_key *key ) if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) { unsigned j; - p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1); + p->nr_buffers = + MAX2(p->nr_buffers, key->element[i].input_buffer + 1); if (key->element[i].instance_divisor) { p->use_instancing = TRUE; @@ -1512,25 +1509,30 @@ struct translate *translate_sse2_create( const struct translate_key *key ) * Map vertex element to vertex buffer variant. */ for (j = 0; j < p->nr_buffer_variants; j++) { - if (p->buffer_variant[j].buffer_index == key->element[i].input_buffer && - p->buffer_variant[j].instance_divisor == key->element[i].instance_divisor) { + if (p->buffer_variant[j].buffer_index == + key->element[i].input_buffer + && p->buffer_variant[j].instance_divisor == + key->element[i].instance_divisor) { break; } } if (j == p->nr_buffer_variants) { p->buffer_variant[j].buffer_index = key->element[i].input_buffer; - p->buffer_variant[j].instance_divisor = key->element[i].instance_divisor; + p->buffer_variant[j].instance_divisor = + key->element[i].instance_divisor; p->nr_buffer_variants++; } p->element_to_buffer_variant[i] = j; - } else { + } + else { assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID); p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID; } } - if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers); + if (0) + debug_printf("nr_buffers: %d\n", p->nr_buffers); if (!build_vertex_emit(p, &p->linear_func, 0)) goto fail; @@ -1564,16 +1566,16 @@ struct translate *translate_sse2_create( const struct translate_key *key ) fail: if (p) - translate_sse_release( &p->translate ); + translate_sse_release(&p->translate); return NULL; } - #else -struct translate *translate_sse2_create( const struct translate_key *key ) +struct translate * +translate_sse2_create(const struct translate_key *key) { return NULL; } |