summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/gallium/auxiliary/translate/translate_sse.c946
1 files changed, 474 insertions, 472 deletions
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index a72454a808a..b6bc22227d6 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -47,16 +47,18 @@
#define W 3
-struct translate_buffer {
+struct translate_buffer
+{
const void *base_ptr;
uintptr_t stride;
unsigned max_index;
};
-struct translate_buffer_variant {
+struct translate_buffer_variant
+{
unsigned buffer_index;
unsigned instance_divisor;
- void *ptr; /* updated either per vertex or per instance */
+ void *ptr; /* updated either per vertex or per instance */
};
@@ -77,17 +79,19 @@ enum
#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
static float consts[NUM_CONSTS][4] = {
- {0, 0, 0, 1},
- C(1.0 / 127.0),
- C(1.0 / 255.0),
- C(1.0 / 32767.0),
- C(1.0 / 65535.0),
- C(1.0 / 2147483647.0),
- C(255.0)
+ {0, 0, 0, 1},
+ C(1.0 / 127.0),
+ C(1.0 / 255.0),
+ C(1.0 / 32767.0),
+ C(1.0 / 65535.0),
+ C(1.0 / 2147483647.0),
+ C(255.0)
};
+
#undef C
-struct translate_sse {
+struct translate_sse
+{
struct translate translate;
struct x86_function linear_func;
@@ -96,7 +100,7 @@ struct translate_sse {
struct x86_function elt8_func;
struct x86_function *func;
- PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
+ PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
int8_t reg_to_const[16];
int8_t const_to_reg[NUM_CONSTS];
@@ -120,38 +124,41 @@ struct translate_sse {
struct x86_reg tmp_EAX;
struct x86_reg tmp2_EDX;
struct x86_reg src_ECX;
- struct x86_reg idx_ESI; /* either start+i or &elt[i] */
+ struct x86_reg idx_ESI; /* either start+i or &elt[i] */
struct x86_reg machine_EDI;
struct x86_reg outbuf_EBX;
struct x86_reg count_EBP; /* decrements to zero */
};
-static int get_offset( const void *a, const void *b )
+
+static int
+get_offset(const void *a, const void *b)
{
- return (const char *)b - (const char *)a;
+ return (const char *) b - (const char *) a;
}
-static struct x86_reg get_const( struct translate_sse *p, unsigned id)
+
+static struct x86_reg
+get_const(struct translate_sse *p, unsigned id)
{
struct x86_reg reg;
unsigned i;
- if(p->const_to_reg[id] >= 0)
+ if (p->const_to_reg[id] >= 0)
return x86_make_reg(file_XMM, p->const_to_reg[id]);
- for(i = 2; i < 8; ++i)
- {
- if(p->reg_to_const[i] < 0)
+ for (i = 2; i < 8; ++i) {
+ if (p->reg_to_const[i] < 0)
break;
}
/* TODO: be smarter here */
- if(i == 8)
+ if (i == 8)
--i;
reg = x86_make_reg(file_XMM, i);
- if(p->reg_to_const[i] >= 0)
+ if (p->reg_to_const[i] >= 0)
p->const_to_reg[p->reg_to_const[i]] = -1;
p->reg_to_const[i] = id;
@@ -159,22 +166,21 @@ static struct x86_reg get_const( struct translate_sse *p, unsigned id)
/* TODO: this should happen outside the loop, if possible */
sse_movaps(p->func, reg,
- x86_make_disp(p->machine_EDI,
- get_offset(p, &p->consts[id][0])));
+ x86_make_disp(p->machine_EDI,
+ get_offset(p, &p->consts[id][0])));
return reg;
}
+
/* load the data in a SSE2 register, padding with zeros */
-static boolean emit_load_sse2( struct translate_sse *p,
- struct x86_reg data,
- struct x86_reg src,
- unsigned size)
+static boolean
+emit_load_sse2(struct translate_sse *p,
+ struct x86_reg data, struct x86_reg src, unsigned size)
{
struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
struct x86_reg tmp = p->tmp_EAX;
- switch(size)
- {
+ switch (size) {
case 1:
x86_movzx8(p->func, tmp, src);
sse2_movd(p->func, data, tmp);
@@ -215,9 +221,11 @@ static boolean emit_load_sse2( struct translate_sse *p,
return TRUE;
}
+
/* this value can be passed for the out_chans argument */
#define CHANNELS_0001 5
+
/* this function will load #chans float values, and will
* pad the register with zeroes at least up to out_chans.
*
@@ -225,30 +233,28 @@ static boolean emit_load_sse2( struct translate_sse *p,
* value will be padded with 1. Only pass this value if
* chans < 4 or results are undefined.
*/
-static void emit_load_float32( struct translate_sse *p,
- struct x86_reg data,
- struct x86_reg arg0,
- unsigned out_chans,
- unsigned chans)
+static void
+emit_load_float32(struct translate_sse *p, struct x86_reg data,
+ struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
- switch(chans)
- {
+ switch (chans) {
case 1:
/* a 0 0 0
* a 0 0 1
*/
sse_movss(p->func, data, arg0);
- if(out_chans == CHANNELS_0001)
- sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
+ if (out_chans == CHANNELS_0001)
+ sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
break;
case 2:
/* 0 0 0 1
* a b 0 1
*/
- if(out_chans == CHANNELS_0001)
- sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
- else if(out_chans > 2)
- sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
+ if (out_chans == CHANNELS_0001)
+ sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
+ SHUF(X, Y, Z, W));
+ else if (out_chans > 2)
+ sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
sse_movlps(p->func, data, arg0);
break;
case 3:
@@ -260,9 +266,10 @@ static void emit_load_float32( struct translate_sse *p,
* a b c 0/1
*/
sse_movss(p->func, data, x86_make_disp(arg0, 8));
- if(out_chans == CHANNELS_0001)
- sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) );
- sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
+ if (out_chans == CHANNELS_0001)
+ sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
+ SHUF(X, Y, Z, W));
+ sse_shufps(p->func, data, data, SHUF(Y, Z, X, W));
sse_movlps(p->func, data, arg0);
break;
case 4:
@@ -274,43 +281,42 @@ static void emit_load_float32( struct translate_sse *p,
/* this function behaves like emit_load_float32, but loads
64-bit floating point numbers, converting them to 32-bit
ones */
-static void emit_load_float64to32( struct translate_sse *p,
- struct x86_reg data,
- struct x86_reg arg0,
- unsigned out_chans,
- unsigned chans)
+static void
+emit_load_float64to32(struct translate_sse *p, struct x86_reg data,
+ struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
- switch(chans)
- {
+ switch (chans) {
case 1:
sse2_movsd(p->func, data, arg0);
- if(out_chans > 1)
+ if (out_chans > 1)
sse2_cvtpd2ps(p->func, data, data);
else
sse2_cvtsd2ss(p->func, data, data);
- if(out_chans == CHANNELS_0001)
- sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
+ if (out_chans == CHANNELS_0001)
+ sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
+ SHUF(X, Y, Z, W));
break;
case 2:
sse2_movupd(p->func, data, arg0);
sse2_cvtpd2ps(p->func, data, data);
- if(out_chans == CHANNELS_0001)
- sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
- else if(out_chans > 2)
- sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
- break;
+ if (out_chans == CHANNELS_0001)
+ sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
+ SHUF(X, Y, Z, W));
+ else if (out_chans > 2)
+ sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
+ break;
case 3:
sse2_movupd(p->func, data, arg0);
sse2_cvtpd2ps(p->func, data, data);
sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
- if(out_chans > 3)
+ if (out_chans > 3)
sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
else
sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
sse_movlhps(p->func, data, tmpXMM);
- if(out_chans == CHANNELS_0001)
- sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
+ if (out_chans == CHANNELS_0001)
+ sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
break;
case 4:
sse2_movupd(p->func, data, arg0);
@@ -322,53 +328,65 @@ static void emit_load_float64to32( struct translate_sse *p,
}
}
-static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm)
+
+static void
+emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr,
+ struct x86_reg dst_xmm, struct x86_reg src_gpr,
+ struct x86_reg src_xmm)
{
- if(x86_target(p->func) != X86_32)
+ if (x86_target(p->func) != X86_32)
x64_mov64(p->func, dst_gpr, src_gpr);
- else
- {
+ else {
/* TODO: when/on which CPUs is SSE2 actually better than SSE? */
- if(x86_target_caps(p->func) & X86_SSE2)
+ if (x86_target_caps(p->func) & X86_SSE2)
sse2_movq(p->func, dst_xmm, src_xmm);
else
sse_movlps(p->func, dst_xmm, src_xmm);
}
}
-static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
+
+static void
+emit_load64(struct translate_sse *p, struct x86_reg dst_gpr,
+ struct x86_reg dst_xmm, struct x86_reg src)
{
emit_mov64(p, dst_gpr, dst_xmm, src, src);
}
-static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
+
+static void
+emit_store64(struct translate_sse *p, struct x86_reg dst,
+ struct x86_reg src_gpr, struct x86_reg src_xmm)
{
emit_mov64(p, dst, dst, src_gpr, src_xmm);
}
-static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
+
+static void
+emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
{
- if(x86_target_caps(p->func) & X86_SSE2)
+ if (x86_target_caps(p->func) & X86_SSE2)
sse2_movdqu(p->func, dst, src);
else
sse_movups(p->func, dst, src);
}
+
/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
* but may or may not be good on older processors
* TODO: may perhaps want to use non-temporal stores here if possible
*/
-static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
+static void
+emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src,
+ unsigned size)
{
struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
struct x86_reg dataGPR = p->tmp_EAX;
struct x86_reg dataGPR2 = p->tmp2_EDX;
- if(size < 8)
- {
- switch (size)
- {
+ if (size < 8) {
+ switch (size) {
case 1:
x86_mov8(p->func, dataGPR, src);
x86_mov8(p->func, dst, dataGPR);
@@ -395,20 +413,16 @@ static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_
break;
}
}
- else if(!(x86_target_caps(p->func) & X86_SSE))
- {
+ else if (!(x86_target_caps(p->func) & X86_SSE)) {
unsigned i = 0;
assert((size & 3) == 0);
- for(i = 0; i < size; i += 4)
- {
+ for (i = 0; i < size; i += 4) {
x86_mov(p->func, dataGPR, x86_make_disp(src, i));
x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
}
}
- else
- {
- switch(size)
- {
+ else {
+ switch (size) {
case 8:
emit_load64(p, dataGPR, dataXMM, src);
emit_store64(p, dst, dataGPR, dataXMM);
@@ -441,101 +455,104 @@ static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_
}
}
-static boolean translate_attr_convert( struct translate_sse *p,
- const struct translate_element *a,
- struct x86_reg src,
- struct x86_reg dst)
-
+static boolean
+translate_attr_convert(struct translate_sse *p,
+ const struct translate_element *a,
+ struct x86_reg src, struct x86_reg dst)
{
- const struct util_format_description* input_desc = util_format_description(a->input_format);
- const struct util_format_description* output_desc = util_format_description(a->output_format);
+ const struct util_format_description *input_desc =
+ util_format_description(a->input_format);
+ const struct util_format_description *output_desc =
+ util_format_description(a->output_format);
unsigned i;
boolean id_swizzle = TRUE;
- unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
+ unsigned swizzle[4] =
+ { UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE,
+ UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE };
unsigned needed_chans = 0;
- unsigned imms[2] = {0, 0x3f800000};
+ unsigned imms[2] = { 0, 0x3f800000 };
- if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
+ if (a->output_format == PIPE_FORMAT_NONE
+ || a->input_format == PIPE_FORMAT_NONE)
return FALSE;
- if(input_desc->channel[0].size & 7)
+ if (input_desc->channel[0].size & 7)
return FALSE;
- if(input_desc->colorspace != output_desc->colorspace)
+ if (input_desc->colorspace != output_desc->colorspace)
return FALSE;
- for(i = 1; i < input_desc->nr_channels; ++i)
- {
- if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
+ for (i = 1; i < input_desc->nr_channels; ++i) {
+ if (memcmp
+ (&input_desc->channel[i], &input_desc->channel[0],
+ sizeof(input_desc->channel[0])))
return FALSE;
}
- for(i = 1; i < output_desc->nr_channels; ++i)
- {
- if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
+ for (i = 1; i < output_desc->nr_channels; ++i) {
+ if (memcmp
+ (&output_desc->channel[i], &output_desc->channel[0],
+ sizeof(output_desc->channel[0]))) {
return FALSE;
+ }
}
- for(i = 0; i < output_desc->nr_channels; ++i)
- {
- if(output_desc->swizzle[i] < 4)
+ for (i = 0; i < output_desc->nr_channels; ++i) {
+ if (output_desc->swizzle[i] < 4)
swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
}
- if((x86_target_caps(p->func) & X86_SSE) && (0
- || a->output_format == PIPE_FORMAT_R32_FLOAT
- || a->output_format == PIPE_FORMAT_R32G32_FLOAT
- || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
- || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
- {
+ if ((x86_target_caps(p->func) & X86_SSE) &&
+ (0 || a->output_format == PIPE_FORMAT_R32_FLOAT
+ || a->output_format == PIPE_FORMAT_R32G32_FLOAT
+ || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
+ || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
- for(i = 0; i < output_desc->nr_channels; ++i)
- {
- if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
+ for (i = 0; i < output_desc->nr_channels; ++i) {
+ if (swizzle[i] == UTIL_FORMAT_SWIZZLE_0
+ && i >= input_desc->nr_channels)
swizzle[i] = i;
}
- for(i = 0; i < output_desc->nr_channels; ++i)
- {
- if(swizzle[i] < 4)
+ for (i = 0; i < output_desc->nr_channels; ++i) {
+ if (swizzle[i] < 4)
needed_chans = MAX2(needed_chans, swizzle[i] + 1);
- if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
+ if (swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
id_swizzle = FALSE;
}
- if(needed_chans > 0)
- {
- switch(input_desc->channel[0].type)
- {
+ if (needed_chans > 0) {
+ switch (input_desc->channel[0].type) {
case UTIL_FORMAT_TYPE_UNSIGNED:
- if(!(x86_target_caps(p->func) & X86_SSE2))
+ if (!(x86_target_caps(p->func) & X86_SSE2))
return FALSE;
- emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+ emit_load_sse2(p, dataXMM, src,
+ input_desc->channel[0].size *
+ input_desc->nr_channels >> 3);
/* TODO: add support for SSE4.1 pmovzx */
- switch(input_desc->channel[0].size)
- {
+ switch (input_desc->channel[0].size) {
case 8:
- /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
+ /* TODO: this may be inefficient due to get_identity() being
+ * used both as a float and integer register.
+ */
sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
break;
case 16:
sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
break;
- case 32: /* we lose precision here */
+ case 32: /* we lose precision here */
sse2_psrld_imm(p->func, dataXMM, 1);
break;
default:
return FALSE;
}
sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
- if(input_desc->channel[0].normalized)
- {
+ if (input_desc->channel[0].normalized) {
struct x86_reg factor;
- switch(input_desc->channel[0].size)
- {
+ switch (input_desc->channel[0].size) {
case 8:
factor = get_const(p, CONST_INV_255);
break;
@@ -555,17 +572,19 @@ static boolean translate_attr_convert( struct translate_sse *p,
}
sse_mulps(p->func, dataXMM, factor);
}
- else if(input_desc->channel[0].size == 32)
- sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
+ else if (input_desc->channel[0].size == 32)
+ /* compensate for the bit we threw away to fit u32 into s32 */
+ sse_addps(p->func, dataXMM, dataXMM);
break;
case UTIL_FORMAT_TYPE_SIGNED:
- if(!(x86_target_caps(p->func) & X86_SSE2))
+ if (!(x86_target_caps(p->func) & X86_SSE2))
return FALSE;
- emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+ emit_load_sse2(p, dataXMM, src,
+ input_desc->channel[0].size *
+ input_desc->nr_channels >> 3);
/* TODO: add support for SSE4.1 pmovsx */
- switch(input_desc->channel[0].size)
- {
+ switch (input_desc->channel[0].size) {
case 8:
sse2_punpcklbw(p->func, dataXMM, dataXMM);
sse2_punpcklbw(p->func, dataXMM, dataXMM);
@@ -575,17 +594,15 @@ static boolean translate_attr_convert( struct translate_sse *p,
sse2_punpcklwd(p->func, dataXMM, dataXMM);
sse2_psrad_imm(p->func, dataXMM, 16);
break;
- case 32: /* we lose precision here */
+ case 32: /* we lose precision here */
break;
default:
return FALSE;
}
sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
- if(input_desc->channel[0].normalized)
- {
+ if (input_desc->channel[0].normalized) {
struct x86_reg factor;
- switch(input_desc->channel[0].size)
- {
+ switch (input_desc->channel[0].size) {
case 8:
factor = get_const(p, CONST_INV_127);
break;
@@ -609,22 +626,25 @@ static boolean translate_attr_convert( struct translate_sse *p,
break;
case UTIL_FORMAT_TYPE_FLOAT:
- if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
+ if (input_desc->channel[0].size != 32
+ && input_desc->channel[0].size != 64) {
return FALSE;
- if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
- {
+ }
+ if (swizzle[3] == UTIL_FORMAT_SWIZZLE_1
+ && input_desc->nr_channels <= 3) {
swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
needed_chans = CHANNELS_0001;
}
- switch(input_desc->channel[0].size)
- {
+ switch (input_desc->channel[0].size) {
case 32:
- emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
+ emit_load_float32(p, dataXMM, src, needed_chans,
+ input_desc->nr_channels);
break;
- case 64: /* we lose precision here */
- if(!(x86_target_caps(p->func) & X86_SSE2))
+ case 64: /* we lose precision here */
+ if (!(x86_target_caps(p->func) & X86_SSE2))
return FALSE;
- emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
+ emit_load_float64to32(p, dataXMM, src, needed_chans,
+ input_desc->nr_channels);
break;
default:
return FALSE;
@@ -634,119 +654,124 @@ static boolean translate_attr_convert( struct translate_sse *p,
return FALSE;
}
- if(!id_swizzle)
- sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
+ if (!id_swizzle) {
+ sse_shufps(p->func, dataXMM, dataXMM,
+ SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]));
+ }
}
- if(output_desc->nr_channels >= 4
- && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
- && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
- && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
- && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
- )
+ if (output_desc->nr_channels >= 4
+ && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) {
sse_movups(p->func, dst, dataXMM);
- else
- {
- if(output_desc->nr_channels >= 2
- && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
- && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+ }
+ else {
+ if (output_desc->nr_channels >= 2
+ && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) {
sse_movlps(p->func, dst, dataXMM);
- else
- {
- if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
+ }
+ else {
+ if (swizzle[0] < UTIL_FORMAT_SWIZZLE_0) {
sse_movss(p->func, dst, dataXMM);
- else
- x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+ }
+ else {
+ x86_mov_imm(p->func, dst,
+ imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+ }
- if(output_desc->nr_channels >= 2)
- {
- if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
- {
+ if (output_desc->nr_channels >= 2) {
+ if (swizzle[1] < UTIL_FORMAT_SWIZZLE_0) {
sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
}
- else
- x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
+ else {
+ x86_mov_imm(p->func, x86_make_disp(dst, 4),
+ imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
+ }
}
}
- if(output_desc->nr_channels >= 3)
- {
- if(output_desc->nr_channels >= 4
- && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
- && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+ if (output_desc->nr_channels >= 3) {
+ if (output_desc->nr_channels >= 4
+ && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) {
sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
- else
- {
- if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
- {
+ }
+ else {
+ if (swizzle[2] < UTIL_FORMAT_SWIZZLE_0) {
sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
}
- else
- x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+ else {
+ x86_mov_imm(p->func, x86_make_disp(dst, 8),
+ imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+ }
- if(output_desc->nr_channels >= 4)
- {
- if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
- {
+ if (output_desc->nr_channels >= 4) {
+ if (swizzle[3] < UTIL_FORMAT_SWIZZLE_0) {
sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
}
- else
- x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
+ else {
+ x86_mov_imm(p->func, x86_make_disp(dst, 12),
+ imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
+ }
}
}
}
}
return TRUE;
}
- else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
- && output_desc->channel[0].normalized == input_desc->channel[0].normalized
- && (0
- || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
- || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
- || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
- ))
- {
+ else if ((x86_target_caps(p->func) & X86_SSE2)
+ && input_desc->channel[0].size == 8
+ && output_desc->channel[0].size == 16
+ && output_desc->channel[0].normalized ==
+ input_desc->channel[0].normalized &&
+ (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
+ && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
+ || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
+ && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+ || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED
+ && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) {
struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
struct x86_reg tmp = p->tmp_EAX;
- unsigned imms[2] = {0, 1};
+ unsigned imms[2] = { 0, 1 };
- for(i = 0; i < output_desc->nr_channels; ++i)
- {
- if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
+ for (i = 0; i < output_desc->nr_channels; ++i) {
+ if (swizzle[i] == UTIL_FORMAT_SWIZZLE_0
+ && i >= input_desc->nr_channels) {
swizzle[i] = i;
+ }
}
- for(i = 0; i < output_desc->nr_channels; ++i)
- {
- if(swizzle[i] < 4)
+ for (i = 0; i < output_desc->nr_channels; ++i) {
+ if (swizzle[i] < 4)
needed_chans = MAX2(needed_chans, swizzle[i] + 1);
- if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
+ if (swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
id_swizzle = FALSE;
}
- if(needed_chans > 0)
- {
- emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+ if (needed_chans > 0) {
+ emit_load_sse2(p, dataXMM, src,
+ input_desc->channel[0].size *
+ input_desc->nr_channels >> 3);
- switch(input_desc->channel[0].type)
- {
+ switch (input_desc->channel[0].type) {
case UTIL_FORMAT_TYPE_UNSIGNED:
- if(input_desc->channel[0].normalized)
- {
+ if (input_desc->channel[0].normalized) {
sse2_punpcklbw(p->func, dataXMM, dataXMM);
- if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
- sse2_psrlw_imm(p->func, dataXMM, 1);
+ if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+ sse2_psrlw_imm(p->func, dataXMM, 1);
}
else
sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
break;
case UTIL_FORMAT_TYPE_SIGNED:
- if(input_desc->channel[0].normalized)
- {
+ if (input_desc->channel[0].normalized) {
sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
sse2_punpcklbw(p->func, tmpXMM, dataXMM);
sse2_psllw_imm(p->func, dataXMM, 9);
@@ -760,8 +785,7 @@ static boolean translate_attr_convert( struct translate_sse *p,
tmpXMM = t;
}
}
- else
- {
+ else {
sse2_punpcklbw(p->func, dataXMM, dataXMM);
sse2_psraw_imm(p->func, dataXMM, 8);
}
@@ -770,43 +794,49 @@ static boolean translate_attr_convert( struct translate_sse *p,
assert(0);
}
- if(output_desc->channel[0].normalized)
- imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;
+ if (output_desc->channel[0].normalized)
+ imms[1] =
+ (output_desc->channel[0].type ==
+ UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;
- if(!id_swizzle)
- sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
+ if (!id_swizzle)
+ sse2_pshuflw(p->func, dataXMM, dataXMM,
+ (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) |
+ ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
}
- if(output_desc->nr_channels >= 4
- && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
- && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
- && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
- && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
- )
+ if (output_desc->nr_channels >= 4
+ && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) {
sse2_movq(p->func, dst, dataXMM);
- else
- {
- if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
- {
- if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+ }
+ else {
+ if (swizzle[0] < UTIL_FORMAT_SWIZZLE_0) {
+ if (output_desc->nr_channels >= 2
+ && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) {
sse2_movd(p->func, dst, dataXMM);
- else
- {
+ }
+ else {
sse2_movd(p->func, tmp, dataXMM);
x86_mov16(p->func, dst, tmp);
- if(output_desc->nr_channels >= 2)
- x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
+ if (output_desc->nr_channels >= 2)
+ x86_mov16_imm(p->func, x86_make_disp(dst, 2),
+ imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
}
}
- else
- {
- if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
- x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
- else
- {
- x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
- if(output_desc->nr_channels >= 2)
- {
+ else {
+ if (output_desc->nr_channels >= 2
+ && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0) {
+ x86_mov_imm(p->func, dst,
+ (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) |
+ imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+ }
+ else {
+ x86_mov16_imm(p->func, dst,
+ imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+ if (output_desc->nr_channels >= 2) {
sse2_movd(p->func, tmp, dataXMM);
x86_shr_imm(p->func, tmp, 16);
x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
@@ -814,36 +844,35 @@ static boolean translate_attr_convert( struct translate_sse *p,
}
}
- if(output_desc->nr_channels >= 3)
- {
- if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
- {
- if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
- {
+ if (output_desc->nr_channels >= 3) {
+ if (swizzle[2] < UTIL_FORMAT_SWIZZLE_0) {
+ if (output_desc->nr_channels >= 4
+ && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) {
sse2_psrlq_imm(p->func, dataXMM, 32);
sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
}
- else
- {
+ else {
sse2_psrlq_imm(p->func, dataXMM, 32);
sse2_movd(p->func, tmp, dataXMM);
x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
- if(output_desc->nr_channels >= 4)
- {
- x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
+ if (output_desc->nr_channels >= 4) {
+ x86_mov16_imm(p->func, x86_make_disp(dst, 6),
+ imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
}
}
}
- else
- {
- if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
- x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
- else
- {
- x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+ else {
+ if (output_desc->nr_channels >= 4
+ && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0) {
+ x86_mov_imm(p->func, x86_make_disp(dst, 4),
+ (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16)
+ | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+ }
+ else {
+ x86_mov16_imm(p->func, x86_make_disp(dst, 4),
+ imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
- if(output_desc->nr_channels >= 4)
- {
+ if (output_desc->nr_channels >= 4) {
sse2_psrlq_imm(p->func, dataXMM, 48);
sse2_movd(p->func, tmp, dataXMM);
x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
@@ -854,16 +883,17 @@ static boolean translate_attr_convert( struct translate_sse *p,
}
return TRUE;
}
- else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
- {
+ else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0],
+ sizeof(output_desc->channel[0]))) {
struct x86_reg tmp = p->tmp_EAX;
unsigned i;
- if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
- && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
- && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
- && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
- && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
- {
+
+ if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4
+ && output_desc->nr_channels == 4
+ && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
+ && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
+ && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
+ && swizzle[3] == UTIL_FORMAT_SWIZZLE_X) {
/* TODO: support movbe */
x86_mov(p->func, tmp, src);
x86_bswap(p->func, tmp);
@@ -871,18 +901,13 @@ static boolean translate_attr_convert( struct translate_sse *p,
return TRUE;
}
- for(i = 0; i < output_desc->nr_channels; ++i)
- {
- switch(output_desc->channel[0].size)
- {
+ for (i = 0; i < output_desc->nr_channels; ++i) {
+ switch (output_desc->channel[0].size) {
case 8:
- if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
- {
+ if (swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) {
unsigned v = 0;
- if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
- {
- switch(output_desc->channel[0].type)
- {
+ if (swizzle[i] == UTIL_FORMAT_SWIZZLE_1) {
+ switch (output_desc->channel[0].type) {
case UTIL_FORMAT_TYPE_UNSIGNED:
v = output_desc->channel[0].normalized ? 0xff : 1;
break;
@@ -895,20 +920,16 @@ static boolean translate_attr_convert( struct translate_sse *p,
}
x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
}
- else
- {
+ else {
x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
}
break;
case 16:
- if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
- {
+ if (swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) {
unsigned v = 0;
- if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
- {
- switch(output_desc->channel[1].type)
- {
+ if (swizzle[i] == UTIL_FORMAT_SWIZZLE_1) {
+ switch (output_desc->channel[1].type) {
case UTIL_FORMAT_TYPE_UNSIGNED:
v = output_desc->channel[1].normalized ? 0xffff : 1;
break;
@@ -924,22 +945,19 @@ static boolean translate_attr_convert( struct translate_sse *p,
}
x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
}
- else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0)
+ else if (swizzle[i] == UTIL_FORMAT_SWIZZLE_0) {
x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
- else
- {
+ }
+ else {
x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
}
break;
case 32:
- if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
- {
+ if (swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) {
unsigned v = 0;
- if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
- {
- switch(output_desc->channel[1].type)
- {
+ if (swizzle[i] == UTIL_FORMAT_SWIZZLE_1) {
+ switch (output_desc->channel[1].type) {
case UTIL_FORMAT_TYPE_UNSIGNED:
v = output_desc->channel[1].normalized ? 0xffffffff : 1;
break;
@@ -955,21 +973,17 @@ static boolean translate_attr_convert( struct translate_sse *p,
}
x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
}
- else
- {
+ else {
x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
}
break;
case 64:
- if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
- {
+ if (swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) {
unsigned l = 0;
unsigned h = 0;
- if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
- {
- switch(output_desc->channel[1].type)
- {
+ if (swizzle[i] == UTIL_FORMAT_SWIZZLE_1) {
+ switch (output_desc->channel[1].type) {
case UTIL_FORMAT_TYPE_UNSIGNED:
h = output_desc->channel[1].normalized ? 0xffffffff : 0;
l = output_desc->channel[1].normalized ? 0xffffffff : 1;
@@ -989,19 +1003,18 @@ static boolean translate_attr_convert( struct translate_sse *p,
x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
}
- else
- {
- if(x86_target_caps(p->func) & X86_SSE)
- {
+ else {
+ if (x86_target_caps(p->func) & X86_SSE) {
struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
- emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
+ emit_load64(p, tmp, tmpXMM,
+ x86_make_disp(src, swizzle[i] * 8));
emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
}
- else
- {
+ else {
x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
- x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
+ x86_mov(p->func, tmp,
+ x86_make_disp(src, swizzle[i] * 8 + 4));
x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
}
}
@@ -1013,19 +1026,18 @@ static boolean translate_attr_convert( struct translate_sse *p,
return TRUE;
}
/* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
- else if((x86_target_caps(p->func) & X86_SSE2) &&
- a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0
- || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
- || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM
- ))
- {
+ else if ((x86_target_caps(p->func) & X86_SSE2) &&
+ a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT &&
+ (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
+ || a-> output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) {
struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
/* load */
sse_movups(p->func, dataXMM, src);
- if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM)
- sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3));
+ if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
+ sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3));
+ }
/* scale by 255.0 */
sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
@@ -1042,13 +1054,13 @@ static boolean translate_attr_convert( struct translate_sse *p,
return FALSE;
}
-static boolean translate_attr( struct translate_sse *p,
- const struct translate_element *a,
- struct x86_reg src,
- struct x86_reg dst)
+
+static boolean
+translate_attr(struct translate_sse *p,
+ const struct translate_element *a,
+ struct x86_reg src, struct x86_reg dst)
{
- if(a->input_format == a->output_format)
- {
+ if (a->input_format == a->output_format) {
emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
return TRUE;
}
@@ -1056,28 +1068,29 @@ static boolean translate_attr( struct translate_sse *p,
return translate_attr_convert(p, a, src, dst);
}
-static boolean init_inputs( struct translate_sse *p,
- unsigned index_size )
+
+static boolean
+init_inputs(struct translate_sse *p, unsigned index_size)
{
unsigned i;
- struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
- get_offset(p, &p->instance_id));
- struct x86_reg start_instance = x86_make_disp(p->machine_EDI,
- get_offset(p, &p->start_instance));
+ struct x86_reg instance_id =
+ x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
+ struct x86_reg start_instance =
+ x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance));
for (i = 0; i < p->nr_buffer_variants; i++) {
struct translate_buffer_variant *variant = &p->buffer_variant[i];
struct translate_buffer *buffer = &p->buffer[variant->buffer_index];
if (!index_size || variant->instance_divisor) {
- struct x86_reg buf_max_index = x86_make_disp(p->machine_EDI,
- get_offset(p, &buffer->max_index));
- struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
- get_offset(p, &buffer->stride));
- struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
- get_offset(p, &variant->ptr));
- struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
- get_offset(p, &buffer->base_ptr));
+ struct x86_reg buf_max_index =
+ x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index));
+ struct x86_reg buf_stride =
+ x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));
+ struct x86_reg buf_ptr =
+ x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
+ struct x86_reg buf_base_ptr =
+ x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));
struct x86_reg elt = p->idx_ESI;
struct x86_reg tmp_EAX = p->tmp_EAX;
@@ -1099,7 +1112,7 @@ static boolean init_inputs( struct translate_sse *p,
*/
x86_xor(p->func, tmp_EDX, tmp_EDX);
x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
- x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */
+ x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */
/* instance = (instance_id - start_instance) / divisor +
* start_instance
@@ -1112,7 +1125,8 @@ static boolean init_inputs( struct translate_sse *p,
* per-array max value, not the draw->pt.max_index value
* that's being given to us via translate->set_buffer().
*/
- } else {
+ }
+ else {
x86_mov(p->func, tmp_EAX, elt);
/* Clamp to max_index
@@ -1132,13 +1146,11 @@ static boolean init_inputs( struct translate_sse *p,
/* In the linear case, keep the buffer pointer instead of the
* index number.
*/
- if (!index_size && p->nr_buffer_variants == 1)
- {
+ if (!index_size && p->nr_buffer_variants == 1) {
x64_rexw(p->func);
x86_mov(p->func, elt, tmp_EAX);
}
- else
- {
+ else {
x64_rexw(p->func);
x86_mov(p->func, buf_ptr, tmp_EAX);
}
@@ -1149,50 +1161,43 @@ static boolean init_inputs( struct translate_sse *p,
}
-static struct x86_reg get_buffer_ptr( struct translate_sse *p,
- unsigned index_size,
- unsigned var_idx,
- struct x86_reg elt )
+static struct x86_reg
+get_buffer_ptr(struct translate_sse *p,
+ unsigned index_size, unsigned var_idx, struct x86_reg elt)
{
if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
- return x86_make_disp(p->machine_EDI,
- get_offset(p, &p->instance_id));
+ return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
}
if (!index_size && p->nr_buffer_variants == 1) {
return p->idx_ESI;
}
else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
struct x86_reg ptr = p->src_ECX;
- struct x86_reg buf_ptr =
+ struct x86_reg buf_ptr =
x86_make_disp(p->machine_EDI,
get_offset(p, &p->buffer_variant[var_idx].ptr));
-
+
x64_rexw(p->func);
x86_mov(p->func, ptr, buf_ptr);
return ptr;
}
else {
struct x86_reg ptr = p->src_ECX;
- const struct translate_buffer_variant *variant = &p->buffer_variant[var_idx];
-
- struct x86_reg buf_stride =
+ const struct translate_buffer_variant *variant =
+ &p->buffer_variant[var_idx];
+ struct x86_reg buf_stride =
x86_make_disp(p->machine_EDI,
get_offset(p, &p->buffer[variant->buffer_index].stride));
-
- struct x86_reg buf_base_ptr =
+ struct x86_reg buf_base_ptr =
x86_make_disp(p->machine_EDI,
- get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
-
+ get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
struct x86_reg buf_max_index =
x86_make_disp(p->machine_EDI,
- get_offset(p, &p->buffer[variant->buffer_index].max_index));
-
-
+ get_offset(p, &p->buffer[variant->buffer_index].max_index));
/* Calculate pointer to current attrib:
*/
- switch(index_size)
- {
+ switch (index_size) {
case 1:
x86_movzx8(p->func, ptr, elt);
break;
@@ -1219,13 +1224,12 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p,
}
-
-static boolean incr_inputs( struct translate_sse *p,
- unsigned index_size )
+static boolean
+incr_inputs(struct translate_sse *p, unsigned index_size)
{
if (!index_size && p->nr_buffer_variants == 1) {
- struct x86_reg stride = x86_make_disp(p->machine_EDI,
- get_offset(p, &p->buffer[0].stride));
+ struct x86_reg stride =
+ x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[0].stride));
if (p->buffer_variant[0].instance_divisor == 0) {
x64_rexw(p->func);
@@ -1242,24 +1246,26 @@ static boolean incr_inputs( struct translate_sse *p,
struct translate_buffer_variant *variant = &p->buffer_variant[i];
struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
get_offset(p, &variant->ptr));
- struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
- get_offset(p, &p->buffer[variant->buffer_index].stride));
+ struct x86_reg buf_stride =
+ x86_make_disp(p->machine_EDI,
+ get_offset(p, &p->buffer[variant->buffer_index].stride));
if (variant->instance_divisor == 0) {
x86_mov(p->func, p->tmp_EAX, buf_stride);
x64_rexw(p->func);
x86_add(p->func, p->tmp_EAX, buf_ptr);
- if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
+ if (i == 0)
+ sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
x64_rexw(p->func);
x86_mov(p->func, buf_ptr, p->tmp_EAX);
}
}
- }
+ }
else {
x64_rexw(p->func);
x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
}
-
+
return TRUE;
}
@@ -1280,9 +1286,9 @@ static boolean incr_inputs( struct translate_sse *p,
* ECX -- pointer to current attribute
*
*/
-static boolean build_vertex_emit( struct translate_sse *p,
- struct x86_function *func,
- unsigned index_size )
+static boolean
+build_vertex_emit(struct translate_sse *p,
+ struct x86_function *func, unsigned index_size)
{
int fixup, label;
unsigned j;
@@ -1290,66 +1296,63 @@ static boolean build_vertex_emit( struct translate_sse *p,
memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
- p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
- p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
- p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
- p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
- p->count_EBP = x86_make_reg(file_REG32, reg_BP);
- p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
- p->src_ECX = x86_make_reg(file_REG32, reg_CX);
+ p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
+ p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
+ p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
+ p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
+ p->count_EBP = x86_make_reg(file_REG32, reg_BP);
+ p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
+ p->src_ECX = x86_make_reg(file_REG32, reg_CX);
p->func = func;
x86_init_func(p->func);
- if(x86_target(p->func) == X86_64_WIN64_ABI)
- {
- /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
- sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
- sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
+ if (x86_target(p->func) == X86_64_WIN64_ABI) {
+ /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
+ * above the return address
+ */
+ sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
+ x86_make_reg(file_XMM, 6));
+ sse2_movdqa(p->func,
+ x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
+ x86_make_reg(file_XMM, 7));
}
x86_push(p->func, p->outbuf_EBX);
x86_push(p->func, p->count_EBP);
/* on non-Win64 x86-64, these are already in the right registers */
- if(x86_target(p->func) != X86_64_STD_ABI)
- {
+ if (x86_target(p->func) != X86_64_STD_ABI) {
x86_push(p->func, p->machine_EDI);
x86_push(p->func, p->idx_ESI);
- if(x86_target(p->func) != X86_32)
- {
- x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
- x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
+ if (x86_target(p->func) != X86_32) {
+ x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
+ x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
}
- else
- {
- x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
- x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
+ else {
+ x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
+ x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
}
}
x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
- if(x86_target(p->func) != X86_32)
+ if (x86_target(p->func) != X86_32)
x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
else
x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
/* Load instance ID.
*/
- if (p->use_instancing) {
- x86_mov(p->func,
- p->tmp2_EDX,
- x86_fn_arg(p->func, 4));
+ if (p->use_instancing) {
+ x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
x86_mov(p->func,
- x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance)),
- p->tmp2_EDX);
+ x86_make_disp(p->machine_EDI,
+ get_offset(p, &p->start_instance)), p->tmp2_EDX);
- x86_mov(p->func,
- p->tmp_EAX,
- x86_fn_arg(p->func, 5));
+ x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
x86_mov(p->func,
x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
p->tmp_EAX);
@@ -1383,24 +1386,22 @@ static boolean build_vertex_emit( struct translate_sse *p,
last_variant = variant;
vb = get_buffer_ptr(p, index_size, variant, elt);
}
-
- if (!translate_attr( p, a,
- x86_make_disp(vb, a->input_offset),
- x86_make_disp(p->outbuf_EBX, a->output_offset)))
+
+ if (!translate_attr(p, a,
+ x86_make_disp(vb, a->input_offset),
+ x86_make_disp(p->outbuf_EBX, a->output_offset)))
return FALSE;
}
/* Next output vertex:
*/
x64_rexw(p->func);
- x86_lea(p->func,
- p->outbuf_EBX,
- x86_make_disp(p->outbuf_EBX,
- p->translate.key.output_stride));
+ x86_lea(p->func, p->outbuf_EBX,
+ x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));
/* Incr index
- */
- incr_inputs( p, index_size );
+ */
+ incr_inputs(p, index_size);
}
/* decr count, loop if not zero
@@ -1419,9 +1420,7 @@ static boolean build_vertex_emit( struct translate_sse *p,
/* Pop regs and return
*/
-
- if(x86_target(p->func) != X86_64_STD_ABI)
- {
+ if (x86_target(p->func) != X86_64_STD_ABI) {
x86_pop(p->func, p->idx_ESI);
x86_pop(p->func, p->machine_EDI);
}
@@ -1429,10 +1428,11 @@ static boolean build_vertex_emit( struct translate_sse *p,
x86_pop(p->func, p->count_EBP);
x86_pop(p->func, p->outbuf_EBX);
- if(x86_target(p->func) == X86_64_WIN64_ABI)
- {
- sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
- sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
+ if (x86_target(p->func) == X86_64_WIN64_ABI) {
+ sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
+ x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
+ sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
+ x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
}
x86_ret(p->func);
@@ -1440,46 +1440,41 @@ static boolean build_vertex_emit( struct translate_sse *p,
}
-
-
-
-
-
-static void translate_sse_set_buffer( struct translate *translate,
- unsigned buf,
- const void *ptr,
- unsigned stride,
- unsigned max_index )
+static void
+translate_sse_set_buffer(struct translate *translate,
+ unsigned buf,
+ const void *ptr, unsigned stride, unsigned max_index)
{
- struct translate_sse *p = (struct translate_sse *)translate;
+ struct translate_sse *p = (struct translate_sse *) translate;
if (buf < p->nr_buffers) {
- p->buffer[buf].base_ptr = (char *)ptr;
+ p->buffer[buf].base_ptr = (char *) ptr;
p->buffer[buf].stride = stride;
p->buffer[buf].max_index = max_index;
}
- if (0) debug_printf("%s %d/%d: %p %d\n",
- __FUNCTION__, buf,
- p->nr_buffers,
- ptr, stride);
+ if (0)
+ debug_printf("%s %d/%d: %p %d\n",
+ __FUNCTION__, buf, p->nr_buffers, ptr, stride);
}
-static void translate_sse_release( struct translate *translate )
+static void
+translate_sse_release(struct translate *translate)
{
- struct translate_sse *p = (struct translate_sse *)translate;
+ struct translate_sse *p = (struct translate_sse *) translate;
- x86_release_func( &p->elt8_func );
- x86_release_func( &p->elt16_func );
- x86_release_func( &p->elt_func );
- x86_release_func( &p->linear_func );
+ x86_release_func(&p->elt8_func);
+ x86_release_func(&p->elt16_func);
+ x86_release_func(&p->elt_func);
+ x86_release_func(&p->linear_func);
os_free_aligned(p);
}
-struct translate *translate_sse2_create( const struct translate_key *key )
+struct translate *
+translate_sse2_create(const struct translate_key *key)
{
struct translate_sse *p = NULL;
unsigned i;
@@ -1489,8 +1484,9 @@ struct translate *translate_sse2_create( const struct translate_key *key )
goto fail;
p = os_malloc_aligned(sizeof(struct translate_sse), 16);
- if (p == NULL)
+ if (p == NULL)
goto fail;
+
memset(p, 0, sizeof(*p));
memcpy(p->consts, consts, sizeof(consts));
@@ -1502,7 +1498,8 @@ struct translate *translate_sse2_create( const struct translate_key *key )
if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
unsigned j;
- p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
+ p->nr_buffers =
+ MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
if (key->element[i].instance_divisor) {
p->use_instancing = TRUE;
@@ -1512,25 +1509,30 @@ struct translate *translate_sse2_create( const struct translate_key *key )
* Map vertex element to vertex buffer variant.
*/
for (j = 0; j < p->nr_buffer_variants; j++) {
- if (p->buffer_variant[j].buffer_index == key->element[i].input_buffer &&
- p->buffer_variant[j].instance_divisor == key->element[i].instance_divisor) {
+ if (p->buffer_variant[j].buffer_index ==
+ key->element[i].input_buffer
+ && p->buffer_variant[j].instance_divisor ==
+ key->element[i].instance_divisor) {
break;
}
}
if (j == p->nr_buffer_variants) {
p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
- p->buffer_variant[j].instance_divisor = key->element[i].instance_divisor;
+ p->buffer_variant[j].instance_divisor =
+ key->element[i].instance_divisor;
p->nr_buffer_variants++;
}
p->element_to_buffer_variant[i] = j;
- } else {
+ }
+ else {
assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
}
}
- if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
+ if (0)
+ debug_printf("nr_buffers: %d\n", p->nr_buffers);
if (!build_vertex_emit(p, &p->linear_func, 0))
goto fail;
@@ -1564,16 +1566,16 @@ struct translate *translate_sse2_create( const struct translate_key *key )
fail:
if (p)
- translate_sse_release( &p->translate );
+ translate_sse_release(&p->translate);
return NULL;
}
-
#else
-struct translate *translate_sse2_create( const struct translate_key *key )
+struct translate *
+translate_sse2_create(const struct translate_key *key)
{
return NULL;
}