diff options
-rw-r--r-- | src/mesa/tnl/t_vb_arbprogram.c | 192 | ||||
-rw-r--r-- | src/mesa/tnl/t_vb_arbprogram.h | 9 | ||||
-rw-r--r-- | src/mesa/tnl/t_vb_arbprogram_sse.c | 92 | ||||
-rw-r--r-- | src/mesa/x86/rtasm/x86sse.c | 16 | ||||
-rw-r--r-- | src/mesa/x86/rtasm/x86sse.h | 2 |
5 files changed, 179 insertions, 132 deletions
diff --git a/src/mesa/tnl/t_vb_arbprogram.c b/src/mesa/tnl/t_vb_arbprogram.c index 88d8fe95464..d034929fe0a 100644 --- a/src/mesa/tnl/t_vb_arbprogram.c +++ b/src/mesa/tnl/t_vb_arbprogram.c @@ -115,8 +115,6 @@ static GLfloat rough_approx_log2_0_1(GLfloat x) } - - /** * Perform a reduced swizzle: */ @@ -131,12 +129,42 @@ static void do_RSW( struct arb_vp_machine *m, union instruction op ) /* Need a temporary to be correct in the case where result == arg0. */ COPY_4V(tmp, arg0); - - result[0] = tmp[GET_RSW(swz, 0)]; - result[1] = tmp[GET_RSW(swz, 1)]; - result[2] = tmp[GET_RSW(swz, 2)]; - result[3] = tmp[GET_RSW(swz, 3)]; - + + result[0] = tmp[GET_SWZ(swz, 0)]; + result[1] = tmp[GET_SWZ(swz, 1)]; + result[2] = tmp[GET_SWZ(swz, 2)]; + result[3] = tmp[GET_SWZ(swz, 3)]; + + if (neg) { + if (neg & 0x1) result[0] = -result[0]; + if (neg & 0x2) result[1] = -result[1]; + if (neg & 0x4) result[2] = -result[2]; + if (neg & 0x8) result[3] = -result[3]; + } +} + +/** + * Perform a full swizzle + */ +static void do_SWZ( struct arb_vp_machine *m, union instruction op ) +{ + GLfloat *result = m->File[0][op.rsw.dst]; + const GLfloat *arg0 = m->File[op.rsw.file0][op.rsw.idx0]; + GLuint swz = op.rsw.swz; + GLuint neg = op.rsw.neg; + GLfloat tmp[6]; + tmp[4] = 0.0; + tmp[5] = 1.0; + + /* Need a temporary to be correct in the case where result == arg0. + */ + COPY_4V(tmp, arg0); + + result[0] = tmp[GET_SWZ(swz, 0)]; + result[1] = tmp[GET_SWZ(swz, 1)]; + result[2] = tmp[GET_SWZ(swz, 2)]; + result[3] = tmp[GET_SWZ(swz, 3)]; + if (neg) { if (neg & 0x1) result[0] = -result[0]; if (neg & 0x2) result[1] = -result[1]; @@ -570,11 +598,31 @@ static void print_RSW( union instruction op ) _mesa_printf(", "); print_reg(op.rsw.file0, op.rsw.idx0); _mesa_printf("."); - for (i = 0; i < 4; i++, swz >>= 2) { - const char *cswz = "xyzw"; + for (i = 0; i < 4; i++, swz >>= 3) { + const char *cswz = "xyzw01"; if (neg & (1<<i)) _mesa_printf("-"); - _mesa_printf("%c", cswz[swz&0x3]); + _mesa_printf("%c", cswz[swz&0x7]); + } + _mesa_printf("\n"); +} + +static void print_SWZ( union instruction op ) +{ + GLuint swz = op.rsw.swz; + GLuint neg = op.rsw.neg; + GLuint i; + + _mesa_printf("SWZ "); + print_reg(0, op.rsw.dst); + _mesa_printf(", "); + print_reg(op.rsw.file0, op.rsw.idx0); + _mesa_printf("."); + for (i = 0; i < 4; i++, swz >>= 3) { + const char *cswz = "xyzw01"; + if (neg & (1<<i)) + _mesa_printf("-"); + _mesa_printf("%c", cswz[swz&0x7]); } _mesa_printf("\n"); } @@ -651,9 +699,11 @@ _tnl_disassem_vba_insn( union instruction op ) case OPCODE_RCC: case OPCODE_RET: case OPCODE_SSG: - case OPCODE_SWZ: print_NOP(op); break; + case OPCODE_SWZ: + print_SWZ(op); + break; case RSW: print_RSW(op); break; @@ -728,7 +778,7 @@ static void (* const opcode_func[MAX_OPCODE+3])(struct arb_vp_machine *, union i do_NOP,/*SSG*/ do_NOP,/*STR*/ do_SUB, - do_RSW,/*SWZ*/ + do_SWZ,/*SWZ*/ do_NOP,/*TEX*/ do_NOP,/*TXB*/ do_NOP,/*TXD*/ @@ -833,7 +883,7 @@ static struct reg cvp_emit_arg( struct compilation *cp, { struct reg reg = cvp_load_reg( cp, src->File, src->Index, src->RelAddr, arg ); union instruction rsw, noop; - + /* Emit any necessary swizzling. */ _mesa_bzero(&rsw, sizeof(rsw)); @@ -841,19 +891,17 @@ static struct reg cvp_emit_arg( struct compilation *cp, /* we're expecting 2-bit swizzles below... */ #if 1 /* XXX THESE ASSERTIONS CURRENTLY FAIL DURING GLEAN TESTS! */ +/* hopefully no longer happens? */ ASSERT(GET_SWZ(src->Swizzle, 0) < 4); ASSERT(GET_SWZ(src->Swizzle, 1) < 4); ASSERT(GET_SWZ(src->Swizzle, 2) < 4); ASSERT(GET_SWZ(src->Swizzle, 3) < 4); #endif - rsw.rsw.swz = ((GET_SWZ(src->Swizzle, 0) << 0) | - (GET_SWZ(src->Swizzle, 1) << 2) | - (GET_SWZ(src->Swizzle, 2) << 4) | - (GET_SWZ(src->Swizzle, 3) << 6)); + rsw.rsw.swz = src->Swizzle; _mesa_bzero(&noop, sizeof(noop)); noop.rsw.neg = 0; - noop.rsw.swz = RSW_NOOP; + noop.rsw.swz = SWIZZLE_NOOP; if (_mesa_memcmp(&rsw, &noop, sizeof(rsw)) !=0) { union instruction *op = cvp_next_instruction(cp); @@ -907,46 +955,6 @@ static GLuint cvp_choose_result( struct compilation *cp, } } -static struct reg cvp_emit_rsw( struct compilation *cp, - GLuint dst, - struct reg src, - GLuint neg, - GLuint swz, - GLboolean force) -{ - struct reg retval; - - if (swz != RSW_NOOP || neg != 0) { - union instruction *op = cvp_next_instruction(cp); - op->rsw.opcode = RSW; - op->rsw.dst = dst; - op->rsw.file0 = src.file; - op->rsw.idx0 = src.idx; - op->rsw.neg = neg; - op->rsw.swz = swz; - - retval.file = FILE_REG; - retval.idx = dst; - return retval; - } - else if (force) { - /* Oops. Degenerate case: - */ - union instruction *op = cvp_next_instruction(cp); - op->alu.opcode = OPCODE_MOV; - op->alu.dst = dst; - op->alu.file0 = src.file; - op->alu.idx0 = src.idx; - - retval.file = FILE_REG; - retval.idx = dst; - return retval; - } - else { - return src; - } -} - static void cvp_emit_inst( struct compilation *cp, const struct prog_instruction *inst ) @@ -998,64 +1006,26 @@ static void cvp_emit_inst( struct compilation *cp, op->alu.idx0 = reg[0].idx; break; - case OPCODE_SWZ: { - GLuint swz0 = 0, swz1 = 0; - GLuint neg0 = 0, neg1 = 0; - GLuint mask = 0; - - /* Translate 3-bit-per-element swizzle into two 2-bit swizzles, - * one from the source register the other from a constant - * {0,0,0,1}. - */ - for (i = 0; i < 4; i++) { - GLuint swzelt = GET_SWZ(inst->SrcReg[0].Swizzle, i); - if (swzelt >= SWIZZLE_ZERO) { - neg0 |= inst->SrcReg[0].NegateBase & (1<<i); - if (swzelt == SWIZZLE_ONE) - swz0 |= SWIZZLE_W << (i*2); - else if (i < SWIZZLE_W) - swz0 |= i << (i*2); - } - else { - mask |= 1<<i; - neg1 |= inst->SrcReg[0].NegateBase & (1<<i); - swz1 |= swzelt << (i*2); - } - } + case OPCODE_END: + break; + case OPCODE_SWZ: result = cvp_choose_result( cp, &inst->DstReg, &fixup ); - reg[0].file = FILE_REG; - reg[0].idx = REG_ID; - reg[1] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 ); - - if (mask == WRITEMASK_XYZW) { - cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE); - - } - else if (mask == 0) { - cvp_emit_rsw(cp, result, reg[1], neg1, swz1, GL_TRUE); - } - else { - cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE); - reg[1] = cvp_emit_rsw(cp, REG_ARG0, reg[1], neg1, swz1, GL_FALSE); - - op = cvp_next_instruction(cp); - op->msk.opcode = MSK; - op->msk.dst = result; - op->msk.file = reg[1].file; - op->msk.idx = reg[1].idx; - op->msk.mask = mask; - } + reg[0] = cvp_load_reg( cp, inst->SrcReg[0].File, + inst->SrcReg[0].Index, inst->SrcReg[0].RelAddr, REG_ARG0 ); + op = cvp_next_instruction(cp); + op->rsw.opcode = inst->Opcode; + op->rsw.file0 = reg[0].file; + op->rsw.idx0 = reg[0].idx; + op->rsw.dst = result; + op->rsw.swz = inst->SrcReg[0].Swizzle; + op->rsw.neg = inst->SrcReg[0].NegateBase; if (result == REG_RES) { op = cvp_next_instruction(cp); *op = fixup; } break; - } - - case OPCODE_END: - break; default: result = cvp_choose_result( cp, &inst->DstReg, &fixup ); @@ -1074,7 +1044,7 @@ static void cvp_emit_inst( struct compilation *cp, if (result == REG_RES) { op = cvp_next_instruction(cp); *op = fixup; - } + } break; } } @@ -1485,7 +1455,7 @@ static GLboolean init_vertex_program( GLcontext *ctx, */ ASSIGN_4V(m->File[0][REG_ID], 0, 0, 0, 1); ASSIGN_4V(m->File[0][REG_ONES], 1, 1, 1, 1); - ASSIGN_4V(m->File[0][REG_SWZ], -1, 1, 0, 0); + ASSIGN_4V(m->File[0][REG_SWZ], 1, -1, 0, 0); ASSIGN_4V(m->File[0][REG_NEG], -1, -1, -1, -1); ASSIGN_4V(m->File[0][REG_LIT], 1, 0, 0, 1); ASSIGN_4V(m->File[0][REG_LIT2], 1, .5, .2, 1); /* debug value */ diff --git a/src/mesa/tnl/t_vb_arbprogram.h b/src/mesa/tnl/t_vb_arbprogram.h index 60786d6a016..dab725d7f7a 100644 --- a/src/mesa/tnl/t_vb_arbprogram.h +++ b/src/mesa/tnl/t_vb_arbprogram.h @@ -61,7 +61,7 @@ #define REG_IN31 63 #define REG_ID 64 /* 0,0,0,1 */ #define REG_ONES 65 /* 1,1,1,1 */ -#define REG_SWZ 66 /* -1,1,0,0 */ +#define REG_SWZ 66 /* 1,-1,0,0 */ #define REG_NEG 67 /* -1,-1,-1,-1 */ #define REG_LIT 68 /* 1,0,0,1 */ #define REG_LIT2 69 /* 1,0,0,1 */ @@ -98,7 +98,7 @@ union instruction { GLuint file0:2; GLuint idx0:7; GLuint neg:4; - GLuint swz:8; /* xyzw only */ + GLuint swz:12; /* xyzw01 */ } rsw; struct { @@ -114,11 +114,8 @@ union instruction { /** - * Reduced swizzle is a 2-bit field; only X/Y/Z/W are allowed, not 0/1. + * Reduced swizzle is a 3-bit field, for simplicity same as normal swizzle, X/Y/Z/W/0/1 allowed. */ -#define RSW_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6)) -#define GET_RSW(swz, idx) (((swz) >> ((idx)*2)) & 0x3) - struct input { GLuint idx; diff --git a/src/mesa/tnl/t_vb_arbprogram_sse.c b/src/mesa/tnl/t_vb_arbprogram_sse.c index 19061c0d8d1..b9126d6d886 100644 --- a/src/mesa/tnl/t_vb_arbprogram_sse.c +++ b/src/mesa/tnl/t_vb_arbprogram_sse.c @@ -294,11 +294,12 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op ) { struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0); struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst); - GLuint swz = op.rsw.swz; + GLuint swz = GET_SWZ(op.rsw.swz, 0) | (GET_SWZ(op.rsw.swz, 1) << 2) | + (GET_SWZ(op.rsw.swz, 2) << 4| (GET_SWZ(op.rsw.swz, 3) << 6)); GLuint neg = op.rsw.neg; emit_pshufd(cp, dst, arg0, swz); - + if (neg) { struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ); struct x86_reg tmp = get_xmm_reg(cp); @@ -306,6 +307,7 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op ) * Use neg as arg to pshufd * Multiply */ + /* is the emit_pshufd necessary? only SWZ can negate individual components */ emit_pshufd(cp, tmp, negs, SHUF((neg & 1) ? 1 : 0, (neg & 2) ? 1 : 0, @@ -317,6 +319,64 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op ) return GL_TRUE; } +/* Perform a full swizzle + */ +static GLboolean emit_SWZ( struct compilation *cp, union instruction op ) +{ + struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0); + struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst); + struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ); + struct x86_reg tmp = get_xmm_reg(cp); + GLubyte neg = op.rsw.neg; + GLubyte shuf2, swz, savepos, savemask, swizzle[4]; + + swizzle[0] = GET_SWZ(op.rsw.swz, 0); + swizzle[1] = GET_SWZ(op.rsw.swz, 1); + swizzle[2] = GET_SWZ(op.rsw.swz, 2); + swizzle[3] = GET_SWZ(op.rsw.swz, 3); + + swz = SHUF((swizzle[0] & 3), (swizzle[1] & 3), + (swizzle[2] & 3), (swizzle[3] & 3)); + + emit_pshufd(cp, dst, arg0, swz); + + /* can handle negation and replace with zero with the same shuffle/mul */ + shuf2 = SHUF(swizzle[0] == 4 ? 2 : (neg & 1), + swizzle[1] == 4 ? 2 : ((neg & 2) >> 1), + swizzle[2] == 4 ? 2 : ((neg & 4) >> 2), + swizzle[3] == 4 ? 2 : ((neg & 8) >> 3)); + + /* now the hard part is getting those 1's in there... */ + savepos = 0; + savemask = 0; + if (swizzle[0] == 5) savepos = 1; + if (swizzle[1] == 5) savepos = 2; + else savemask |= 1 << 2; + if (swizzle[2] == 5) savepos = 3; + else savemask |= 2 << 4; + if (swizzle[3] == 5) savepos = 4; + else savemask |= 3 << 6; + if (savepos) { + /* need a mov first as movss from memory will overwrite high bits of xmm reg */ + sse_movups(&cp->func, tmp, negs); + /* can only replace lowest 32bits, thus move away that part first */ + emit_pshufd(cp, dst, dst, savemask); + sse_movss(&cp->func, dst, tmp); + emit_pshufd(cp, dst, dst, (savepos - 1) | (savemask & 0xfc)); + } + + if (shuf2) { + /* Load 1,-1,0,0 + * Use neg as arg to pshufd + * Multiply + */ + emit_pshufd(cp, tmp, negs, shuf2); + sse_mulps(&cp->func, dst, tmp); + } + + return GL_TRUE; +} + /* Helper for writemask: */ static GLboolean emit_shuf_copy1( struct compilation *cp, @@ -595,20 +655,19 @@ static GLboolean emit_DPH( struct compilation *cp, union instruction op ) struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst); - struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES); - struct x86_reg tmp = get_xmm_reg(cp); + struct x86_reg tmp = get_xmm_reg(cp); - emit_pshufd(cp, dst, arg0, SHUF(W,X,Y,Z)); - sse_movss(&cp->func, dst, ones); - emit_pshufd(cp, dst, dst, SHUF(W,X,Y,Z)); + sse_movups(&cp->func, dst, arg0); sse_mulps(&cp->func, dst, arg1); - - /* Now the hard bit: sum the values (from DP4): + + /* Now the hard bit: sum the values (from DP3): */ sse_movhlps(&cp->func, tmp, dst); - sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */ + sse_addss(&cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */ emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); sse_addss(&cp->func, dst, tmp); + emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W)); + sse_addss(&cp->func, dst, tmp); sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X)); return GL_TRUE; } @@ -985,15 +1044,18 @@ static GLboolean emit_RSQ( struct compilation *cp, union instruction op ) { struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst); - - /* TODO: Calculate absolute value - */ #if 0 + struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG); + +/* get abs value first. This STILL doesn't work. + Looks like we get bogus neg values ? +*/ sse_movss(&cp->func, dst, arg0); sse_mulss(&cp->func, dst, neg); sse_maxss(&cp->func, dst, arg0); -#endif + sse_rsqrtss(&cp->func, dst, dst); +#endif sse_rsqrtss(&cp->func, dst, arg0); sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X)); return GL_TRUE; @@ -1132,7 +1194,7 @@ static GLboolean (* const emit_func[])(struct compilation *, union instruction) emit_NOP, /* SSG */ emit_NOP, /* STR */ emit_SUB, - emit_RSW, /* SWZ */ + emit_SWZ, /* SWZ */ emit_NOP, /* TEX */ emit_NOP, /* TXB */ emit_NOP, /* TXD */ diff --git a/src/mesa/x86/rtasm/x86sse.c b/src/mesa/x86/rtasm/x86sse.c index 9f34004ba0c..6137aef8ece 100644 --- a/src/mesa/x86/rtasm/x86sse.c +++ b/src/mesa/x86/rtasm/x86sse.c @@ -424,6 +424,14 @@ void sse_maxps( struct x86_function *p, emit_modrm( p, dst, src ); } +void sse_maxss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_3ub(p, 0xF3, X86_TWOB, 0x5F); + emit_modrm( p, dst, src ); +} + void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) @@ -456,6 +464,14 @@ void sse_mulps( struct x86_function *p, emit_modrm( p, dst, src ); } +void sse_mulss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_3ub(p, 0xF3, X86_TWOB, 0x59); + emit_modrm( p, dst, src ); +} + void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) diff --git a/src/mesa/x86/rtasm/x86sse.h b/src/mesa/x86/rtasm/x86sse.h index 430cf2f939d..5ec54894311 100644 --- a/src/mesa/x86/rtasm/x86sse.h +++ b/src/mesa/x86/rtasm/x86sse.h @@ -156,6 +156,7 @@ void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_cmpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src, GLubyte cc ); void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_movaps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_movhlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); @@ -165,6 +166,7 @@ void sse_movlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src void sse_movss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_movups( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_mulps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_mulss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_subps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, GLubyte shuf ); |