summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/gallium/drivers/r600/r600_asm.c2
-rw-r--r--src/gallium/drivers/r600/r600_shader.c724
2 files changed, 593 insertions, 133 deletions
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 23350e25967..7c76bd5abad 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -191,6 +191,8 @@ static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *
case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE:
case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT:
case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_LOAD_P0:
+ case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT:
+ case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT:
return 1;
default: R600_ERR(
"Need instruction operand number for 0x%x.\n", alu->inst);
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 9d15d0211ae..16e662de09e 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -2019,224 +2019,682 @@ static int tgsi_pow(struct r600_shader_ctx *ctx)
return tgsi_helper_tempx_replicate(ctx);
}
-static int tgsi_idiv(struct r600_shader_ctx *ctx)
+static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
{
struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
struct r600_bytecode_alu alu;
int i, r;
unsigned write_mask = inst->Dst[0].Register.WriteMask;
- int last_inst = tgsi_last_instruction(write_mask);
int tmp0 = ctx->temp_reg;
int tmp1 = r600_get_temp(ctx);
- int unsigned_op = (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_UDIV);
+ int tmp2 = r600_get_temp(ctx);
+
+ /* Unsigned path:
+ *
+ * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
+ *
+ * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
+ * 2. tmp0.z = lo (tmp0.x * src2)
+ * 3. tmp0.w = -tmp0.z
+ * 4. tmp0.y = hi (tmp0.x * src2)
+ * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
+ * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
+ * 7. tmp1.x = tmp0.x - tmp0.w
+ * 8. tmp1.y = tmp0.x + tmp0.w
+ * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
+ * 10. tmp0.z = hi(tmp0.x * src1) = q
+ * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
+ *
+ * 12. tmp0.w = src1 - tmp0.y = r
+ * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
+ * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
+ *
+ * if DIV
+ *
+ * 15. tmp1.z = tmp0.z + 1 = q + 1
+ * 16. tmp1.w = tmp0.z - 1 = q - 1
+ *
+ * else MOD
+ *
+ * 15. tmp1.z = tmp0.w - src2 = r - src2
+ * 16. tmp1.w = tmp0.w + src2 = r + src2
+ *
+ * endif
+ *
+ * 17. tmp1.x = tmp1.x & tmp1.y
+ *
+ * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
+ * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
+ *
+ * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
+ * 20. dst = src2==0 ? MAX_UINT : tmp0.z
+ *
+ * Signed path:
+ *
+ * Same as unsigned, using abs values of the operands,
+ * and fixing the sign of the result in the end.
+ */
- /* tmp0 = float(src0) */
for (i = 0; i < 4; i++) {
if (!(write_mask & (1<<i)))
continue;
- memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ if (signed_op) {
- if (unsigned_op)
- alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
- else
- alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
+ /* tmp2.x = -src0 */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
- alu.dst.sel = tmp0;
- alu.dst.chan = i;
- alu.dst.write = 1;
+ alu.dst.sel = tmp2;
+ alu.dst.chan = 0;
+ alu.dst.write = 1;
- r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
- alu.last = 1;
- r = r600_bytecode_add_alu(ctx->bc, &alu);
- if (r)
- return r;
- }
+ alu.src[0].sel = V_SQ_ALU_SRC_0;
- if (!unsigned_op) {
- /* tmp1 = tmp0>=0 ? 0.5 : -0.5 for int*/
- for (i = 0; i < 4; i++) {
- if (!(write_mask & (1<<i)))
- continue;
+ r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* tmp2.y = -src1 */
memset(&alu, 0, sizeof(struct r600_bytecode_alu));
- alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE);
- alu.is_op3 = 1;
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
- alu.dst.sel = tmp1;
- alu.dst.chan = i;
+ alu.dst.sel = tmp2;
+ alu.dst.chan = 1;
alu.dst.write = 1;
- alu.src[0].sel = tmp0;
- alu.src[0].chan = i;
+ alu.src[0].sel = V_SQ_ALU_SRC_0;
- alu.src[1].sel = V_SQ_ALU_SRC_0_5;
+ r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
- if (unsigned_op)
- alu.src[2].sel = V_SQ_ALU_SRC_0;
- else {
- alu.src[2].sel = V_SQ_ALU_SRC_0_5;
- alu.src[2].neg = 1;
- }
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* tmp2.z sign bit is set if src0 and src2 signs are different */
+ /* it will be a sign of the quotient */
+ if (!mod) {
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT);
+
+ alu.dst.sel = tmp2;
+ alu.dst.chan = 2;
+ alu.dst.write = 1;
+
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
+ r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
- if (i == last_inst)
alu.last = 1;
- r = r600_bytecode_add_alu(ctx->bc, &alu);
- if (r)
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+ }
+
+ /* tmp2.x = |src0| */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
+ alu.is_op3 = 1;
+
+ alu.dst.sel = tmp2;
+ alu.dst.chan = 0;
+ alu.dst.write = 1;
+
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
+ r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
+ alu.src[2].sel = tmp2;
+ alu.src[2].chan = 0;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* tmp2.y = |src1| */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
+ alu.is_op3 = 1;
+
+ alu.dst.sel = tmp2;
+ alu.dst.chan = 1;
+ alu.dst.write = 1;
+
+ r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
+ r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
+ alu.src[2].sel = tmp2;
+ alu.src[2].chan = 1;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
return r;
+
}
- }
- /* tmp0 = tmp0 + tmp1 for int */
- /* tmp0 = tmp0 + 0.5 for uint */
- for (i = 0; i < 4; i++) {
- if (!(write_mask & (1<<i)))
- continue;
+ /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT);
+
+ alu.dst.sel = tmp0;
+ alu.dst.chan = 0;
+ alu.dst.write = 1;
+ if (signed_op) {
+ alu.src[0].sel = tmp2;
+ alu.src[0].chan = 1;
+ } else {
+ r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
+ }
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* 2. tmp0.z = lo (tmp0.x * src2) */
memset(&alu, 0, sizeof(struct r600_bytecode_alu));
- alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
alu.dst.sel = tmp0;
- alu.dst.chan = i;
+ alu.dst.chan = 2;
alu.dst.write = 1;
alu.src[0].sel = tmp0;
- alu.src[0].chan = i;
+ alu.src[0].chan = 0;
+ if (signed_op) {
+ alu.src[1].sel = tmp2;
+ alu.src[1].chan = 1;
+ } else {
+ r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
+ }
- if (unsigned_op)
- alu.src[1].sel = V_SQ_ALU_SRC_0_5;
- else {
- alu.src[1].sel = tmp1;
- alu.src[1].chan = i;
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* 3. tmp0.w = -tmp0.z */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
+
+ alu.dst.sel = tmp0;
+ alu.dst.chan = 3;
+ alu.dst.write = 1;
+
+ alu.src[0].sel = V_SQ_ALU_SRC_0;
+ alu.src[1].sel = tmp0;
+ alu.src[1].chan = 2;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* 4. tmp0.y = hi (tmp0.x * src2) */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
+
+ alu.dst.sel = tmp0;
+ alu.dst.chan = 1;
+ alu.dst.write = 1;
+
+ alu.src[0].sel = tmp0;
+ alu.src[0].chan = 0;
+
+ if (signed_op) {
+ alu.src[1].sel = tmp2;
+ alu.src[1].chan = 1;
+ } else {
+ r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
}
- if (i == last_inst)
- alu.last = 1;
- r = r600_bytecode_add_alu(ctx->bc, &alu);
- if (r)
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
return r;
- }
- /* tmp1 = float(src1) */
- for (i = 0; i < 4; i++) {
- if (!(write_mask & (1<<i)))
- continue;
+ /* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
+ alu.is_op3 = 1;
+ alu.dst.sel = tmp0;
+ alu.dst.chan = 2;
+ alu.dst.write = 1;
+
+ alu.src[0].sel = tmp0;
+ alu.src[0].chan = 1;
+ alu.src[1].sel = tmp0;
+ alu.src[1].chan = 3;
+ alu.src[2].sel = tmp0;
+ alu.src[2].chan = 2;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
- if (unsigned_op)
- alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
- else
- alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
+ alu.dst.sel = tmp0;
+ alu.dst.chan = 3;
+ alu.dst.write = 1;
+
+ alu.src[0].sel = tmp0;
+ alu.src[0].chan = 2;
+
+ alu.src[1].sel = tmp0;
+ alu.src[1].chan = 0;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* 7. tmp1.x = tmp0.x - tmp0.w */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
alu.dst.sel = tmp1;
- alu.dst.chan = i;
+ alu.dst.chan = 0;
alu.dst.write = 1;
- r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
+ alu.src[0].sel = tmp0;
+ alu.src[0].chan = 0;
+ alu.src[1].sel = tmp0;
+ alu.src[1].chan = 3;
+
alu.last = 1;
- r = r600_bytecode_add_alu(ctx->bc, &alu);
- if (r)
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
return r;
- }
-
- /* tmp1 = 1.0/src1 */
- for (i = 0; i < 4; i++) {
- if (!(write_mask & (1<<i)))
- continue;
+ /* 8. tmp1.y = tmp0.x + tmp0.w */
memset(&alu, 0, sizeof(struct r600_bytecode_alu));
- alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
alu.dst.sel = tmp1;
- alu.dst.chan = i;
+ alu.dst.chan = 1;
alu.dst.write = 1;
- alu.src[0].sel = tmp1;
- alu.src[0].chan = i;
+ alu.src[0].sel = tmp0;
+ alu.src[0].chan = 0;
+ alu.src[1].sel = tmp0;
+ alu.src[1].chan = 3;
alu.last = 1;
- r = r600_bytecode_add_alu(ctx->bc, &alu);
- if (r)
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
return r;
- }
- /* tmp1 = tmp0 * tmp1 */
- for (i = 0; i < 4; i++) {
- if (!(write_mask & (1<<i)))
- continue;
+ /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
+ alu.is_op3 = 1;
+
+ alu.dst.sel = tmp0;
+ alu.dst.chan = 0;
+ alu.dst.write = 1;
+ alu.src[0].sel = tmp0;
+ alu.src[0].chan = 1;
+ alu.src[1].sel = tmp1;
+ alu.src[1].chan = 1;
+ alu.src[2].sel = tmp1;
+ alu.src[2].chan = 0;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* 10. tmp0.z = hi(tmp0.x * src1) = q */
memset(&alu, 0, sizeof(struct r600_bytecode_alu));
- alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
+
+ alu.dst.sel = tmp0;
+ alu.dst.chan = 2;
+ alu.dst.write = 1;
+
+ alu.src[0].sel = tmp0;
+ alu.src[0].chan = 0;
+
+ if (signed_op) {
+ alu.src[1].sel = tmp2;
+ alu.src[1].chan = 0;
+ } else {
+ r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
+ }
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
+
+ alu.dst.sel = tmp0;
+ alu.dst.chan = 1;
+ alu.dst.write = 1;
+
+ if (signed_op) {
+ alu.src[0].sel = tmp2;
+ alu.src[0].chan = 1;
+ } else {
+ r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
+ }
+
+ alu.src[1].sel = tmp0;
+ alu.src[1].chan = 2;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* 12. tmp0.w = src1 - tmp0.y = r */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
+
+ alu.dst.sel = tmp0;
+ alu.dst.chan = 3;
+ alu.dst.write = 1;
+
+ if (signed_op) {
+ alu.src[0].sel = tmp2;
+ alu.src[0].chan = 0;
+ } else {
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
+ }
+
+ alu.src[1].sel = tmp0;
+ alu.src[1].chan = 1;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
alu.dst.sel = tmp1;
- alu.dst.chan = i;
+ alu.dst.chan = 0;
alu.dst.write = 1;
- alu.src[0].sel = ctx->temp_reg;
- alu.src[0].chan = i;
+ alu.src[0].sel = tmp0;
+ alu.src[0].chan = 3;
+ if (signed_op) {
+ alu.src[1].sel = tmp2;
+ alu.src[1].chan = 1;
+ } else {
+ r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
+ }
- alu.src[1].sel = tmp1;
- alu.src[1].chan = i;
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
- if (i == last_inst)
- alu.last = 1;
- r = r600_bytecode_add_alu(ctx->bc, &alu);
- if (r)
+ /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
+
+ alu.dst.sel = tmp1;
+ alu.dst.chan = 1;
+ alu.dst.write = 1;
+
+ if (signed_op) {
+ alu.src[0].sel = tmp2;
+ alu.src[0].chan = 0;
+ } else {
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
+ }
+
+ alu.src[1].sel = tmp0;
+ alu.src[1].chan = 1;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
return r;
- }
- /* tmp1 = trunc(tmp1) for evergreen+ */
- if (ctx->bc->chip_class >= EVERGREEN) {
- for (i = 0; i < 4; i++) {
- if (!(write_mask & (1<<i)))
- continue;
+ if (mod) { /* UMOD */
+ /* 15. tmp1.z = tmp0.w - src2 = r - src2 */
memset(&alu, 0, sizeof(struct r600_bytecode_alu));
- alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC);
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
alu.dst.sel = tmp1;
- alu.dst.chan = i;
+ alu.dst.chan = 2;
alu.dst.write = 1;
- alu.src[0].sel = tmp1;
- alu.src[0].chan = i;
+ alu.src[0].sel = tmp0;
+ alu.src[0].chan = 3;
- if (i == last_inst)
- alu.last = 1;
- r = r600_bytecode_add_alu(ctx->bc, &alu);
- if (r)
+ if (signed_op) {
+ alu.src[1].sel = tmp2;
+ alu.src[1].chan = 1;
+ } else {
+ r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
+ }
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
return r;
+
+ /* 16. tmp1.w = tmp0.w + src2 = r + src2 */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
+
+ alu.dst.sel = tmp1;
+ alu.dst.chan = 3;
+ alu.dst.write = 1;
+
+ alu.src[0].sel = tmp0;
+ alu.src[0].chan = 3;
+ if (signed_op) {
+ alu.src[1].sel = tmp2;
+ alu.src[1].chan = 1;
+ } else {
+ r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
+ }
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ } else { /* UDIV */
+
+ /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
+
+ alu.dst.sel = tmp1;
+ alu.dst.chan = 2;
+ alu.dst.write = 1;
+
+ alu.src[0].sel = tmp0;
+ alu.src[0].chan = 2;
+ alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* 16. tmp1.w = tmp0.z - 1 = q - 1 */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
+
+ alu.dst.sel = tmp1;
+ alu.dst.chan = 3;
+ alu.dst.write = 1;
+
+ alu.src[0].sel = tmp0;
+ alu.src[0].chan = 2;
+ alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
}
- }
- /* dst = int(tmp1) */
- for (i = 0; i < 4; i++) {
- if (!(write_mask & (1<<i)))
- continue;
+ /* 17. tmp1.x = tmp1.x & tmp1.y */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
+
+ alu.dst.sel = tmp1;
+ alu.dst.chan = 0;
+ alu.dst.write = 1;
+
+ alu.src[0].sel = tmp1;
+ alu.src[0].chan = 0;
+ alu.src[1].sel = tmp1;
+ alu.src[1].chan = 1;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+ /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
+ /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
+ alu.is_op3 = 1;
- if (unsigned_op)
- alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
- else
- alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT);
+ alu.dst.sel = tmp0;
+ alu.dst.chan = 2;
+ alu.dst.write = 1;
- tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+ alu.src[0].sel = tmp1;
+ alu.src[0].chan = 0;
+ alu.src[1].sel = tmp0;
+ alu.src[1].chan = mod ? 3 : 2;
+ alu.src[2].sel = tmp1;
+ alu.src[2].chan = 2;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
+ alu.is_op3 = 1;
+
+ if (signed_op) {
+ alu.dst.sel = tmp0;
+ alu.dst.chan = 2;
+ alu.dst.write = 1;
+ } else {
+ tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+ }
alu.src[0].sel = tmp1;
- alu.src[0].chan = i;
+ alu.src[0].chan = 1;
+ alu.src[1].sel = tmp1;
+ alu.src[1].chan = 3;
+ alu.src[2].sel = tmp0;
+ alu.src[2].chan = 2;
- if ((ctx->bc->chip_class < EVERGREEN || unsigned_op) || i == last_inst)
- alu.last = 1;
- r = r600_bytecode_add_alu(ctx->bc, &alu);
- if (r)
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
return r;
- }
+ if (signed_op) {
+
+ /* fix the sign of the result */
+
+ if (mod) {
+
+ /* tmp0.x = -tmp0.z */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
+
+ alu.dst.sel = tmp0;
+ alu.dst.chan = 0;
+ alu.dst.write = 1;
+
+ alu.src[0].sel = V_SQ_ALU_SRC_0;
+ alu.src[1].sel = tmp0;
+ alu.src[1].chan = 2;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* sign of the remainder is the same as the sign of src0 */
+ /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
+ alu.is_op3 = 1;
+
+ tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
+ alu.src[1].sel = tmp0;
+ alu.src[1].chan = 2;
+ alu.src[2].sel = tmp0;
+ alu.src[2].chan = 0;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ } else {
+
+ /* tmp0.x = -tmp0.z */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
+
+ alu.dst.sel = tmp0;
+ alu.dst.chan = 0;
+ alu.dst.write = 1;
+
+ alu.src[0].sel = V_SQ_ALU_SRC_0;
+ alu.src[1].sel = tmp0;
+ alu.src[1].chan = 2;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+
+ /* fix the quotient sign (same as the sign of src0*src1) */
+ /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
+ alu.is_op3 = 1;
+
+ tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+
+ alu.src[0].sel = tmp2;
+ alu.src[0].chan = 2;
+ alu.src[1].sel = tmp0;
+ alu.src[1].chan = 2;
+ alu.src[2].sel = tmp0;
+ alu.src[2].chan = 0;
+
+ alu.last = 1;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+ }
+ }
+ }
return 0;
}
+static int tgsi_udiv(struct r600_shader_ctx *ctx)
+{
+ return tgsi_divmod(ctx, 0, 0);
+}
+
+static int tgsi_umod(struct r600_shader_ctx *ctx)
+{
+ return tgsi_divmod(ctx, 1, 0);
+}
+
+static int tgsi_idiv(struct r600_shader_ctx *ctx)
+{
+ return tgsi_divmod(ctx, 0, 1);
+}
+
+static int tgsi_imod(struct r600_shader_ctx *ctx)
+{
+ return tgsi_divmod(ctx, 1, 1);
+}
+
+
static int tgsi_f2i(struct r600_shader_ctx *ctx)
{
struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
@@ -4122,7 +4580,7 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
{88, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
{TGSI_OPCODE_AND, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
{TGSI_OPCODE_OR, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
- {TGSI_OPCODE_MOD, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+ {TGSI_OPCODE_MOD, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
{TGSI_OPCODE_XOR, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
{TGSI_OPCODE_SAD, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
{TGSI_OPCODE_TXF, 0, SQ_TEX_INST_LD, tgsi_tex},
@@ -4164,11 +4622,11 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
{TGSI_OPCODE_F2U, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2},
{TGSI_OPCODE_U2F, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
{TGSI_OPCODE_UADD, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
- {TGSI_OPCODE_UDIV, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
+ {TGSI_OPCODE_UDIV, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
{TGSI_OPCODE_UMAD, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
{TGSI_OPCODE_UMAX, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
{TGSI_OPCODE_UMIN, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
- {TGSI_OPCODE_UMOD, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+ {TGSI_OPCODE_UMOD, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
{TGSI_OPCODE_UMUL, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
{TGSI_OPCODE_USEQ, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
{TGSI_OPCODE_USGE, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
@@ -4296,7 +4754,7 @@ static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
{88, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
{TGSI_OPCODE_AND, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
{TGSI_OPCODE_OR, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
- {TGSI_OPCODE_MOD, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+ {TGSI_OPCODE_MOD, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
{TGSI_OPCODE_XOR, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
{TGSI_OPCODE_SAD, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
{TGSI_OPCODE_TXF, 0, SQ_TEX_INST_LD, tgsi_tex},
@@ -4338,11 +4796,11 @@ static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
{TGSI_OPCODE_F2U, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_f2i},
{TGSI_OPCODE_U2F, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2},
{TGSI_OPCODE_UADD, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
- {TGSI_OPCODE_UDIV, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
+ {TGSI_OPCODE_UDIV, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
{TGSI_OPCODE_UMAD, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
{TGSI_OPCODE_UMAX, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
{TGSI_OPCODE_UMIN, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
- {TGSI_OPCODE_UMOD, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+ {TGSI_OPCODE_UMOD, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
{TGSI_OPCODE_UMUL, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
{TGSI_OPCODE_USEQ, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
{TGSI_OPCODE_USGE, 0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},