diff options
-rw-r--r-- | src/gallium/drivers/nvc0/nvc0_pc_optimize.c | 127 |
1 files changed, 79 insertions, 48 deletions
diff --git a/src/gallium/drivers/nvc0/nvc0_pc_optimize.c b/src/gallium/drivers/nvc0/nvc0_pc_optimize.c index e0d4e2daf9b..b6d99724a10 100644 --- a/src/gallium/drivers/nvc0/nvc0_pc_optimize.c +++ b/src/gallium/drivers/nvc0/nvc0_pc_optimize.c @@ -607,25 +607,83 @@ constant_operand(struct nv_pc *pc, } } +static void +handle_min_max(struct nv_pass *ctx, struct nv_instruction *nvi) +{ + struct nv_value *src0 = nvi->src[0]->value; + struct nv_value *src1 = nvi->src[1]->value; + + if (src0 != src1 || (nvi->src[0]->mod | nvi->src[1]->mod)) + return; + if (src0->reg.file != NV_FILE_GPR) + return; + nvc0_pc_replace_value(ctx->pc, nvi->def[0], src0); + nvc0_insn_delete(nvi); +} + +/* check if we can MUL + ADD -> MAD/FMA */ +static void +handle_add_mul(struct nv_pass *ctx, struct nv_instruction *nvi) +{ + struct nv_value *src0 = nvi->src[0]->value; + struct nv_value *src1 = nvi->src[1]->value; + struct nv_value *src; + int s; + uint8_t mod[4]; + + if (SRC_IS_MUL(src0) && src0->refc == 1) s = 0; + else + if (SRC_IS_MUL(src1) && src1->refc == 1) s = 1; + else + return; + + if ((src0->insn && src0->insn->bb != nvi->bb) || + (src1->insn && src1->insn->bb != nvi->bb)) + return; + + /* check for immediates from prior constant folding */ + if (src0->reg.file != NV_FILE_GPR || src1->reg.file != NV_FILE_GPR) + return; + src = nvi->src[s]->value; + + mod[0] = nvi->src[0]->mod; + mod[1] = nvi->src[1]->mod; + mod[2] = src->insn->src[0]->mod; + mod[3] = src->insn->src[1]->mod; + + if ((mod[0] | mod[1] | mod[2] | mod[3]) & ~NV_MOD_NEG) + return; + + nvi->opcode = NV_OP_MAD_F32; + + nv_reference(ctx->pc, nvi, s, NULL); + nvi->src[2] = nvi->src[!s]; + nvi->src[!s] = NULL; + + nv_reference(ctx->pc, nvi, 0, src->insn->src[0]->value); + nvi->src[0]->mod = mod[2] ^ mod[s]; + nv_reference(ctx->pc, nvi, 1, src->insn->src[1]->value); + nvi->src[1]->mod = mod[3]; +} + static int -nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b) +nv_pass_algebraic_opt(struct nv_pass *ctx, struct nv_basic_block *b) { struct nv_instruction *nvi, *next; int j; for (nvi = b->entry; nvi; nvi = next) { - struct nv_value *src0, *src1, *src; - int s; - uint8_t mod[4]; + struct nv_value *src0, *src1; + uint baseop = NV_BASEOP(nvi->opcode); next = nvi->next; src0 = nvc0_pc_find_immediate(nvi->src[0]); src1 = nvc0_pc_find_immediate(nvi->src[1]); - if (src0 && src1) + if (src0 && src1) { constant_expression(ctx->pc, nvi, src0, src1); - else { + } else { if (src0) constant_operand(ctx->pc, nvi, src0, 0); else @@ -633,44 +691,13 @@ nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b) constant_operand(ctx->pc, nvi, src1, 1); } - /* check if we can MUL + ADD -> MAD/FMA */ - if (nvi->opcode != NV_OP_ADD) - continue; - - src0 = nvi->src[0]->value; - src1 = nvi->src[1]->value; - - if (SRC_IS_MUL(src0) && src0->refc == 1) - src = src0; - else - if (SRC_IS_MUL(src1) && src1->refc == 1) - src = src1; + if (baseop == NV_OP_MIN || baseop == NV_OP_MAX) + handle_min_max(ctx, nvi); else - continue; - - /* could have an immediate from above constant_* */ - if (src0->reg.file != NV_FILE_GPR || src1->reg.file != NV_FILE_GPR) - continue; - s = (src == src0) ? 0 : 1; - - mod[0] = nvi->src[0]->mod; - mod[1] = nvi->src[1]->mod; - mod[2] = src->insn->src[0]->mod; - mod[3] = src->insn->src[0]->mod; - - if ((mod[0] | mod[1] | mod[2] | mod[3]) & ~NV_MOD_NEG) - continue; - - nvi->opcode = NV_OP_MAD; - nv_reference(ctx->pc, nvi, s, NULL); - nvi->src[2] = nvi->src[!s]; - - nvi->src[0] = new_ref(ctx->pc, src->insn->src[0]->value); - nvi->src[1] = new_ref(ctx->pc, src->insn->src[1]->value); - nvi->src[0]->mod = mod[2] ^ mod[s]; - nvi->src[1]->mod = mod[3]; + if (nvi->opcode == NV_OP_ADD_F32) + handle_add_mul(ctx, nvi); } - DESCEND_ARBITRARY(j, nv_pass_lower_arith); + DESCEND_ARBITRARY(j, nv_pass_algebraic_opt); return 0; } @@ -1158,11 +1185,17 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root) pass.n = 0; pass.pc = pc; + /* Do CSE so we can just compare values by pointer in subsequent passes. */ + pc->pass_seq++; + ret = nv_pass_cse(&pass, root); + if (ret) + return ret; + /* Do this first, so we don't have to pay attention * to whether sources are supported memory loads. */ pc->pass_seq++; - ret = nv_pass_lower_arith(&pass, root); + ret = nv_pass_algebraic_opt(&pass, root); if (ret) return ret; @@ -1190,11 +1223,9 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root) reldelim->pc = pc; } - pc->pass_seq++; - ret = nv_pass_cse(&pass, root); - if (ret) - return ret; - + /* May run DCE before load-combining since that pass will clean up + * after itself. + */ dce.pc = pc; do { dce.removed = 0; |