/*
 * Copyright 2010 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "nvc0_pc.h"
#include "nvc0_program.h"

/* Recurse into the successors of @b that have not yet been visited in the
 * current pass (pass_seq prevents processing a block twice).
 */
#define DESCEND_ARBITRARY(j, f)                                         \
do {                                                                    \
   b->pass_seq = ctx->pc->pass_seq;                                     \
                                                                        \
   for (j = 0; j < 2; ++j)                                              \
      if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq)         \
         f(ctx, b->out[j]);                                             \
} while (0)

static INLINE boolean
registers_interfere(struct nv_value *a, struct nv_value *b)
{
   if (a->reg.file != b->reg.file)
      return FALSE;
   if (NV_IS_MEMORY_FILE(a->reg.file) || NV_IS_MEMORY_FILE(b->reg.file))
      return FALSE;

   assert(a->join->reg.id >= 0 && b->join->reg.id >= 0);

   if (a->join->reg.id < b->join->reg.id) {
      return (a->join->reg.id + a->reg.size >= b->join->reg.id);
   } else
   if (a->join->reg.id > b->join->reg.id) {
      return (b->join->reg.id + b->reg.size >= a->join->reg.id);
   }

   return FALSE;
}

static INLINE boolean
values_equal(struct nv_value *a, struct nv_value *b)
{
   if (a->reg.file != b->reg.file || a->reg.size != b->reg.size)
      return FALSE;
   if (NV_IS_MEMORY_FILE(a->reg.file))
      return a->reg.address == b->reg.address;
   else
      return a->join->reg.id == b->join->reg.id;
}

#if 0
static INLINE boolean
inst_commutation_check(struct nv_instruction *a, struct nv_instruction *b)
{
   int si, di;

   for (di = 0; di < 4 && a->def[di]; ++di)
      for (si = 0; si < 5 && b->src[si]; ++si)
         if (registers_interfere(a->def[di], b->src[si]->value))
            return FALSE;

   return TRUE;
}

/* Check whether we can swap the order of the instructions,
 * where a & b may be either the earlier or the later one.
 */
static boolean
inst_commutation_legal(struct nv_instruction *a, struct nv_instruction *b)
{
   return inst_commutation_check(a, b) && inst_commutation_check(b, a);
}
#endif

static INLINE boolean
inst_removable(struct nv_instruction *nvi)
{
   if (nvi->opcode == NV_OP_ST)
      return FALSE;
   return (!(nvi->terminator ||
             nvi->join ||
             nvi->target ||
             nvi->fixed ||
             nvc0_insn_refcount(nvi)));
}

/* Check if we do not actually have to emit this instruction.
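 * This covers UNDEF and BIND placeholders, values that did not receive a
 * register, and MOV/SELECT whose source and destination ended up in the
 * same register after allocation.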
 */
static INLINE boolean
inst_is_noop(struct nv_instruction *nvi)
{
   if (nvi->opcode == NV_OP_UNDEF || nvi->opcode == NV_OP_BIND)
      return TRUE;
   if (nvi->terminator || nvi->join)
      return FALSE;
   if (nvi->def[0] && nvi->def[0]->join->reg.id < 0)
      return TRUE;

   if (nvi->opcode != NV_OP_MOV && nvi->opcode != NV_OP_SELECT)
      return FALSE;
   if (nvi->def[0]->reg.file != nvi->src[0]->value->reg.file)
      return FALSE;

   if (nvi->src[0]->value->join->reg.id < 0) {
      NV50_DBGMSG(PROG_IR, "inst_is_noop: orphaned value detected\n");
      return TRUE;
   }

   if (nvi->opcode == NV_OP_SELECT)
      if (!values_equal(nvi->def[0], nvi->src[1]->value))
         return FALSE;
   return values_equal(nvi->def[0], nvi->src[0]->value);
}

struct nv_pass {
   struct nv_pc *pc;
   int n;
   void *priv;
};

static int
nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b);

static void
nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)
{
   struct nv_pc *pc = (struct nv_pc *)priv;
   struct nv_basic_block *in;
   struct nv_instruction *nvi, *next;
   int j;

   /* find first non-empty block emitted before b */
   for (j = pc->num_blocks - 1; j >= 0 && !pc->bb_list[j]->emit_size; --j);

   for (; j >= 0; --j) {
      in = pc->bb_list[j];

      /* check for no-op branches (BRA $PC+8) */
      if (in->exit && in->exit->opcode == NV_OP_BRA && in->exit->target == b) {
         in->emit_size -= 8;
         pc->emit_size -= 8;

         for (++j; j < pc->num_blocks; ++j)
            pc->bb_list[j]->emit_pos -= 8;

         nvc0_insn_delete(in->exit);
      }
      b->emit_pos = in->emit_pos + in->emit_size;

      if (in->emit_size) /* no more no-op branches to b */
         break;
   }
   pc->bb_list[pc->num_blocks++] = b;

   /* visit node */

   for (nvi = b->entry; nvi; nvi = next) {
      next = nvi->next;
      if (inst_is_noop(nvi) ||
          (pc->is_fragprog && nvi->opcode == NV_OP_EXPORT)) {
         nvc0_insn_delete(nvi);
      } else
         b->emit_size += 8;
   }
   pc->emit_size += b->emit_size;

#if NV50_DEBUG & NV50_DEBUG_PROG_IR
   if (!b->entry)
      debug_printf("BB:%i is now empty\n", b->id);
   else
      debug_printf("BB:%i size = %u\n", b->id, b->emit_size);
#endif
}

static int
nv_pc_pass2(struct nv_pc *pc, struct nv_basic_block *root)
{
   struct nv_pass pass;

   pass.pc = pc;

   pc->pass_seq++;
   nv_pass_flatten(&pass, root);

   nvc0_pc_pass_in_order(root, nv_pc_pass_pre_emission, pc);

   return 0;
}

int
nvc0_pc_exec_pass2(struct nv_pc *pc)
{
   int i, ret;

   NV50_DBGMSG(PROG_IR, "preparing %u blocks for emission\n", pc->num_blocks);

   pc->num_blocks = 0; /* will reorder bb_list */

   for (i = 0; i < pc->num_subroutines + 1; ++i)
      if (pc->root[i] && (ret = nv_pc_pass2(pc, pc->root[i])))
         return ret;
   return 0;
}

static INLINE boolean
is_cspace_load(struct nv_instruction *nvi)
{
   if (!nvi)
      return FALSE;
   assert(nvi->indirect != 0);
   return (nvi->opcode == NV_OP_LD &&
           nvi->src[0]->value->reg.file >= NV_FILE_MEM_C(0) &&
           nvi->src[0]->value->reg.file <= NV_FILE_MEM_C(15));
}

static INLINE boolean
is_immd32_load(struct nv_instruction *nvi)
{
   if (!nvi)
      return FALSE;
   return (nvi->opcode == NV_OP_MOV &&
           nvi->src[0]->value->reg.file == NV_FILE_IMM &&
           nvi->src[0]->value->reg.size == 4);
}

static INLINE void
check_swap_src_0_1(struct nv_instruction *nvi)
{
   struct nv_ref *src0 = nvi->src[0];
   struct nv_ref *src1 = nvi->src[1];

   if (!nv_op_commutative(nvi->opcode) &&
       NV_BASEOP(nvi->opcode) != NV_OP_SET &&
       NV_BASEOP(nvi->opcode) != NV_OP_SLCT)
      return;
   assert(src0 && src1 && src0->value && src1->value);

   if (src1->value->reg.file != NV_FILE_GPR)
      return;

   if (is_cspace_load(src0->value->insn)) {
      if (!is_cspace_load(src1->value->insn)) {
         nvi->src[0] = src1;
         nvi->src[1] = src0;
      }
   } else
   if (is_immd32_load(src0->value->insn)) {
      if (!is_cspace_load(src1->value->insn) &&
          !is_immd32_load(src1->value->insn)) {
         nvi->src[0] = src1;
         nvi->src[1] = src0;
      }
   }

   if (nvi->src[0] != src0) {
      if (NV_BASEOP(nvi->opcode) == NV_OP_SET)
         nvi->set_cond = nvc0_ir_reverse_cc(nvi->set_cond);
      else
      if (NV_BASEOP(nvi->opcode) == NV_OP_SLCT)
         nvi->set_cond = NV_CC_INVERSE(nvi->set_cond);
   }
}

static void
nvi_set_indirect_load(struct nv_pc *pc,
                      struct nv_instruction *nvi, struct nv_value *val)
{
   for (nvi->indirect = 0;
        nvi->indirect < 6 && nvi->src[nvi->indirect]; ++nvi->indirect);
   assert(nvi->indirect < 6);
   nv_reference(pc, nvi, nvi->indirect, val);
}

static int
nvc0_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b)
{
   struct nv_instruction *nvi, *ld;
   int s;

   for (nvi = b->entry; nvi; nvi = nvi->next) {
      check_swap_src_0_1(nvi);

      for (s = 0; s < 3 && nvi->src[s]; ++s) {
         ld = nvi->src[s]->value->insn;
         if (!ld || (ld->opcode != NV_OP_LD && ld->opcode != NV_OP_MOV))
            continue;
         if (!nvc0_insn_can_load(nvi, s, ld))
            continue;

         /* fold it ! */
         nv_reference(ctx->pc, nvi, s, ld->src[0]->value);
         if (ld->indirect >= 0)
            nvi_set_indirect_load(ctx->pc, nvi, ld->src[ld->indirect]->value);

         if (!nvc0_insn_refcount(ld))
            nvc0_insn_delete(ld);
      }
   }
   DESCEND_ARBITRARY(s, nvc0_pass_fold_loads);

   return 0;
}

/* NOTE: Assumes loads have not yet been folded. */
static int
nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b)
{
   struct nv_instruction *nvi, *mi, *next;
   int j;
   uint8_t mod;

   for (nvi = b->entry; nvi; nvi = next) {
      next = nvi->next;
      if (nvi->opcode == NV_OP_SUB) {
         nvi->src[1]->mod ^= NV_MOD_NEG;
         nvi->opcode = NV_OP_ADD;
      }

      for (j = 0; j < 3 && nvi->src[j]; ++j) {
         mi = nvi->src[j]->value->insn;
         if (!mi)
            continue;
         if (mi->def[0]->refc > 1 || mi->predicate >= 0)
            continue;

         if (NV_BASEOP(mi->opcode) == NV_OP_NEG) mod = NV_MOD_NEG;
         else
         if (NV_BASEOP(mi->opcode) == NV_OP_ABS) mod = NV_MOD_ABS;
         else
            continue;
         assert(!(mod & mi->src[0]->mod & NV_MOD_NEG));

         mod |= mi->src[0]->mod;

         if ((nvi->opcode == NV_OP_ABS) || (nvi->src[j]->mod & NV_MOD_ABS)) {
            /* abs neg [abs] = abs */
            mod &= ~(NV_MOD_NEG | NV_MOD_ABS);
         } else
         if ((nvi->opcode == NV_OP_NEG) && (mod & NV_MOD_NEG)) {
            /* neg as opcode and modifier on same insn cannot occur */
            /* neg neg abs = abs, neg neg = identity */
            assert(j == 0);
            if (mod & NV_MOD_ABS)
               nvi->opcode = NV_OP_ABS;
            else
               nvi->opcode = NV_OP_MOV;
            mod = 0;
         }

         if ((nv_op_supported_src_mods(nvi->opcode, j) & mod) != mod)
            continue;

         nv_reference(ctx->pc, nvi, j, mi->src[0]->value);

         nvi->src[j]->mod ^= mod;
      }

      if (nvi->opcode == NV_OP_SAT) {
         mi = nvi->src[0]->value->insn;

         if (mi->def[0]->refc > 1 ||
             (mi->opcode != NV_OP_ADD &&
              mi->opcode != NV_OP_MUL &&
              mi->opcode != NV_OP_MAD))
            continue;
         mi->saturate = 1;
         mi->def[0] = nvi->def[0];
         mi->def[0]->insn = mi;
         nvc0_insn_delete(nvi);
      }
   }
   DESCEND_ARBITRARY(j, nv_pass_lower_mods);

   return 0;
}

#define SRC_IS_MUL(s) ((s)->insn && (s)->insn->opcode == NV_OP_MUL)

static void
apply_modifiers(uint32_t *val, uint8_t type, uint8_t mod)
{
   if (mod & NV_MOD_ABS) {
      if (type == NV_TYPE_F32)
         *val &= 0x7fffffff;
      else
      if ((*val) & (1 << 31))
         *val = ~(*val) + 1;
   }
   if (mod & NV_MOD_NEG) {
      if (type == NV_TYPE_F32)
         *val ^= 0x80000000;
      else
         *val = ~(*val) + 1;
   }
   if (mod & NV_MOD_SAT) {
      union {
         float f;
         uint32_t u;
         int32_t i;
      } u;
      u.u = *val;
      if (type == NV_TYPE_F32) {
         u.f = CLAMP(u.f, -1.0f, 1.0f);
      } else
      if (type == NV_TYPE_U16) {
         u.u = MIN2(u.u, 0xffff);
      } else
      if (type == NV_TYPE_S16) {
         u.i = CLAMP(u.i, -32768, 32767);
      }
      *val = u.u;
   }
   if (mod & NV_MOD_NOT)
      *val = ~*val;
}

static void
constant_expression(struct nv_pc *pc, struct nv_instruction *nvi, struct
                    nv_value *src0, struct nv_value *src1)
{
   struct nv_value *val;
   union {
      float f32;
      uint32_t u32;
      int32_t s32;
   } u0, u1, u;
   ubyte type;

   if (!nvi->def[0])
      return;
   type = NV_OPTYPE(nvi->opcode);

   u.u32 = 0;
   u0.u32 = src0->reg.imm.u32;
   u1.u32 = src1->reg.imm.u32;

   apply_modifiers(&u0.u32, type, nvi->src[0]->mod);
   apply_modifiers(&u1.u32, type, nvi->src[1]->mod);

   switch (nvi->opcode) {
   case NV_OP_MAD_F32:
      if (nvi->src[2]->value->reg.file != NV_FILE_GPR)
         return;
      /* fall through */
   case NV_OP_MUL_F32:
      u.f32 = u0.f32 * u1.f32;
      break;
   case NV_OP_MUL_B32:
      u.u32 = u0.u32 * u1.u32;
      break;
   case NV_OP_ADD_F32:
      u.f32 = u0.f32 + u1.f32;
      break;
   case NV_OP_ADD_B32:
      u.u32 = u0.u32 + u1.u32;
      break;
   case NV_OP_SUB_F32:
      u.f32 = u0.f32 - u1.f32;
      break;
   /* case NV_OP_SUB_B32: u.u32 = u0.u32 - u1.u32; break; */
   default:
      return;
   }

   val = new_value(pc, NV_FILE_IMM, nv_type_sizeof(type));
   val->reg.imm.u32 = u.u32;

   nv_reference(pc, nvi, 1, NULL);
   nv_reference(pc, nvi, 0, val);

   if (nvi->opcode == NV_OP_MAD_F32) {
      nvi->src[1] = nvi->src[0];
      nvi->src[0] = nvi->src[2];
      nvi->src[2] = NULL;
      nvi->opcode = NV_OP_ADD_F32;

      if (val->reg.imm.u32 == 0) {
         nvi->src[1] = NULL;
         nvi->opcode = NV_OP_MOV;
      }
   } else {
      nvi->opcode = NV_OP_MOV;
   }
}

static void
constant_operand(struct nv_pc *pc,
                 struct nv_instruction *nvi, struct nv_value *val, int s)
{
   union {
      float f32;
      uint32_t u32;
      int32_t s32;
   } u;
   int shift;
   int t = s ? 0 : 1;
   uint op;
   ubyte type;

   if (!nvi->def[0])
      return;
   type = NV_OPTYPE(nvi->opcode);

   u.u32 = val->reg.imm.u32;
   apply_modifiers(&u.u32, type, nvi->src[s]->mod);

   if (u.u32 == 0 && NV_BASEOP(nvi->opcode) == NV_OP_MUL) {
      nvi->opcode = NV_OP_MOV;
      nv_reference(pc, nvi, t, NULL);
      if (s) {
         nvi->src[0] = nvi->src[1];
         nvi->src[1] = NULL;
      }
      return;
   }

   switch (nvi->opcode) {
   case NV_OP_MUL_F32:
      if (u.f32 == 1.0f || u.f32 == -1.0f) {
         if (u.f32 == -1.0f)
            nvi->src[t]->mod ^= NV_MOD_NEG;
         switch (nvi->src[t]->mod) {
         case 0: op = nvi->saturate ? NV_OP_SAT : NV_OP_MOV; break;
         case NV_MOD_NEG: op = NV_OP_NEG_F32; break;
         case NV_MOD_ABS: op = NV_OP_ABS_F32; break;
         default:
            return;
         }
         nvi->opcode = op;
         nv_reference(pc, nvi, 0, nvi->src[t]->value);
         nv_reference(pc, nvi, 1, NULL);
         nvi->src[0]->mod = 0;
      } else
      if (u.f32 == 2.0f || u.f32 == -2.0f) {
         if (u.f32 == -2.0f)
            nvi->src[t]->mod ^= NV_MOD_NEG;
         nvi->opcode = NV_OP_ADD_F32;
         nv_reference(pc, nvi, s, nvi->src[t]->value);
         nvi->src[s]->mod = nvi->src[t]->mod;
      }
      break;
   case NV_OP_ADD_F32:
      if (u.u32 == 0) {
         switch (nvi->src[t]->mod) {
         case 0: op = nvi->saturate ? NV_OP_SAT : NV_OP_MOV; break;
         case NV_MOD_NEG: op = NV_OP_NEG_F32; break;
         case NV_MOD_ABS: op = NV_OP_ABS_F32; break;
         case NV_MOD_NEG | NV_MOD_ABS:
            op = NV_OP_CVT;
            nvi->ext.cvt.s = nvi->ext.cvt.d = type;
            break;
         default:
            return;
         }
         nvi->opcode = op;
         nv_reference(pc, nvi, 0, nvi->src[t]->value);
         nv_reference(pc, nvi, 1, NULL);
         if (nvi->opcode != NV_OP_CVT)
            nvi->src[0]->mod = 0;
      }
      break;
   case NV_OP_ADD_B32:
      if (u.u32 == 0) {
         assert(nvi->src[t]->mod == 0);
         nvi->opcode = nvi->saturate ?
            NV_OP_CVT : NV_OP_MOV;
         nvi->ext.cvt.s = nvi->ext.cvt.d = type;
         nv_reference(pc, nvi, 0, nvi->src[t]->value);
         nv_reference(pc, nvi, 1, NULL);
      }
      break;
   case NV_OP_MUL_B32:
      /* multiplication by 0 already handled above */
      assert(nvi->src[s]->mod == 0);
      shift = ffs(u.s32) - 1;
      if (shift == 0) {
         nvi->opcode = NV_OP_MOV;
         nv_reference(pc, nvi, 0, nvi->src[t]->value);
         nv_reference(pc, nvi, 1, NULL);
      } else
      if (u.s32 > 0 && u.s32 == (1 << shift)) {
         nvi->opcode = NV_OP_SHL;
         (val = new_value(pc, NV_FILE_IMM, 4))->reg.imm.s32 = shift;
         nv_reference(pc, nvi, 0, nvi->src[t]->value);
         nv_reference(pc, nvi, 1, val);
         break;
      }
      break;
   case NV_OP_RCP:
      u.f32 = 1.0f / u.f32;
      (val = new_value(pc, NV_FILE_IMM, 4))->reg.imm.f32 = u.f32;
      nvi->opcode = NV_OP_MOV;
      assert(s == 0);
      nv_reference(pc, nvi, 0, val);
      break;
   case NV_OP_RSQ:
      u.f32 = 1.0f / sqrtf(u.f32);
      (val = new_value(pc, NV_FILE_IMM, 4))->reg.imm.f32 = u.f32;
      nvi->opcode = NV_OP_MOV;
      assert(s == 0);
      nv_reference(pc, nvi, 0, val);
      break;
   default:
      break;
   }
}

static void
handle_min_max(struct nv_pass *ctx, struct nv_instruction *nvi)
{
   struct nv_value *src0 = nvi->src[0]->value;
   struct nv_value *src1 = nvi->src[1]->value;

   if (src0 != src1 || (nvi->src[0]->mod | nvi->src[1]->mod))
      return;
   if (src0->reg.file != NV_FILE_GPR)
      return;
   nvc0_pc_replace_value(ctx->pc, nvi->def[0], src0);
   nvc0_insn_delete(nvi);
}

/* check if we can MUL + ADD -> MAD/FMA */
static void
handle_add_mul(struct nv_pass *ctx, struct nv_instruction *nvi)
{
   struct nv_value *src0 = nvi->src[0]->value;
   struct nv_value *src1 = nvi->src[1]->value;
   struct nv_value *src;
   int s;
   uint8_t mod[4];

   if (SRC_IS_MUL(src0) && src0->refc == 1) s = 0;
   else
   if (SRC_IS_MUL(src1) && src1->refc == 1) s = 1;
   else
      return;

   if ((src0->insn && src0->insn->bb != nvi->bb) ||
       (src1->insn && src1->insn->bb != nvi->bb))
      return;

   /* check for immediates from prior constant folding */
   if (src0->reg.file != NV_FILE_GPR || src1->reg.file != NV_FILE_GPR)
      return;
   src = nvi->src[s]->value;

   mod[0] = nvi->src[0]->mod;
   mod[1] = nvi->src[1]->mod;
   mod[2] = src->insn->src[0]->mod;
   mod[3] = src->insn->src[1]->mod;

   if ((mod[0] | mod[1] | mod[2] | mod[3]) & ~NV_MOD_NEG)
      return;

   nvi->opcode = NV_OP_MAD_F32;

   nv_reference(ctx->pc, nvi, s, NULL);
   nvi->src[2] = nvi->src[!s];
   nvi->src[!s] = NULL;

   nv_reference(ctx->pc, nvi, 0, src->insn->src[0]->value);
   nvi->src[0]->mod = mod[2] ^ mod[s];
   nv_reference(ctx->pc, nvi, 1, src->insn->src[1]->value);
   nvi->src[1]->mod = mod[3];
}

static int
nv_pass_algebraic_opt(struct nv_pass *ctx, struct nv_basic_block *b)
{
   struct nv_instruction *nvi, *next;
   int j;

   for (nvi = b->entry; nvi; nvi = next) {
      struct nv_value *src0, *src1;
      uint baseop = NV_BASEOP(nvi->opcode);

      next = nvi->next;

      src0 = nvc0_pc_find_immediate(nvi->src[0]);
      src1 = nvc0_pc_find_immediate(nvi->src[1]);

      if (src0 && src1) {
         constant_expression(ctx->pc, nvi, src0, src1);
      } else {
         if (src0)
            constant_operand(ctx->pc, nvi, src0, 0);
         else
         if (src1)
            constant_operand(ctx->pc, nvi, src1, 1);
      }

      if (baseop == NV_OP_MIN || baseop == NV_OP_MAX)
         handle_min_max(ctx, nvi);
      else
      if (nvi->opcode == NV_OP_ADD_F32)
         handle_add_mul(ctx, nvi);
   }
   DESCEND_ARBITRARY(j, nv_pass_algebraic_opt);

   return 0;
}

/* TODO: redundant store elimination */

struct mem_record {
   struct mem_record *next;
   struct nv_instruction *insn;
   uint32_t ofst;
   uint32_t base;
   uint32_t size;
};

#define MEM_RECORD_POOL_SIZE 1024

struct pass_reld_elim {
   struct nv_pc *pc;

   struct mem_record *imm;
   struct mem_record *mem_v;
   struct mem_record *mem_a;
   struct mem_record *mem_c[16];
   struct mem_record *mem_l;

   struct
   mem_record pool[MEM_RECORD_POOL_SIZE];
   int alloc;
};

/* Extend the load operation in @rec to also cover the data loaded by @ld.
 * The two loads must not overlap, but must reference adjacent memory
 * locations.
 */
static void
combine_load(struct nv_pc *pc, struct mem_record *rec,
             struct nv_instruction *ld)
{
   struct nv_instruction *fv = rec->insn;
   struct nv_value *mem = ld->src[0]->value;
   uint32_t size = rec->size + mem->reg.size;
   int j;
   int d = rec->size / 4;

   assert(rec->size < 16);

   if (rec->ofst > mem->reg.address) {
      if ((size == 8 && mem->reg.address & 3) ||
          (size > 8 && mem->reg.address & 7))
         return;

      rec->ofst = mem->reg.address;
      for (j = 0; j < d; ++j)
         fv->def[mem->reg.size / 4 + j] = fv->def[j];
      d = 0;
   } else
   if ((size == 8 && rec->ofst & 3) ||
       (size > 8 && rec->ofst & 7)) {
      return;
   }

   for (j = 0; j < mem->reg.size / 4; ++j) {
      fv->def[d] = ld->def[j];
      fv->def[d++]->insn = fv;
   }

   if (fv->src[0]->value->refc > 1)
      nv_reference(pc, fv, 0, new_value_like(pc, fv->src[0]->value));
   fv->src[0]->value->reg.address = rec->ofst;
   fv->src[0]->value->reg.size = rec->size = size;

   nvc0_insn_delete(ld);
}

static void
combine_export(struct mem_record *rec, struct nv_instruction *ex)
{

}

static INLINE void
add_mem_record(struct pass_reld_elim *ctx, struct mem_record **rec,
               uint32_t base, uint32_t ofst, struct nv_instruction *nvi)
{
   struct mem_record *it = &ctx->pool[ctx->alloc++];

   it->next = *rec;
   *rec = it;
   it->base = base;
   it->ofst = ofst;
   it->insn = nvi;
   it->size = nvi->src[0]->value->reg.size;
}

/* vectorize and reuse loads from memory or of immediates */
static int
nv_pass_mem_opt(struct pass_reld_elim *ctx, struct nv_basic_block *b)
{
   struct mem_record **rec, *it;
   struct nv_instruction *ld, *next;
   struct nv_value *mem;
   uint32_t base, ofst;
   int s;

   for (ld = b->entry; ld; ld = next) {
      next = ld->next;

      if (is_cspace_load(ld)) {
         mem = ld->src[0]->value;
         rec = &ctx->mem_c[ld->src[0]->value->reg.file - NV_FILE_MEM_C(0)];
      } else
      if (ld->opcode == NV_OP_VFETCH) {
         mem = ld->src[0]->value;
         rec = &ctx->mem_a;
      } else
      if (ld->opcode == NV_OP_EXPORT) {
         mem = ld->src[0]->value;
         if (mem->reg.file != NV_FILE_MEM_V)
            continue;
         rec = &ctx->mem_v;
      } else {
         continue;
      }
      if (ld->def[0] && ld->def[0]->refc == 0)
         continue;
      ofst = mem->reg.address;
      base = (ld->indirect >= 0) ?
         ld->src[ld->indirect]->value->n : 0;

      for (it = *rec; it; it = it->next) {
         if (it->base == base &&
             ((it->ofst >> 4) == (ofst >> 4)) &&
             ((it->ofst + it->size == ofst) ||
              (it->ofst - mem->reg.size == ofst))) {
            /* only NV_OP_VFETCH can load exactly 12 bytes */
            if (ld->opcode == NV_OP_LD && it->size + mem->reg.size == 12)
               continue;
            if (it->ofst < ofst) {
               if ((it->ofst & 0xf) == 4)
                  continue;
            } else
            if ((ofst & 0xf) == 4)
               continue;
            break;
         }
      }
      if (it) {
         switch (ld->opcode) {
         case NV_OP_EXPORT: combine_export(it, ld); break;
         default:
            combine_load(ctx->pc, it, ld);
            break;
         }
      } else
      if (ctx->alloc < MEM_RECORD_POOL_SIZE) {
         add_mem_record(ctx, rec, base, ofst, ld);
      }
   }

   ctx->alloc = 0;
   ctx->mem_a = ctx->mem_v = ctx->mem_l = NULL;
   for (s = 0; s < 16; ++s)
      ctx->mem_c[s] = NULL;

   DESCEND_ARBITRARY(s, nv_pass_mem_opt);
   return 0;
}

static void
eliminate_store(struct mem_record *rec, struct nv_instruction *st)
{
}

/* elimination of redundant stores */
static int
pass_store_elim(struct pass_reld_elim *ctx, struct nv_basic_block *b)
{
   struct mem_record **rec, *it;
   struct nv_instruction *st, *next;
   struct nv_value *mem;
   uint32_t base, ofst, size;
   int s;

   for (st = b->entry; st; st = next) {
      next = st->next;

      if (st->opcode == NV_OP_ST) {
         mem = st->src[0]->value;
         rec = &ctx->mem_l;
      } else
      if (st->opcode == NV_OP_EXPORT) {
         mem = st->src[0]->value;
         if (mem->reg.file != NV_FILE_MEM_V)
            continue;
         rec = &ctx->mem_v;
      } else {
         /* TODO: purge */
         continue;
      }

      ofst = mem->reg.address;
      base = (st->indirect >= 0) ? st->src[st->indirect]->value->n : 0;
      size = mem->reg.size;

      for (it = *rec; it; it = it->next) {
         if (it->base == base &&
             (it->ofst <= ofst && (it->ofst + size) > ofst))
            break;
      }
      if (it)
         eliminate_store(it, st);
      else
         add_mem_record(ctx, rec, base, ofst, st);
   }

   DESCEND_ARBITRARY(s, nv_pass_mem_opt);
   return 0;
}

/* TODO: properly handle loads from l[] memory in the presence of stores */
static int
nv_pass_reload_elim(struct pass_reld_elim *ctx, struct nv_basic_block *b)
{
#if 0
   struct load_record **rec, *it;
   struct nv_instruction *ld, *next;
   uint64_t data[2];
   struct nv_value *val;
   int j;

   for (ld = b->entry; ld; ld = next) {
      next = ld->next;
      if (!ld->src[0])
         continue;
      val = ld->src[0]->value;
      rec = NULL;

      if (ld->opcode == NV_OP_LINTERP || ld->opcode == NV_OP_PINTERP) {
         data[0] = val->reg.id;
         data[1] = 0;
         rec = &ctx->mem_v;
      } else
      if (ld->opcode == NV_OP_LDA) {
         data[0] = val->reg.id;
         data[1] = ld->src[4] ?
            ld->src[4]->value->n : ~0ULL;

         if (val->reg.file >= NV_FILE_MEM_C(0) &&
             val->reg.file <= NV_FILE_MEM_C(15))
            rec = &ctx->mem_c[val->reg.file - NV_FILE_MEM_C(0)];
         else
         if (val->reg.file == NV_FILE_MEM_S)
            rec = &ctx->mem_s;
         else
         if (val->reg.file == NV_FILE_MEM_L)
            rec = &ctx->mem_l;
      } else
      if ((ld->opcode == NV_OP_MOV) && (val->reg.file == NV_FILE_IMM)) {
         data[0] = val->reg.imm.u32;
         data[1] = 0;
         rec = &ctx->imm;
      }

      if (!rec || !ld->def[0]->refc)
         continue;

      for (it = *rec; it; it = it->next)
         if (it->data[0] == data[0] && it->data[1] == data[1])
            break;

      if (it) {
         if (ld->def[0]->reg.id >= 0)
            it->value = ld->def[0];
         else
         if (!ld->fixed)
            nvc0_pc_replace_value(ctx->pc, ld->def[0], it->value);
      } else {
         if (ctx->alloc == LOAD_RECORD_POOL_SIZE)
            continue;
         it = &ctx->pool[ctx->alloc++];
         it->next = *rec;
         it->data[0] = data[0];
         it->data[1] = data[1];
         it->value = ld->def[0];
         *rec = it;
      }
   }

   ctx->imm = NULL;
   ctx->mem_s = NULL;
   ctx->mem_v = NULL;
   for (j = 0; j < 16; ++j)
      ctx->mem_c[j] = NULL;
   ctx->mem_l = NULL;
   ctx->alloc = 0;

   DESCEND_ARBITRARY(j, nv_pass_reload_elim);
#endif
   return 0;
}

static int
nv_pass_tex_mask(struct nv_pass *ctx, struct nv_basic_block *b)
{
   int i, c, j;

   for (i = 0; i < ctx->pc->num_instructions; ++i) {
      struct nv_instruction *nvi = &ctx->pc->instructions[i];
      struct nv_value *def[4];

      if (!nv_is_texture_op(nvi->opcode))
         continue;
      nvi->tex_mask = 0;

      for (c = 0; c < 4; ++c) {
         if (nvi->def[c]->refc)
            nvi->tex_mask |= 1 << c;
         def[c] = nvi->def[c];
      }
      j = 0;
      for (c = 0; c < 4; ++c)
         if (nvi->tex_mask & (1 << c))
            nvi->def[j++] = def[c];
      for (c = 0; c < 4; ++c)
         if (!(nvi->tex_mask & (1 << c)))
            nvi->def[j++] = def[c];
      assert(j == 4);
   }
   return 0;
}

struct nv_pass_dce {
   struct nv_pc *pc;
   uint removed;
};

static int
nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b)
{
   int j;
   struct nv_instruction *nvi, *next;

   for (nvi = b->phi ? b->phi : b->entry; nvi; nvi = next) {
      next = nvi->next;

      if (inst_removable(nvi)) {
         nvc0_insn_delete(nvi);
         ++ctx->removed;
      }
   }
   DESCEND_ARBITRARY(j, nv_pass_dce);

   return 0;
}

/* Register allocation inserted ELSE blocks for all IF/ENDIF without ELSE.
 * Returns TRUE if @bb initiates an IF/ELSE/ENDIF clause, or is an IF with
 * BREAK and dummy ELSE block.
 */
static INLINE boolean
bb_is_if_else_endif(struct nv_basic_block *bb)
{
   if (!bb->out[0] || !bb->out[1])
      return FALSE;

   if (bb->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) {
      return (bb->out[0]->out[1] == bb->out[1]->out[0] &&
              !bb->out[1]->out[1]);
   } else {
      return (bb->out[0]->out[0] == bb->out[1]->out[0] &&
              !bb->out[0]->out[1] &&
              !bb->out[1]->out[1]);
   }
}

/* Predicate instructions and delete any branch at the end if it is
 * not a break from a loop.
 */
static void
predicate_instructions(struct nv_pc *pc, struct nv_basic_block *b,
                       struct nv_value *pred, uint8_t cc)
{
   struct nv_instruction *nvi, *prev;
   int s;

   if (!b->entry)
      return;
   for (nvi = b->entry; nvi; nvi = nvi->next) {
      prev = nvi;
      if (inst_is_noop(nvi))
         continue;
      for (s = 0; nvi->src[s]; ++s);
      assert(s < 6);
      nvi->predicate = s;
      nvi->cc = cc;
      nv_reference(pc, nvi, nvi->predicate, pred);
   }
   if (prev->opcode == NV_OP_BRA &&
       b->out_kind[0] != CFG_EDGE_LOOP_LEAVE &&
       b->out_kind[1] != CFG_EDGE_LOOP_LEAVE)
      nvc0_insn_delete(prev);
}

static INLINE boolean
may_predicate_insn(struct nv_instruction *nvi, struct nv_value *pred)
{
   if (nvi->def[0] && values_equal(nvi->def[0], pred))
      return FALSE;
   return nvc0_insn_is_predicateable(nvi);
}

/* Transform IF/ELSE/ENDIF constructs into predicated instructions
 * where feasible.
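 * Feasible means both sides of the branch are short (fewer than 12
 * instructions each) and every instruction in them can take the predicate.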
 */
static int
nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
{
   struct nv_instruction *nvi;
   struct nv_value *pred;
   int k;
   int n0, n1; /* instruction counts of outgoing blocks */

   if (bb_is_if_else_endif(b)) {
      assert(b->exit && b->exit->opcode == NV_OP_BRA);

      assert(b->exit->predicate >= 0);
      pred = b->exit->src[b->exit->predicate]->value;

      n1 = n0 = 0;
      for (nvi = b->out[0]->entry; nvi; nvi = nvi->next, ++n0)
         if (!may_predicate_insn(nvi, pred))
            break;
      if (!nvi) {
         /* we're after register allocation, so there always is an ELSE block */
         for (nvi = b->out[1]->entry; nvi; nvi = nvi->next, ++n1)
            if (!may_predicate_insn(nvi, pred))
               break;
      }

      /* 12 is an arbitrary limit */
      if (!nvi && n0 < 12 && n1 < 12) {
         predicate_instructions(ctx->pc, b->out[0], pred, !b->exit->cc);
         predicate_instructions(ctx->pc, b->out[1], pred, b->exit->cc);

         nvc0_insn_delete(b->exit); /* delete the branch */

         /* and a potential joinat before it */
         if (b->exit && b->exit->opcode == NV_OP_JOINAT)
            nvc0_insn_delete(b->exit);

         /* remove join operations at the end of the conditional */
         k = (b->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) ? 1 : 0;
         if ((nvi = b->out[0]->out[k]->entry)) {
            nvi->join = 0;
            if (nvi->opcode == NV_OP_JOIN)
               nvc0_insn_delete(nvi);
         }
      }
   }
   DESCEND_ARBITRARY(k, nv_pass_flatten);

   return 0;
}

/* Tests instructions for equality, but independently of sources. */
static boolean
is_operation_equal(struct nv_instruction *a, struct nv_instruction *b)
{
   if (a->opcode != b->opcode)
      return FALSE;
   if (nv_is_texture_op(a->opcode)) {
      if (a->ext.tex.t != b->ext.tex.t ||
          a->ext.tex.s != b->ext.tex.s)
         return FALSE;
      if (a->tex_dim != b->tex_dim ||
          a->tex_array != b->tex_array ||
          a->tex_cube != b->tex_cube ||
          a->tex_shadow != b->tex_shadow ||
          a->tex_live != b->tex_live)
         return FALSE;
   } else
   if (a->opcode == NV_OP_CVT) {
      if (a->ext.cvt.s != b->ext.cvt.s ||
          a->ext.cvt.d != b->ext.cvt.d)
         return FALSE;
   } else
   if (NV_BASEOP(a->opcode) == NV_OP_SET ||
       NV_BASEOP(a->opcode) == NV_OP_SLCT) {
      if (a->set_cond != b->set_cond)
         return FALSE;
   } else
   if (a->opcode == NV_OP_LINTERP || a->opcode == NV_OP_PINTERP) {
      if (a->centroid != b->centroid ||
          a->flat != b->flat)
         return FALSE;
   }
   if (a->cc != b->cc)
      return FALSE;
   if (a->lanes != b->lanes ||
       a->patch != b->patch ||
       a->saturate != b->saturate)
      return FALSE;
   if (a->opcode == NV_OP_QUADOP) /* beware quadon ! */
      return FALSE;
   return TRUE;
}

/* local common subexpression elimination, stupid O(n^2) implementation */
static int
nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b)
{
   struct nv_instruction *ir, *ik, *next;
   struct nv_instruction *entry = b->phi ? b->phi : b->entry;
   int s, d;
   unsigned int reps;

   do {
      reps = 0;
      for (ir = entry; ir; ir = next) {
         next = ir->next;
         if (ir->fixed)
            continue;
         for (ik = entry; ik != ir; ik = ik->next) {
            if (!is_operation_equal(ir, ik))
               continue;
            if (!ir->def[0] || !ik->def[0])
               continue;

            if (ik->indirect != ir->indirect || ik->predicate != ir->predicate)
               continue;

            for (d = 0; d < 4; ++d) {
               if ((ir->def[d] ? 1 : 0) != (ik->def[d] ? 1 : 0))
                  break;
               if (ir->def[d]) {
                  if (!values_equal(ik->def[0], ir->def[0]))
                     break;
               } else {
                  d = 4;
                  break;
               }
            }
            if (d != 4)
               continue;

            for (s = 0; s < 5; ++s) {
               struct nv_value *a, *b;

               if ((ir->src[s] ? 1 : 0) != (ik->src[s] ?
                                            1 : 0))
                  break;
               if (!ir->src[s]) {
                  s = 5;
                  break;
               }

               if (ik->src[s]->mod != ir->src[s]->mod)
                  break;
               a = ik->src[s]->value;
               b = ir->src[s]->value;
               if (a == b)
                  continue;
               if (a->reg.file != b->reg.file ||
                   a->reg.id < 0 || /* this excludes memory loads/stores */
                   a->reg.id != b->reg.id)
                  break;
            }
            if (s == 5) {
               nvc0_insn_delete(ir);
               for (d = 0; d < 4 && ir->def[d]; ++d)
                  nvc0_pc_replace_value(ctx->pc, ir->def[d], ik->def[d]);
               ++reps;
               break;
            }
         }
      }
   } while (reps);

   DESCEND_ARBITRARY(s, nv_pass_cse);

   return 0;
}

/* Make sure all sources of an NV_OP_BIND are distinct, they need to occupy
 * neighbouring registers. CSE might have messed this up.
 * Just generate a MOV for each source to avoid conflicts if they're used in
 * multiple NV_OP_BIND at different positions.
 *
 * Add a dummy use of the pointer source of >= 8 byte loads after the load
 * to prevent it from being assigned a register which overlaps the load's
 * destination, which would produce random corruptions.
 */
static int
nv_pass_fixups(struct nv_pass *ctx, struct nv_basic_block *b)
{
   struct nv_value *val;
   struct nv_instruction *fix, *nvi, *next;
   int s;

   for (fix = b->entry; fix; fix = next) {
      next = fix->next;

      if (fix->opcode == NV_OP_LD) {
         if (fix->indirect >= 0 && fix->src[0]->value->reg.size >= 8) {
            nvi = nv_alloc_instruction(ctx->pc, NV_OP_UNDEF);
            nv_reference(ctx->pc, nvi, 0, fix->src[fix->indirect]->value);

            nvc0_insn_insert_after(fix, nvi);
         }
         continue;
      } else
      if (fix->opcode == NV_OP_BIND) {
         for (s = 0; s < 4 && fix->src[s]; ++s) {
            val = fix->src[s]->value;

            nvi = nv_alloc_instruction(ctx->pc, NV_OP_MOV);
            nvi->def[0] = new_value_like(ctx->pc, val);
            nvi->def[0]->insn = nvi;
            nv_reference(ctx->pc, nvi, 0, val);
            nv_reference(ctx->pc, fix, s, nvi->def[0]);

            nvc0_insn_insert_before(fix, nvi);
         }
      }
   }
   DESCEND_ARBITRARY(s, nv_pass_fixups);

   return 0;
}

static int
nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
{
   struct pass_reld_elim *reldelim;
   struct nv_pass pass;
   struct nv_pass_dce dce;
   int ret;

   pass.n = 0;
   pass.pc = pc;

   /* Do CSE so we can just compare values by pointer in subsequent passes. */
   pc->pass_seq++;
   ret = nv_pass_cse(&pass, root);
   if (ret)
      return ret;

   /* Do this first, so we don't have to pay attention
    * to whether sources are supported memory loads.
    */
   pc->pass_seq++;
   ret = nv_pass_algebraic_opt(&pass, root);
   if (ret)
      return ret;

   pc->pass_seq++;
   ret = nv_pass_lower_mods(&pass, root);
   if (ret)
      return ret;

   pc->pass_seq++;
   ret = nvc0_pass_fold_loads(&pass, root);
   if (ret)
      return ret;

   if (pc->opt_reload_elim) {
      reldelim = CALLOC_STRUCT(pass_reld_elim);
      reldelim->pc = pc;

      pc->pass_seq++;
      ret = nv_pass_reload_elim(reldelim, root);
      if (ret) {
         FREE(reldelim);
         return ret;
      }
      memset(reldelim, 0, sizeof(struct pass_reld_elim));
      reldelim->pc = pc;
   }

   /* May run DCE before load-combining since that pass will clean up
    * after itself.
    */
   dce.pc = pc;
   do {
      dce.removed = 0;
      pc->pass_seq++;
      ret = nv_pass_dce(&dce, root);
      if (ret)
         return ret;
   } while (dce.removed);

   if (pc->opt_reload_elim) {
      pc->pass_seq++;
      ret = nv_pass_mem_opt(reldelim, root);
      if (!ret) {
         memset(reldelim, 0, sizeof(struct pass_reld_elim));
         reldelim->pc = pc;

         pc->pass_seq++;
         ret = nv_pass_mem_opt(reldelim, root);
      }
      FREE(reldelim);
      if (ret)
         return ret;
   }

   ret = nv_pass_tex_mask(&pass, root);
   if (ret)
      return ret;

   pc->pass_seq++;
   ret = nv_pass_fixups(&pass, root);

   return ret;
}

int
nvc0_pc_exec_pass0(struct nv_pc *pc)
{
   int i, ret;

   for (i = 0; i < pc->num_subroutines + 1; ++i)
      if (pc->root[i] && (ret = nv_pc_pass0(pc, pc->root[i])))
         return ret;
   return 0;
}