/*
 * Copyright (C) 2018 Jonathan Marek
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek
 */

#include "ir2_private.h"

static bool scalar_possible(struct ir2_instr *instr)
{
	if (instr->alu.scalar_opc == SCALAR_NONE)
		return false;

	return src_ncomp(instr) == 1;
}

static bool is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
{
	if (!a)
		return true;

	/* don't use the same instruction twice */
	if (a == b)
		return false;

	/* PRED_SET must be alone */
	if (b->alu.scalar_opc >= PRED_SETEs &&
			b->alu.scalar_opc <= PRED_SET_RESTOREs)
		return false;

	/* must write to same export (issues otherwise?) */
	return a->alu.export == b->alu.export;
}

/* priority of vector instruction for scheduling (lower=higher prio) */
static unsigned alu_vector_prio(struct ir2_instr *instr)
{
	if (instr->alu.vector_opc == VECTOR_NONE)
		return ~0u;

	if (is_export(instr))
		return 4;

	/* TODO check src type and ncomps */
	if (instr->src_count == 3)
		return 0;

	if (!scalar_possible(instr))
		return 1;

	return instr->src_count == 2 ? 2 : 3;
}

/* priority of scalar instruction for scheduling (lower=higher prio) */
static unsigned alu_scalar_prio(struct ir2_instr *instr)
{
	if (!scalar_possible(instr))
		return ~0u;

	/* this case is dealt with later */
	if (instr->src_count > 1)
		return ~0u;

	if (is_export(instr))
		return 4;

	/* PRED to end of block */
	if (instr->alu.scalar_opc >= PRED_SETEs &&
			instr->alu.scalar_opc <= PRED_SET_RESTOREs)
		return 5;

	/* scalar-only instructions have highest priority */
	return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
}
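/* Each scheduled slot (struct ir2_sched_instr) holds a vector ALU op (instr)
 * and an optional co-issued scalar ALU op (instr_s).  The two priority
 * functions above rank candidates for the vector and scalar slots; sched_next()
 * below uses them to pick a compatible pair for each slot.
 */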
/* this is a bit messy:
 * we want to find a slot where we can insert a scalar MOV with
 * a vector instruction that was already scheduled
 */
static struct ir2_sched_instr*
insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
		struct ir2_src src1, unsigned *comp)
{
	struct ir2_sched_instr *sched = NULL, *s;
	unsigned i, mask = 0xf;

	/* find the earliest point where the mov can be inserted */
	for (i = ctx->instr_sched_count-1; i > 0; i--) {
		s = &ctx->instr_sched[i - 1];

		if (s->instr && s->instr->block_idx != block_idx)
			break;
		if (s->instr_s && s->instr_s->block_idx != block_idx)
			break;

		if (src1.type == IR2_SRC_SSA) {
			if ((s->instr && s->instr->idx == src1.num) ||
					(s->instr_s && s->instr_s->idx == src1.num))
				break;
		}

		unsigned mr = ~(s->reg_state[reg_idx/8] >> reg_idx%8*4 & 0xf);
		if ((mask & mr) == 0)
			break;

		mask &= mr;
		if (s->instr_s || s->instr->src_count == 3)
			continue;

		if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
			continue;

		sched = s;
	}
	*comp = ffs(mask) - 1;

	if (sched) {
		for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++)
			s->reg_state[reg_idx/8] |= 1 << (*comp+reg_idx%8*4);
	}

	return sched;
}

/* case1:
 * in this case, insert a mov to place the 2nd src into the same reg
 * (scalar sources come from the same register)
 *
 * this is a common case which works when one of the srcs is input/const,
 * but for instrs which have 2 ssa/reg srcs it's not ideal
 */
static bool
scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
{
	struct ir2_src src0 = instr->src[ order];
	struct ir2_src src1 = instr->src[!order];
	struct ir2_sched_instr *sched;
	struct ir2_instr *ins;
	struct ir2_reg *reg;
	unsigned idx, comp;

	switch (src0.type) {
	case IR2_SRC_CONST:
	case IR2_SRC_INPUT:
		return false;
	default:
		break;
	}

	/* TODO, insert needs logic for this */
	if (src1.type == IR2_SRC_REG)
		return false;

	/* we could do something if they match src1.. */
	if (src0.negate || src0.abs)
		return false;

	reg = get_reg_src(ctx, &src0);

	/* result not used anymore since we will overwrite */
	for (int i = 0; i < 4; i++)
		if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
			return false;

	/* find a place to insert the mov */
	sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
	if (!sched)
		return false;

	ins = &ctx->instr[idx = ctx->instr_count++];
	ins->idx = idx;
	ins->type = IR2_ALU;
	ins->src[0] = src1;
	ins->src_count = 1;
	ins->is_ssa = true;
	ins->ssa.idx = reg->idx;
	ins->ssa.ncomp = 1;
	ins->ssa.comp[0].c = comp;
	ins->alu.scalar_opc = MAXs;
	ins->alu.export = -1;
	ins->alu.write_mask = 1;
	ins->pred = instr->pred;
	ins->block_idx = instr->block_idx;

	instr->src[0] = src0;
	instr->alu.src1_swizzle = comp;

	sched->instr_s = ins;
	return true;
}
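/* A hypothetical example of case1 (SSA/register names made up): for a 2-src
 * op with a scalar opcode, e.g. "mul ssa_9.x, ssa_4.x, ssa_7.x", where
 * ssa_4's register has a component that stays free over the relevant slots,
 * scalarize_case1() places a scalar MOV (MAXs) of ssa_7.x into that free
 * component in an earlier slot found by insert(), then points src1_swizzle
 * at it so both operands are read from src0's register and the op can
 * co-issue in the scalar slot.
 */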
/* fill sched with next fetch or (vector and/or scalar) alu instruction */
static int sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
{
	struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
	unsigned avail_count = 0;

	instr_alloc_type_t export = ~0u;
	int block_idx = -1;

	/* XXX merge this loop with the other one somehow? */
	ir2_foreach_instr(instr, ctx) {
		if (!instr->need_emit)
			continue;
		if (is_export(instr))
			export = MIN2(export, export_buf(instr->alu.export));
	}

	ir2_foreach_instr(instr, ctx) {
		if (!instr->need_emit)
			continue;

		/* don't mix exports */
		if (is_export(instr) && export_buf(instr->alu.export) != export)
			continue;

		if (block_idx < 0)
			block_idx = instr->block_idx;
		else if (block_idx != instr->block_idx || /* must be same block */
				instr->type == IR2_CF || /* CF/MEM must be alone */
				(is_export(instr) && export == SQ_MEMORY))
			break;
		/* it works because IR2_CF is always at end of block
		 * and somewhat same idea with MEM exports, which might not be alone
		 * but will end up in-order at least
		 */

		/* check if dependencies are satisfied */
		bool is_ok = true;
		ir2_foreach_src(src, instr) {
			if (src->type == IR2_SRC_REG) {
				/* need to check if all previous instructions in the block
				 * which write the reg have been emitted
				 * slow..
				 * XXX: check components instead of whole register
				 */
				struct ir2_reg *reg = get_reg_src(ctx, src);
				ir2_foreach_instr(p, ctx) {
					if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
						is_ok &= !p->need_emit;
				}
			} else if (src->type == IR2_SRC_SSA) {
				/* in this case it's easy, just check need_emit */
				is_ok &= !ctx->instr[src->num].need_emit;
			}
		}
		/* don't reorder non-ssa write before read */
		if (!instr->is_ssa) {
			ir2_foreach_instr(p, ctx) {
				if (!p->need_emit || p->idx >= instr->idx)
					continue;

				ir2_foreach_src(src, p) {
					if (get_reg_src(ctx, src) == instr->reg)
						is_ok = false;
				}
			}
		}

		/* don't reorder across predicates */
		if (avail_count && instr->pred != avail[0]->pred)
			is_ok = false;

		if (!is_ok)
			continue;

		avail[avail_count++] = instr;
	}

	if (!avail_count) {
		assert(block_idx == -1);
		return -1;
	}

	/* priority to FETCH instructions */
	ir2_foreach_avail(instr) {
		if (instr->type == IR2_ALU)
			continue;

		ra_src_free(ctx, instr);
		ra_reg(ctx, get_reg(instr), -1, false, 0);

		instr->need_emit = false;
		sched->instr = instr;
		sched->instr_s = NULL;
		return block_idx;
	}

	/* TODO precompute priorities */

	unsigned prio_v = ~0u, prio_s = ~0u, prio;
	ir2_foreach_avail(instr) {
		prio = alu_vector_prio(instr);
		if (prio < prio_v) {
			instr_v = instr;
			prio_v = prio;
		}
	}

	/* TODO can still insert scalar if src_count=3, if smart about it */
	if (!instr_v || instr_v->src_count < 3) {
		ir2_foreach_avail(instr) {
			bool compat = is_alu_compatible(instr_v, instr);

			prio = alu_scalar_prio(instr);
			if (prio >= prio_v && !compat)
				continue;

			if (prio < prio_s) {
				instr_s = instr;
				prio_s = prio;
				if (!compat)
					instr_v = NULL;
			}
		}
	}

	assert(instr_v || instr_s);

	/* now, we try more complex insertion of vector instruction as scalar
	 * TODO: if we are smart we can still insert if instr_v->src_count==3
	 */
	if (!instr_s && instr_v->src_count < 3) {
		ir2_foreach_avail(instr) {
			if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
				continue;

			/* at this point, src_count should always be 2 */
			assert(instr->src_count == 2);

			if (scalarize_case1(ctx, instr, 0)) {
				instr_s = instr;
				break;
			}
			if (scalarize_case1(ctx, instr, 1)) {
				instr_s = instr;
				break;
			}
		}
	}

	/* free src registers */
	if (instr_v) {
		instr_v->need_emit = false;
		ra_src_free(ctx, instr_v);
	}

	if (instr_s) {
		instr_s->need_emit = false;
		ra_src_free(ctx, instr_s);
	}

	/* allocate dst registers */
	if (instr_v)
		ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v), instr_v->alu.write_mask);

	if (instr_s)
		ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s), instr_s->alu.write_mask);

	sched->instr = instr_v;
	sched->instr_s = instr_s;
	return block_idx;
}
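/* schedule_instrs() below calls sched_next() in a loop until it returns -1
 * (nothing left to emit).  After each slot is filled, ctx->reg_state is
 * snapshotted into sched->reg_state; insert() walks these per-slot snapshots
 * to find a register component that stays free long enough for a scalarizing
 * MOV to be placed in an earlier slot.
 */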
/* scheduling: determine order of instructions */
static void schedule_instrs(struct ir2_context *ctx)
{
	struct ir2_sched_instr *sched;
	int block_idx;

	/* allocate input registers */
	for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
		if (ctx->input[idx].initialized)
			ra_reg(ctx, &ctx->input[idx], idx, false, 0);

	for (;;) {
		sched = &ctx->instr_sched[ctx->instr_sched_count++];
		block_idx = sched_next(ctx, sched);
		if (block_idx < 0)
			break;
		memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));

		/* catch texture fetch after scheduling and insert the
		 * SET_TEX_LOD right before it if necessary
		 * TODO clean this up
		 */
		struct ir2_instr *instr = sched->instr, *tex_lod;
		if (instr && instr->type == IR2_FETCH &&
				instr->fetch.opc == TEX_FETCH && instr->src_count == 2) {
			/* generate the SET_LOD instruction */
			tex_lod = &ctx->instr[ctx->instr_count++];
			tex_lod->type = IR2_FETCH;
			tex_lod->block_idx = instr->block_idx;
			tex_lod->pred = instr->pred;
			tex_lod->fetch.opc = TEX_SET_TEX_LOD;
			tex_lod->src[0] = instr->src[1];
			tex_lod->src_count = 1;

			sched[1] = sched[0];
			sched->instr = tex_lod;
			ctx->instr_sched_count++;
		}

		bool free_block = true;
		ir2_foreach_instr(instr, ctx)
			free_block &= instr->block_idx != block_idx;
		if (free_block)
			ra_block_free(ctx, block_idx);
	}
	ctx->instr_sched_count--;
}

void
ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
		struct fd2_shader_stateobj *fp)
{
	struct ir2_context ctx = { };
	bool binning = !fp && so->type == MESA_SHADER_VERTEX;

	if (fp)
		so->variant[variant].f = fp->variant[0].f;

	ctx.so = so;
	ctx.info = &so->variant[variant].info;
	ctx.f = &so->variant[variant].f;
	ctx.info->max_reg = -1;

	/* convert nir to internal representation */
	ir2_nir_compile(&ctx, binning);

	/* copy propagate srcs */
	cp_src(&ctx);

	/* get ref_counts and kill non-needed instructions */
	ra_count_refs(&ctx);

	/* remove movs used to write outputs */
	cp_export(&ctx);

	/* instruction order.. and vector->scalar conversions */
	schedule_instrs(&ctx);

	/* finally, assemble to bitcode */
	assemble(&ctx, binning);
}