/*
 * Copyright (C) 2018 Jonathan Marek
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek
 */

#include "ir2_private.h"

#include "freedreno_util.h"
#include "fd2_program.h"

static const nir_shader_compiler_options options = {
   .lower_fpow = true,
   .lower_flrp32 = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_fceil = true,
   .fuse_ffma = true,
   /* .fdot_replicates = true: the result is replicated, but enabling it
    * makes things worse
    */
   .lower_all_io_to_temps = true,
   .vertex_id_zero_based = true, /* it's not implemented anyway */
   .lower_bitops = true,
   .lower_rotate = true,
   .lower_vector_cmp = true,
};

const nir_shader_compiler_options *
ir2_get_compiler_options(void)
{
   return &options;
}

#define OPT(nir, pass, ...) ({                           \
   bool this_progress = false;                           \
   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);    \
   this_progress;                                        \
})
#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
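/* Run the standard NIR cleanup passes to a fixpoint: one pass frequently
 * exposes opportunities for another (e.g. constant folding can make a
 * branch trivially dead, dead-CF elimination then removes it, which in
 * turn exposes more copy propagation), so we loop until no pass reports
 * progress.
 */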
static void
ir2_optimize_loop(nir_shader *s)
{
   bool progress;
   do {
      progress = false;

      OPT_V(s, nir_lower_vars_to_ssa);
      progress |= OPT(s, nir_opt_copy_prop_vars);
      progress |= OPT(s, nir_copy_prop);
      progress |= OPT(s, nir_opt_dce);
      progress |= OPT(s, nir_opt_cse);
      /* progress |= OPT(s, nir_opt_gcm, true); */
      progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
      progress |= OPT(s, nir_opt_intrinsics);
      progress |= OPT(s, nir_opt_algebraic);
      progress |= OPT(s, nir_opt_constant_folding);
      progress |= OPT(s, nir_opt_dead_cf);
      if (OPT(s, nir_opt_trivial_continues)) {
         progress |= true;
         /* If nir_opt_trivial_continues makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
         OPT(s, nir_copy_prop);
         OPT(s, nir_opt_dce);
      }
      progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
      progress |= OPT(s, nir_opt_if, false);
      progress |= OPT(s, nir_opt_remove_phis);
      progress |= OPT(s, nir_opt_undef);
   } while (progress);
}

/* the trig workarounds are the same as ir3's, but we don't want to
 * include ir3
 */
bool ir3_nir_apply_trig_workarounds(nir_shader *shader);

int
ir2_optimize_nir(nir_shader *s, bool lower)
{
   struct nir_lower_tex_options tex_options = {
      .lower_txp = ~0u,
      .lower_rect = 0,
   };

   if (fd_mesa_debug & FD_DBG_DISASM) {
      debug_printf("----------------------\n");
      nir_print_shader(s, stdout);
      debug_printf("----------------------\n");
   }

   OPT_V(s, nir_lower_regs_to_ssa);
   OPT_V(s, nir_lower_vars_to_ssa);
   OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out);

   if (lower) {
      OPT_V(s, ir3_nir_apply_trig_workarounds);
      OPT_V(s, nir_lower_tex, &tex_options);
   }

   ir2_optimize_loop(s);

   OPT_V(s, nir_remove_dead_variables, nir_var_function_temp);
   OPT_V(s, nir_opt_sink, nir_move_const_undef);

   /* TODO we don't want to get shaders writing to depth for depth textures */
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      nir_foreach_variable(var, &s->outputs) {
         if (var->data.location == FRAG_RESULT_DEPTH)
            return -1;
      }
   }

   return 0;
}
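/* Constant sources are packed into const-file "immediates" and addressed
 * with a swizzle.  For example, if immediate slot 0 already holds
 * {2.0, 0.5} and a vec2 {0.5, 2.0} is requested, no new components are
 * allocated: the source is emitted as slot 0 with a .yx swizzle.
 */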
static struct ir2_src
load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
{
   struct fd2_shader_stateobj *so = ctx->so;
   unsigned imm_ncomp, swiz, idx, i, j;
   uint32_t *value = (uint32_t *) value_f;

   /* try to merge with existing immediate (TODO: try with neg) */
   for (idx = 0; idx < so->num_immediates; idx++) {
      swiz = 0;
      imm_ncomp = so->immediates[idx].ncomp;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            if (j == 4)
               break;
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      /* matched all components */
      if (i == ncomp)
         break;
   }

   /* need to allocate new immediate */
   if (idx == so->num_immediates) {
      swiz = 0;
      imm_ncomp = 0;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == ctx->so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      so->num_immediates++;
   }
   so->immediates[idx].ncomp = imm_ncomp;

   if (ncomp == 1)
      swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);

   return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
}

struct ir2_src
ir2_zero(struct ir2_context *ctx)
{
   return load_const(ctx, (float[]) {0.0f}, 1);
}

static void
update_range(struct ir2_context *ctx, struct ir2_reg *reg)
{
   if (!reg->initialized) {
      reg->initialized = true;
      reg->loop_depth = ctx->loop_depth;
   }

   if (ctx->loop_depth > reg->loop_depth) {
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
   } else {
      reg->loop_depth = ctx->loop_depth;
      reg->block_idx_free = -1;
   }

   /* for regs we want to free at the end of the loop in any case
    * XXX don't do this for ssa
    */
   if (reg->loop_depth)
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
}

static struct ir2_src
make_src(struct ir2_context *ctx, nir_src src)
{
   struct ir2_src res = {};
   struct ir2_reg *reg;

   nir_const_value *const_value = nir_src_as_const_value(src);

   if (const_value) {
      assert(src.is_ssa);
      float c[src.ssa->num_components];
      nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
      return load_const(ctx, c, src.ssa->num_components);
   }

   if (!src.is_ssa) {
      res.num = src.reg.reg->index;
      res.type = IR2_SRC_REG;
      reg = &ctx->reg[res.num];
   } else {
      assert(ctx->ssa_map[src.ssa->index] >= 0);
      res.num = ctx->ssa_map[src.ssa->index];
      res.type = IR2_SRC_SSA;
      reg = &ctx->instr[res.num].ssa;
   }

   update_range(ctx, reg);
   return res;
}

static void
set_index(struct ir2_context *ctx, nir_dest *dst, struct ir2_instr *instr)
{
   struct ir2_reg *reg = &instr->ssa;

   if (dst->is_ssa) {
      ctx->ssa_map[dst->ssa.index] = instr->idx;
   } else {
      assert(instr->is_ssa);
      reg = &ctx->reg[dst->reg.reg->index];

      instr->is_ssa = false;
      instr->reg = reg;
   }
   update_range(ctx, reg);
}

static struct ir2_instr *
ir2_instr_create(struct ir2_context *ctx, int type)
{
   struct ir2_instr *instr;

   instr = &ctx->instr[ctx->instr_count++];
   instr->idx = ctx->instr_count - 1;
   instr->type = type;
   instr->block_idx = ctx->block_idx;
   instr->pred = ctx->pred;
   instr->is_ssa = true;
   return instr;
}
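/* The a2xx ALU can co-issue one 4-wide vector op and one scalar op per
 * instruction.  The table below maps each NIR opcode to its scalar and/or
 * vector form; -1 marks a missing form (transcendentals are scalar-only,
 * dot products and conditional selects vector-only).
 */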
static struct ir2_instr *
instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
{
   /* emit_alu will fixup instrs that don't map directly */
   static const struct ir2_opc {
      int8_t scalar, vector;
   } nir_ir2_opc[nir_num_opcodes + 1] = {
      [0 ... nir_num_opcodes - 1] = {-1, -1},

      [nir_op_mov] = {MAXs, MAXv},
      [nir_op_fneg] = {MAXs, MAXv},
      [nir_op_fabs] = {MAXs, MAXv},
      [nir_op_fsat] = {MAXs, MAXv},
      [nir_op_fsign] = {-1, CNDGTEv},
      [nir_op_fadd] = {ADDs, ADDv},
      [nir_op_fsub] = {ADDs, ADDv},
      [nir_op_fmul] = {MULs, MULv},
      [nir_op_ffma] = {-1, MULADDv},
      [nir_op_fmax] = {MAXs, MAXv},
      [nir_op_fmin] = {MINs, MINv},
      [nir_op_ffloor] = {FLOORs, FLOORv},
      [nir_op_ffract] = {FRACs, FRACv},
      [nir_op_ftrunc] = {TRUNCs, TRUNCv},
      [nir_op_fdot2] = {-1, DOT2ADDv},
      [nir_op_fdot3] = {-1, DOT3v},
      [nir_op_fdot4] = {-1, DOT4v},
      [nir_op_sge] = {-1, SETGTEv},
      [nir_op_slt] = {-1, SETGTv},
      [nir_op_sne] = {-1, SETNEv},
      [nir_op_seq] = {-1, SETEv},
      [nir_op_fcsel] = {-1, CNDEv},
      [nir_op_frsq] = {RECIPSQ_IEEE, -1},
      [nir_op_frcp] = {RECIP_IEEE, -1},
      [nir_op_flog2] = {LOG_IEEE, -1},
      [nir_op_fexp2] = {EXP_IEEE, -1},
      [nir_op_fsqrt] = {SQRT_IEEE, -1},
      [nir_op_fcos] = {COS, -1},
      [nir_op_fsin] = {SIN, -1},
      /* no fsat, fneg, fabs since source mods deal with those */

/* so we can use this function with non-nir op */
#define ir2_op_cube nir_num_opcodes
      [ir2_op_cube] = {-1, CUBEv},
   };

   struct ir2_opc op = nir_ir2_opc[opcode];
   assert(op.vector >= 0 || op.scalar >= 0);

   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
   instr->alu.vector_opc = op.vector;
   instr->alu.scalar_opc = op.scalar;
   instr->alu.export = -1;
   instr->alu.write_mask = (1 << ncomp) - 1;
   instr->src_count = opcode == ir2_op_cube ?
      2 : nir_op_infos[opcode].num_inputs;
   instr->ssa.ncomp = ncomp;
   return instr;
}

static struct ir2_instr *
instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode,
                     uint8_t write_mask, struct ir2_instr *share_reg)
{
   struct ir2_instr *instr;
   struct ir2_reg *reg;

   reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
   reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);

   instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
   instr->alu.write_mask = write_mask;
   instr->reg = reg;
   instr->is_ssa = false;
   return instr;
}

static struct ir2_instr *
instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
{
   struct ir2_instr *instr;
   instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
   set_index(ctx, dst, instr);
   return instr;
}

static struct ir2_instr *
ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
                       instr_fetch_opc_t opc)
{
   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
   instr->fetch.opc = opc;
   instr->src_count = 1;
   instr->ssa.ncomp = nir_dest_num_components(*dst);
   set_index(ctx, dst, instr);
   return instr;
}

static struct ir2_src
make_src_noconst(struct ir2_context *ctx, nir_src src)
{
   struct ir2_instr *instr;

   if (nir_src_as_const_value(src)) {
      assert(src.is_ssa);
      instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
      instr->src[0] = make_src(ctx, src);
      return ir2_src(instr->idx, 0, IR2_SRC_SSA);
   }

   return make_src(ctx, src);
}
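/* Translate one NIR ALU instruction.  Most ops map directly through the
 * table in instr_create_alu; the switch at the end patches up the rest,
 * e.g. fsub becomes an ADD with the second source negated, and slt
 * becomes SETGT with its sources swapped (x < y == y > x).
 */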
static void
emit_alu(struct ir2_context *ctx, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   nir_dest *dst = &alu->dest.dest;
   struct ir2_instr *instr;
   struct ir2_src tmp;
   unsigned ncomp;

   /* get the number of dst components */
   if (dst->is_ssa) {
      ncomp = dst->ssa.num_components;
   } else {
      ncomp = 0;
      for (int i = 0; i < 4; i++)
         ncomp += !!(alu->dest.write_mask & 1 << i);
   }

   instr = instr_create_alu(ctx, alu->op, ncomp);
   set_index(ctx, dst, instr);
   instr->alu.saturate = alu->dest.saturate;
   instr->alu.write_mask = alu->dest.write_mask;

   for (int i = 0; i < info->num_inputs; i++) {
      nir_alu_src *src = &alu->src[i];

      /* compress swizzle with writemask when applicable */
      unsigned swiz = 0, j = 0;
      for (int i = 0; i < 4; i++) {
         if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
            continue;
         swiz |= swiz_set(src->swizzle[i], j++);
      }

      instr->src[i] = make_src(ctx, src->src);
      instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
      instr->src[i].negate = src->negate;
      instr->src[i].abs = src->abs;
   }

   /* workarounds for NIR ops that don't map directly to a2xx ops */
   switch (alu->op) {
   case nir_op_fneg:
      instr->src[0].negate = 1;
      break;
   case nir_op_fabs:
      instr->src[0].abs = 1;
      break;
   case nir_op_fsat:
      instr->alu.saturate = 1;
      break;
   case nir_op_slt:
      tmp = instr->src[0];
      instr->src[0] = instr->src[1];
      instr->src[1] = tmp;
      break;
   case nir_op_fcsel:
      tmp = instr->src[1];
      instr->src[1] = instr->src[2];
      instr->src[2] = tmp;
      break;
   case nir_op_fsub:
      instr->src[1].negate = !instr->src[1].negate;
      break;
   case nir_op_fdot2:
      instr->src_count = 3;
      instr->src[2] = ir2_zero(ctx);
      break;
   case nir_op_fsign: {
      /* we need an extra instruction to deal with the zero case */
      struct ir2_instr *tmp;

      /* tmp = x == 0 ? 0 : 1 */
      tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
      tmp->src[0] = instr->src[0];
      tmp->src[1] = ir2_zero(ctx);
      tmp->src[2] = load_const(ctx, (float[]) {1.0f}, 1);

      /* result = x >= 0 ? tmp : -tmp */
      instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[2] = instr->src[1];
      instr->src[2].negate = true;
      instr->src_count = 3;
   } break;
   default:
      break;
   }
}

static void
load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
{
   struct ir2_instr *instr;
   int slot = -1;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      instr = ir2_instr_create_fetch(ctx, dst, 0);
      instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
      instr->fetch.vtx.const_idx = 20 + (idx / 3);
      instr->fetch.vtx.const_idx_sel = idx % 3;
      return;
   }

   /* get slot from idx */
   nir_foreach_variable(var, &ctx->nir->inputs) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot >= 0);

   switch (slot) {
   case VARYING_SLOT_PNTC:
      /* need to extract with abs and invert y */
      instr = instr_create_alu_dest(ctx, nir_op_ffma, dst);
      instr->src[0] = ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW,
                              IR2_SRC_INPUT);
      instr->src[0].abs = true;
      instr->src[1] = load_const(ctx, (float[]) {1.0f, -1.0f}, 2);
      instr->src[2] = load_const(ctx, (float[]) {0.0f, 1.0f}, 2);
      break;
   case VARYING_SLOT_POS:
      /* need to extract xy with abs and add tile offset on a20x
       * zw from fragcoord input (w inverted in fragment shader)
       * TODO: only components that are required by fragment shader
       */
      instr = instr_create_alu_reg(ctx,
                                   ctx->so->is_a20x ? nir_op_fadd : nir_op_mov,
                                   3, NULL);
      instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
      instr->src[0].abs = true;
      /* on a20x, C64 contains the tile offset */
      instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);

      instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);

      unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
      instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
      instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      break;
   default:
      instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
      break;
   }
}

static unsigned
output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   int slot = -1;
   unsigned idx = nir_intrinsic_base(intr);
   nir_foreach_variable(var, &ctx->nir->outputs) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot != -1);
   return slot;
}
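/* Vertex shader outputs are written as exports: position goes to export 62
 * and point size to export 63 (hardware export slots), while generic
 * varyings are matched by slot against the linked fragment shader's input
 * table so that both stages agree on the export index.
 */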
static void
store_output(struct ir2_context *ctx, nir_src src, unsigned slot,
             unsigned ncomp)
{
   struct ir2_instr *instr;
   unsigned idx = 0;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      switch (slot) {
      case VARYING_SLOT_POS:
         ctx->position = make_src(ctx, src);
         idx = 62;
         break;
      case VARYING_SLOT_PSIZ:
         ctx->so->writes_psize = true;
         idx = 63;
         break;
      default:
         /* find matching slot from fragment shader input */
         for (idx = 0; idx < ctx->f->inputs_count; idx++)
            if (ctx->f->inputs[idx].slot == slot)
               break;
         if (idx == ctx->f->inputs_count)
            return;
      }
   } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
      /* only color output is implemented */
      return;
   }

   instr = instr_create_alu(ctx, nir_op_mov, ncomp);
   instr->src[0] = make_src(ctx, src);
   instr->alu.export = idx;
}

static void
emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir2_instr *instr;
   nir_const_value *const_offset;
   unsigned idx;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_input:
      load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
      break;
   case nir_intrinsic_store_output:
      store_output(ctx, intr->src[0], output_slot(ctx, intr),
                   intr->num_components);
      break;
   case nir_intrinsic_load_uniform:
      const_offset = nir_src_as_const_value(intr->src[0]);
      assert(const_offset); /* TODO can be false in ES2? */
      idx = nir_intrinsic_base(intr);
      idx += (uint32_t) const_offset[0].f32;
      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
      break;
   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->alu.vector_opc = VECTOR_NONE;
      if (intr->intrinsic == nir_intrinsic_discard_if) {
         instr->alu.scalar_opc = KILLNEs;
         instr->src[0] = make_src(ctx, intr->src[0]);
      } else {
         instr->alu.scalar_opc = KILLEs;
         instr->src[0] = ir2_zero(ctx);
      }
      instr->alu.export = -1;
      instr->src_count = 1;
      ctx->so->has_kill = true;
      break;
   case nir_intrinsic_load_front_face:
      /* gl_FrontFacing is in the sign of param.x
       * rcp required because otherwise we can't differentiate -0.0 and +0.0
       */
      ctx->so->need_param = true;

      struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
      tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
      instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[1] = ir2_zero(ctx);
      break;
   default:
      compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
      break;
   }
}

static void
emit_tex(struct ir2_context *ctx, nir_tex_instr *tex)
{
   bool is_rect = false, is_cube = false;
   struct ir2_instr *instr;
   nir_src *coord, *lod_bias;

   coord = lod_bias = NULL;

   for (unsigned i = 0; i < tex->num_srcs; i++) {
      switch (tex->src[i].src_type) {
      case nir_tex_src_coord:
         coord = &tex->src[i].src;
         break;
      case nir_tex_src_bias:
      case nir_tex_src_lod:
         assert(!lod_bias);
         lod_bias = &tex->src[i].src;
         break;
      default:
         compile_error(ctx, "Unhandled NIR tex src type: %d\n",
                       tex->src[i].src_type);
         return;
      }
   }

   switch (tex->op) {
   case nir_texop_tex:
   case nir_texop_txb:
   case nir_texop_txl:
      break;
   default:
      compile_error(ctx, "unimplemented texop %d\n", tex->op);
      return;
   }

   switch (tex->sampler_dim) {
   case GLSL_SAMPLER_DIM_2D:
      break;
   case GLSL_SAMPLER_DIM_RECT:
      is_rect = true;
      break;
   case GLSL_SAMPLER_DIM_CUBE:
      is_cube = true;
      break;
   default:
      compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
      return;
   }

   struct ir2_src src_coord = make_src_noconst(ctx, *coord);

   /* for cube maps
    * tmp = cube(coord)
    * tmp.xy = tmp.xy / |tmp.z| + 1.5
    * coord = tmp.xyw
    */
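   /* The +1.5 offset appears to be the fetch unit's convention for cube
    * face coordinates (face s/t biased around 1.5 after dividing by the
    * major axis); the fetch below then consumes tmp.xyw as the coordinate.
    */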
   if (is_cube) {
      struct ir2_instr *rcp, *coord_xy;
      unsigned reg_idx;

      instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
      instr->src[0] = src_coord;
      instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
      instr->src[1] = src_coord;
      instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;

      reg_idx = instr->reg - ctx->reg; /* hacky */

      rcp = instr_create_alu(ctx, nir_op_frcp, 1);
      rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
      rcp->src[0].abs = true;

      coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
      coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      coord_xy->src[2] = load_const(ctx, (float[]) {1.5f}, 1);

      src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
      /* TODO: lod/bias transformed by src_coord.z ? */
   }

   instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
   instr->src[0] = src_coord;
   instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_YXW : 0;
   instr->fetch.tex.is_cube = is_cube;
   instr->fetch.tex.is_rect = is_rect;
   instr->fetch.tex.samp_id = tex->sampler_index;

   /* for lod/bias, we insert an extra src for the backend to deal with */
   if (lod_bias) {
      instr->src[1] = make_src_noconst(ctx, *lod_bias);
      /* backend will use 2-3 components so apply swizzle */
      swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
      instr->src_count = 2;
   }
}

static void
setup_input(struct ir2_context *ctx, nir_variable *in)
{
   struct fd2_shader_stateobj *so = ctx->so;
   unsigned array_len = MAX2(glsl_get_length(in->type), 1);
   unsigned n = in->data.driver_location;
   unsigned slot = in->data.location;

   assert(array_len == 1); /* handle later */

   if (ctx->so->type == MESA_SHADER_VERTEX)
      return;

   if (ctx->so->type != MESA_SHADER_FRAGMENT)
      compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);

   if (slot == VARYING_SLOT_PNTC) {
      so->need_param = true;
      return;
   }

   n = ctx->f->inputs_count++;

   /* half of fragcoord from param reg, half from a varying */
   if (slot == VARYING_SLOT_POS) {
      ctx->f->fragcoord = n;
      so->need_param = true;
   }

   ctx->f->inputs[n].slot = slot;
   ctx->f->inputs[n].ncomp = glsl_get_components(in->type);

   /* in->data.interpolation?
    * OpenGL ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
    */
}

static void
emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr *undef)
{
   /* TODO we don't want to emit anything for undefs */

   struct ir2_instr *instr;

   instr = instr_create_alu_dest(ctx, nir_op_mov,
                                 &(nir_dest) {.ssa = undef->def, .is_ssa = true});
   instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
}

static void
emit_instr(struct ir2_context *ctx, nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      emit_alu(ctx, nir_instr_as_alu(instr));
      break;
   case nir_instr_type_deref:
      /* ignored, handled as part of the intrinsic they are src to */
      break;
   case nir_instr_type_intrinsic:
      emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
      break;
   case nir_instr_type_load_const:
      /* dealt with when using nir_src */
      break;
   case nir_instr_type_tex:
      emit_tex(ctx, nir_instr_as_tex(instr));
      break;
   case nir_instr_type_jump:
      ctx->block_has_jump[ctx->block_idx] = true;
      break;
   case nir_instr_type_ssa_undef:
      emit_undef(ctx, nir_instr_as_ssa_undef(instr));
      break;
   default:
      break;
   }
}

/* fragcoord.zw and a20x hw binning outputs */
static void
extra_position_exports(struct ir2_context *ctx, bool binning)
{
   struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;

   if (ctx->f->fragcoord < 0 && !binning)
      return;

   instr = instr_create_alu(ctx, nir_op_fmax, 1);
   instr->src[0] = ctx->position;
   instr->src[0].swizzle = IR2_SWIZZLE_W;
   instr->src[1] = ir2_zero(ctx);

   rcp = instr_create_alu(ctx, nir_op_frcp, 1);
   rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);

   sc = instr_create_alu(ctx, nir_op_fmul, 4);
   sc->src[0] = ctx->position;
   sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);

   wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
   wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
   wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
   wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);

   /* fragcoord z/w */
   if (ctx->f->fragcoord >= 0 && !binning) {
      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
      instr->alu.export = ctx->f->fragcoord;

      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ctx->position;
      instr->src[0].swizzle = IR2_SWIZZLE_W;
      instr->alu.export = ctx->f->fragcoord;
      instr->alu.write_mask = 2;
   }

   if (!binning)
      return;
   off = instr_create_alu(ctx, nir_op_fadd, 1);
   off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
   off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);

   /* 8 max set in freedreno_screen.. unneeded instrs patched out */
   for (int i = 0; i < 8; i++) {
      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
      instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
      instr->alu.export = 32;

      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
      instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
      instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
      instr->alu.export = 33;
   }
}

static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);

static bool
emit_block(struct ir2_context *ctx, nir_block *block)
{
   struct ir2_instr *instr;
   nir_block *succs = block->successors[0];

   ctx->block_idx = block->index;

   nir_foreach_instr(instr, block)
      emit_instr(ctx, instr);

   if (!succs || !succs->index)
      return false;

   /* we would like to always jump and have the backend clean up,
    * but we don't, so there are only two cases where a jump is needed:
    *  loops (successor index is lower than the current block's)
    *  jumps (a jump instruction was seen in the block)
    */
   if (succs->index > block->index && !ctx->block_has_jump[block->index])
      return false;

   assert(block->successors[1] == NULL);

   instr = ir2_instr_create(ctx, IR2_CF);
   instr->cf.block_idx = succs->index;
   /* XXX can't jump to a block with a different predicate */
   return true;
}
static void
emit_if(struct ir2_context *ctx, nir_if *nif)
{
   unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
   struct ir2_instr *instr;

   /* XXX: blob seems to always use same register for condition */

   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = make_src(ctx, nif->condition);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = SCALAR_NONE;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;

   /* if nested, use PRED_SETNE_PUSHv */
   if (pred) {
      instr->alu.vector_opc = PRED_SETNE_PUSHv;
      instr->src[1] = instr->src[0];
      instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
      instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
      instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
      instr->src_count = 2;
   } else {
      instr->alu.scalar_opc = PRED_SETNEs;
   }

   ctx->pred_idx = instr->idx;
   ctx->pred = 3;

   emit_cf_list(ctx, &nif->then_list);

   /* TODO: if there is no else branch we don't need this, and if the
    * else branch is simple, we can just flip ctx->pred instead
    */
   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = PRED_SET_INVs;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;
   ctx->pred_idx = instr->idx;

   emit_cf_list(ctx, &nif->else_list);

   /* restore predicate for nested predicates */
   if (pred) {
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
      instr->src_count = 1;
      instr->ssa.ncomp = 1;
      instr->alu.vector_opc = VECTOR_NONE;
      instr->alu.scalar_opc = PRED_SET_POPs;
      instr->alu.export = -1;
      instr->alu.write_mask = 1;
      instr->pred = 0;
      ctx->pred_idx = instr->idx;
   }

   /* restore ctx->pred */
   ctx->pred = pred;
}

/* get the highest block idx in the loop, so we know when
 * we can free registers that are allocated outside the loop
 */
static unsigned
loop_last_block(struct exec_list *list)
{
   nir_cf_node *node =
      exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
   switch (node->type) {
   case nir_cf_node_block:
      return nir_cf_node_as_block(node)->index;
   case nir_cf_node_if:
      assert(0); /* XXX could this ever happen? */
      return 0;
   case nir_cf_node_loop:
      return loop_last_block(&nir_cf_node_as_loop(node)->body);
   default:
      compile_error(ctx, "Not supported\n");
      return 0;
   }
}

static void
emit_loop(struct ir2_context *ctx, nir_loop *nloop)
{
   ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
   emit_cf_list(ctx, &nloop->body);
   ctx->loop_depth--;
}

static bool
emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
{
   bool ret = false;
   foreach_list_typed(nir_cf_node, node, node, list) {
      ret = false;
      switch (node->type) {
      case nir_cf_node_block:
         ret = emit_block(ctx, nir_cf_node_as_block(node));
         break;
      case nir_cf_node_if:
         emit_if(ctx, nir_cf_node_as_if(node));
         break;
      case nir_cf_node_loop:
         emit_loop(ctx, nir_cf_node_as_loop(node));
         break;
      case nir_cf_node_function:
         compile_error(ctx, "Not supported\n");
         break;
      }
   }
   return ret;
}

static void
cleanup_binning(struct ir2_context *ctx)
{
   assert(ctx->so->type == MESA_SHADER_VERTEX);

   /* kill non-position outputs for binning variant */
   nir_foreach_block(block, nir_shader_get_entrypoint(ctx->nir)) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         if (output_slot(ctx, intr) != VARYING_SLOT_POS)
            nir_instr_remove(instr);
      }
   }

   ir2_optimize_nir(ctx->nir, false);
}

void
ir2_nir_compile(struct ir2_context *ctx, bool binning)
{
   struct fd2_shader_stateobj *so = ctx->so;

   memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));

   ctx->nir = nir_shader_clone(NULL, so->nir);

   if (binning)
      cleanup_binning(ctx);

   /* postprocess */
   OPT_V(ctx->nir, nir_opt_algebraic_late);

   OPT_V(ctx->nir, nir_copy_prop);
   OPT_V(ctx->nir, nir_opt_dce);
   OPT_V(ctx->nir, nir_opt_move_comparisons);

   OPT_V(ctx->nir, nir_lower_int_to_float);
   OPT_V(ctx->nir, nir_lower_bool_to_float);

   OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);
   /* TODO: static bitset ? */
   BITSET_DECLARE(scalar_ops, nir_num_opcodes);
   BITSET_ZERO(scalar_ops);
   BITSET_SET(scalar_ops, nir_op_frsq);
   BITSET_SET(scalar_ops, nir_op_frcp);
   BITSET_SET(scalar_ops, nir_op_flog2);
   BITSET_SET(scalar_ops, nir_op_fexp2);
   BITSET_SET(scalar_ops, nir_op_fsqrt);
   BITSET_SET(scalar_ops, nir_op_fcos);
   BITSET_SET(scalar_ops, nir_op_fsin);
   OPT_V(ctx->nir, nir_lower_alu_to_scalar, scalar_ops);

   OPT_V(ctx->nir, nir_lower_locals_to_regs);

   OPT_V(ctx->nir, nir_convert_from_ssa, true);

   OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
   OPT_V(ctx->nir, nir_lower_vec_to_movs);

   OPT_V(ctx->nir, nir_opt_dce);

   nir_sweep(ctx->nir);

   if (fd_mesa_debug & FD_DBG_DISASM) {
      debug_printf("----------------------\n");
      nir_print_shader(ctx->nir, stdout);
      debug_printf("----------------------\n");
   }

   /* fd2_shader_stateobj init */
   if (so->type == MESA_SHADER_FRAGMENT) {
      ctx->f->fragcoord = -1;
      ctx->f->inputs_count = 0;
      memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
   }

   /* Setup inputs: */
   nir_foreach_variable(in, &ctx->nir->inputs)
      setup_input(ctx, in);

   if (so->type == MESA_SHADER_FRAGMENT) {
      unsigned idx;
      for (idx = 0; idx < ctx->f->inputs_count; idx++) {
         ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
         update_range(ctx, &ctx->input[idx]);
      }
      /* assume we have param input and kill it later if not */
      ctx->input[idx].ncomp = 4;
      update_range(ctx, &ctx->input[idx]);
   } else {
      ctx->input[0].ncomp = 1;
      ctx->input[2].ncomp = 1;
      update_range(ctx, &ctx->input[0]);
      update_range(ctx, &ctx->input[2]);
   }

   /* And emit the body: */
   nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);

   nir_foreach_register(reg, &fxn->registers) {
      ctx->reg[reg->index].ncomp = reg->num_components;
      ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
   }

   nir_metadata_require(fxn, nir_metadata_block_index);
   emit_cf_list(ctx, &fxn->body);
   /* TODO emit_block(ctx, fxn->end_block); */

   if (so->type == MESA_SHADER_VERTEX)
      extra_position_exports(ctx, binning);

   ralloc_free(ctx->nir);

   /* kill unused param input */
   if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
      ctx->input[ctx->f->inputs_count].initialized = false;
}