/* * Copyright 2013 Vadim Girlin * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * on the rights to use, copy, modify, merge, publish, distribute, sub * license, and/or sell copies of the Software, and to permit persons to whom * the Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. * * Authors: * Vadim Girlin */ #define FBC_DEBUG 0 #if FBC_DEBUG #define FBC_DUMP(q) do { q } while (0) #else #define FBC_DUMP(q) #endif #include "sb_bc.h" #include "sb_shader.h" #include "sb_pass.h" namespace r600_sb { void bc_finalizer::insert_rv6xx_load_ar_workaround(alu_group_node *b4) { alu_group_node *g = sh.create_alu_group(); alu_node *a = sh.create_alu(); a->bc.set_op(ALU_OP0_NOP); a->bc.last = 1; g->push_back(a); b4->insert_before(g); } int bc_finalizer::run() { run_on(sh.root); regions_vec &rv = sh.get_regions(); for (regions_vec::reverse_iterator I = rv.rbegin(), E = rv.rend(); I != E; ++I) { region_node *r = *I; assert(r); bool loop = r->is_loop(); if (loop) finalize_loop(r); else finalize_if(r); r->expand(); } cf_peephole(); // workaround for some problems on r6xx/7xx // add ALU NOP to each vertex shader if (!ctx.is_egcm() && (sh.target == TARGET_VS || sh.target == TARGET_ES)) { cf_node *c = sh.create_clause(NST_ALU_CLAUSE); alu_group_node *g = sh.create_alu_group(); alu_node *a = sh.create_alu(); a->bc.set_op(ALU_OP0_NOP); a->bc.last = 1; g->push_back(a); c->push_back(g); sh.root->push_back(c); c = sh.create_cf(CF_OP_NOP); sh.root->push_back(c); last_cf = c; } if (!ctx.is_cayman() && last_cf->bc.op_ptr->flags & CF_ALU) { last_cf = sh.create_cf(CF_OP_NOP); sh.root->push_back(last_cf); } if (ctx.is_cayman()) { if (!last_cf) { cf_node *c = sh.create_cf(CF_OP_CF_END); sh.root->push_back(c); } else last_cf->insert_after(sh.create_cf(CF_OP_CF_END)); } else last_cf->bc.end_of_program = 1; for (unsigned t = EXP_PIXEL; t < EXP_TYPE_COUNT; ++t) { cf_node *le = last_export[t]; if (le) le->bc.set_op(CF_OP_EXPORT_DONE); } sh.ngpr = ngpr; sh.nstack = nstack; return 0; } void bc_finalizer::finalize_loop(region_node* r) { update_nstack(r); cf_node *loop_start = sh.create_cf(CF_OP_LOOP_START_DX10); cf_node *loop_end = sh.create_cf(CF_OP_LOOP_END); // Update last_cf, but don't overwrite it if it's outside the current loop nest since // it may point to a cf that is later in program order. // The single parent level check is sufficient since finalize_loop() is processed in // reverse order from innermost to outermost loop nest level. if (!last_cf || last_cf->get_parent_region() == r) { last_cf = loop_end; } loop_start->jump_after(loop_end); loop_end->jump_after(loop_start); for (depart_vec::iterator I = r->departs.begin(), E = r->departs.end(); I != E; ++I) { depart_node *dep = *I; cf_node *loop_break = sh.create_cf(CF_OP_LOOP_BREAK); loop_break->jump(loop_end); dep->push_back(loop_break); dep->expand(); } // FIXME produces unnecessary LOOP_CONTINUE for (repeat_vec::iterator I = r->repeats.begin(), E = r->repeats.end(); I != E; ++I) { repeat_node *rep = *I; if (!(rep->parent == r && rep->prev == NULL)) { cf_node *loop_cont = sh.create_cf(CF_OP_LOOP_CONTINUE); loop_cont->jump(loop_end); rep->push_back(loop_cont); } rep->expand(); } r->push_front(loop_start); r->push_back(loop_end); } void bc_finalizer::finalize_if(region_node* r) { update_nstack(r); // expecting the following control flow structure here: // - region // { // - depart/repeat 1 (it may be depart/repeat for some outer region) // { // - if // { // - depart/repeat 2 (possibly for outer region) // { // - some optional code // } // } // - optional code> ... // } // } container_node *repdep1 = static_cast(r->first); assert(repdep1->is_depart() || repdep1->is_repeat()); if_node *n_if = static_cast(repdep1->first); if (n_if) { assert(n_if->is_if()); container_node *repdep2 = static_cast(n_if->first); assert(repdep2->is_depart() || repdep2->is_repeat()); cf_node *if_jump = sh.create_cf(CF_OP_JUMP); cf_node *if_pop = sh.create_cf(CF_OP_POP); if (!last_cf || last_cf->get_parent_region() == r) { last_cf = if_pop; } if_pop->bc.pop_count = 1; if_pop->jump_after(if_pop); r->push_front(if_jump); r->push_back(if_pop); bool has_else = n_if->next; if (has_else) { cf_node *nelse = sh.create_cf(CF_OP_ELSE); n_if->insert_after(nelse); if_jump->jump(nelse); nelse->jump_after(if_pop); nelse->bc.pop_count = 1; } else { if_jump->jump_after(if_pop); if_jump->bc.pop_count = 1; } n_if->expand(); } for (depart_vec::iterator I = r->departs.begin(), E = r->departs.end(); I != E; ++I) { (*I)->expand(); } r->departs.clear(); assert(r->repeats.empty()); } void bc_finalizer::run_on(container_node* c) { node *prev_node = NULL; for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) { node *n = *I; if (n->is_alu_group()) { finalize_alu_group(static_cast(n), prev_node); } else { if (n->is_alu_clause()) { cf_node *c = static_cast(n); if (c->bc.op == CF_OP_ALU_PUSH_BEFORE && ctx.is_egcm()) { if (ctx.stack_workaround_8xx) { region_node *r = c->get_parent_region(); if (r) { unsigned ifs, loops; unsigned elems = get_stack_depth(r, loops, ifs); unsigned dmod1 = elems % ctx.stack_entry_size; unsigned dmod2 = (elems + 1) % ctx.stack_entry_size; if (elems && (!dmod1 || !dmod2)) c->flags |= NF_ALU_STACK_WORKAROUND; } } else if (ctx.stack_workaround_9xx) { region_node *r = c->get_parent_region(); if (r) { unsigned ifs, loops; get_stack_depth(r, loops, ifs); if (loops >= 2) c->flags |= NF_ALU_STACK_WORKAROUND; } } } } else if (n->is_fetch_inst()) { finalize_fetch(static_cast(n)); } else if (n->is_cf_inst()) { finalize_cf(static_cast(n)); } if (n->is_container()) run_on(static_cast(n)); } prev_node = n; } } void bc_finalizer::finalize_alu_group(alu_group_node* g, node *prev_node) { alu_node *last = NULL; alu_group_node *prev_g = NULL; bool add_nop = false; if (prev_node && prev_node->is_alu_group()) { prev_g = static_cast(prev_node); } for (node_iterator I = g->begin(), E = g->end(); I != E; ++I) { alu_node *n = static_cast(*I); unsigned slot = n->bc.slot; value *d = n->dst.empty() ? NULL : n->dst[0]; if (d && d->is_special_reg()) { assert((n->bc.op_ptr->flags & AF_MOVA) || d->is_geometry_emit()); d = NULL; } sel_chan fdst = d ? d->get_final_gpr() : sel_chan(0, 0); if (d) { assert(fdst.chan() == slot || slot == SLOT_TRANS); } if (!(n->bc.op_ptr->flags & AF_MOVA && ctx.is_cayman())) n->bc.dst_gpr = fdst.sel(); n->bc.dst_chan = d ? fdst.chan() : slot < SLOT_TRANS ? slot : 0; if (d && d->is_rel() && d->rel && !d->rel->is_const()) { n->bc.dst_rel = 1; update_ngpr(d->array->gpr.sel() + d->array->array_size -1); } else { n->bc.dst_rel = 0; } n->bc.write_mask = d != NULL; n->bc.last = 0; if (n->bc.op_ptr->flags & AF_PRED) { n->bc.update_pred = (n->dst[1] != NULL); n->bc.update_exec_mask = (n->dst[2] != NULL); } // FIXME handle predication here n->bc.pred_sel = PRED_SEL_OFF; update_ngpr(n->bc.dst_gpr); add_nop |= finalize_alu_src(g, n, prev_g); last = n; } if (add_nop) { if (sh.get_ctx().r6xx_gpr_index_workaround) { insert_rv6xx_load_ar_workaround(g); } } last->bc.last = 1; } bool bc_finalizer::finalize_alu_src(alu_group_node* g, alu_node* a, alu_group_node *prev) { vvec &sv = a->src; bool add_nop = false; FBC_DUMP( sblog << "finalize_alu_src: "; dump::dump_op(a); sblog << "\n"; ); unsigned si = 0; for (vvec::iterator I = sv.begin(), E = sv.end(); I != E; ++I, ++si) { value *v = *I; assert(v); bc_alu_src &src = a->bc.src[si]; sel_chan sc; src.rel = 0; sel_chan gpr; switch (v->kind) { case VLK_REL_REG: sc = v->get_final_gpr(); src.sel = sc.sel(); src.chan = sc.chan(); if (!v->rel->is_const()) { src.rel = 1; update_ngpr(v->array->gpr.sel() + v->array->array_size -1); if (prev && !add_nop) { for (node_iterator pI = prev->begin(), pE = prev->end(); pI != pE; ++pI) { alu_node *pn = static_cast(*pI); if (pn->bc.dst_gpr == src.sel) { add_nop = true; break; } } } } else src.rel = 0; break; case VLK_REG: gpr = v->get_final_gpr(); src.sel = gpr.sel(); src.chan = gpr.chan(); update_ngpr(src.sel); break; case VLK_TEMP: src.sel = v->gpr.sel(); src.chan = v->gpr.chan(); update_ngpr(src.sel); break; case VLK_UNDEF: case VLK_CONST: { literal lv = v->literal_value; src.chan = 0; if (lv == literal(0)) src.sel = ALU_SRC_0; else if (lv == literal(0.5f)) src.sel = ALU_SRC_0_5; else if (lv == literal(1.0f)) src.sel = ALU_SRC_1; else if (lv == literal(1)) src.sel = ALU_SRC_1_INT; else if (lv == literal(-1)) src.sel = ALU_SRC_M_1_INT; else { src.sel = ALU_SRC_LITERAL; src.chan = g->literal_chan(lv); src.value = lv; } break; } case VLK_KCACHE: { cf_node *clause = static_cast(g->parent); assert(clause->is_alu_clause()); sel_chan k = translate_kcache(clause, v); assert(k && "kcache translation failed"); src.sel = k.sel(); src.chan = k.chan(); break; } case VLK_PARAM: case VLK_SPECIAL_CONST: src.sel = v->select.sel(); src.chan = v->select.chan(); break; default: assert(!"unknown value kind"); break; } if (prev && !add_nop) { for (node_iterator pI = prev->begin(), pE = prev->end(); pI != pE; ++pI) { alu_node *pn = static_cast(*pI); if (pn->bc.dst_rel) { if (pn->bc.dst_gpr == src.sel) { add_nop = true; break; } } } } } while (si < 3) { a->bc.src[si++].sel = 0; } return add_nop; } void bc_finalizer::copy_fetch_src(fetch_node &dst, fetch_node &src, unsigned arg_start) { int reg = -1; for (unsigned chan = 0; chan < 4; ++chan) { dst.bc.dst_sel[chan] = SEL_MASK; unsigned sel = SEL_MASK; value *v = src.src[arg_start + chan]; if (!v || v->is_undef()) { sel = SEL_MASK; } else if (v->is_const()) { literal l = v->literal_value; if (l == literal(0)) sel = SEL_0; else if (l == literal(1.0f)) sel = SEL_1; else { sblog << "invalid fetch constant operand " << chan << " "; dump::dump_op(&src); sblog << "\n"; abort(); } } else if (v->is_any_gpr()) { unsigned vreg = v->gpr.sel(); unsigned vchan = v->gpr.chan(); if (reg == -1) reg = vreg; else if ((unsigned)reg != vreg) { sblog << "invalid fetch source operand " << chan << " "; dump::dump_op(&src); sblog << "\n"; abort(); } sel = vchan; } else { sblog << "invalid fetch source operand " << chan << " "; dump::dump_op(&src); sblog << "\n"; abort(); } dst.bc.src_sel[chan] = sel; } if (reg >= 0) update_ngpr(reg); dst.bc.src_gpr = reg >= 0 ? reg : 0; } void bc_finalizer::emit_set_grad(fetch_node* f) { assert(f->src.size() == 12 || f->src.size() == 13); unsigned ops[2] = { FETCH_OP_SET_GRADIENTS_V, FETCH_OP_SET_GRADIENTS_H }; unsigned arg_start = 0; for (unsigned op = 0; op < 2; ++op) { fetch_node *n = sh.create_fetch(); n->bc.set_op(ops[op]); arg_start += 4; copy_fetch_src(*n, *f, arg_start); f->insert_before(n); } } void bc_finalizer::emit_set_texture_offsets(fetch_node &f) { assert(f.src.size() == 8); fetch_node *n = sh.create_fetch(); n->bc.set_op(FETCH_OP_SET_TEXTURE_OFFSETS); copy_fetch_src(*n, f, 4); f.insert_before(n); } void bc_finalizer::finalize_fetch(fetch_node* f) { int reg = -1; // src unsigned src_count = 4; unsigned flags = f->bc.op_ptr->flags; if (flags & FF_VTX) { src_count = 1; } else if (flags & FF_USEGRAD) { emit_set_grad(f); } else if (flags & FF_USE_TEXTURE_OFFSETS) { emit_set_texture_offsets(*f); } for (unsigned chan = 0; chan < src_count; ++chan) { unsigned sel = f->bc.src_sel[chan]; if (sel > SEL_W) continue; value *v = f->src[chan]; if (v->is_undef()) { sel = SEL_MASK; } else if (v->is_const()) { literal l = v->literal_value; if (l == literal(0)) sel = SEL_0; else if (l == literal(1.0f)) sel = SEL_1; else { sblog << "invalid fetch constant operand " << chan << " "; dump::dump_op(f); sblog << "\n"; abort(); } } else if (v->is_any_gpr()) { unsigned vreg = v->gpr.sel(); unsigned vchan = v->gpr.chan(); if (reg == -1) reg = vreg; else if ((unsigned)reg != vreg) { sblog << "invalid fetch source operand " << chan << " "; dump::dump_op(f); sblog << "\n"; abort(); } sel = vchan; } else { sblog << "invalid fetch source operand " << chan << " "; dump::dump_op(f); sblog << "\n"; abort(); } f->bc.src_sel[chan] = sel; } if (reg >= 0) update_ngpr(reg); f->bc.src_gpr = reg >= 0 ? reg : 0; // dst reg = -1; unsigned dst_swz[4] = {SEL_MASK, SEL_MASK, SEL_MASK, SEL_MASK}; for (unsigned chan = 0; chan < 4; ++chan) { unsigned sel = f->bc.dst_sel[chan]; if (sel == SEL_MASK) continue; value *v = f->dst[chan]; if (!v) continue; if (v->is_any_gpr()) { unsigned vreg = v->gpr.sel(); unsigned vchan = v->gpr.chan(); if (reg == -1) reg = vreg; else if ((unsigned)reg != vreg) { sblog << "invalid fetch dst operand " << chan << " "; dump::dump_op(f); sblog << "\n"; abort(); } dst_swz[vchan] = sel; } else { sblog << "invalid fetch dst operand " << chan << " "; dump::dump_op(f); sblog << "\n"; abort(); } } for (unsigned i = 0; i < 4; ++i) f->bc.dst_sel[i] = dst_swz[i]; assert(reg >= 0); if (reg >= 0) update_ngpr(reg); f->bc.dst_gpr = reg >= 0 ? reg : 0; } void bc_finalizer::finalize_cf(cf_node* c) { unsigned flags = c->bc.op_ptr->flags; c->bc.end_of_program = 0; last_cf = c; if (flags & CF_EXP) { c->bc.set_op(CF_OP_EXPORT); last_export[c->bc.type] = c; int reg = -1; for (unsigned chan = 0; chan < 4; ++chan) { unsigned sel = c->bc.sel[chan]; if (sel > SEL_W) continue; value *v = c->src[chan]; if (v->is_undef()) { sel = SEL_MASK; } else if (v->is_const()) { literal l = v->literal_value; if (l == literal(0)) sel = SEL_0; else if (l == literal(1.0f)) sel = SEL_1; else { sblog << "invalid export constant operand " << chan << " "; dump::dump_op(c); sblog << "\n"; abort(); } } else if (v->is_any_gpr()) { unsigned vreg = v->gpr.sel(); unsigned vchan = v->gpr.chan(); if (reg == -1) reg = vreg; else if ((unsigned)reg != vreg) { sblog << "invalid export source operand " << chan << " "; dump::dump_op(c); sblog << "\n"; abort(); } sel = vchan; } else { sblog << "invalid export source operand " << chan << " "; dump::dump_op(c); sblog << "\n"; abort(); } c->bc.sel[chan] = sel; } if (reg >= 0) update_ngpr(reg); c->bc.rw_gpr = reg >= 0 ? reg : 0; } else if (flags & CF_MEM) { int reg = -1; unsigned mask = 0; for (unsigned chan = 0; chan < 4; ++chan) { value *v = c->src[chan]; if (!v || v->is_undef()) continue; if (!v->is_any_gpr() || v->gpr.chan() != chan) { sblog << "invalid source operand " << chan << " "; dump::dump_op(c); sblog << "\n"; abort(); } unsigned vreg = v->gpr.sel(); if (reg == -1) reg = vreg; else if ((unsigned)reg != vreg) { sblog << "invalid source operand " << chan << " "; dump::dump_op(c); sblog << "\n"; abort(); } mask |= (1 << chan); } if (reg >= 0) update_ngpr(reg); c->bc.rw_gpr = reg >= 0 ? reg : 0; c->bc.comp_mask = mask; if (((flags & CF_RAT) || (!(flags & CF_STRM))) && (c->bc.type & 1)) { reg = -1; for (unsigned chan = 0; chan < 4; ++chan) { value *v = c->src[4 + chan]; if (!v || v->is_undef()) continue; if (!v->is_any_gpr() || v->gpr.chan() != chan) { sblog << "invalid source operand " << chan << " "; dump::dump_op(c); sblog << "\n"; abort(); } unsigned vreg = v->gpr.sel(); if (reg == -1) reg = vreg; else if ((unsigned)reg != vreg) { sblog << "invalid source operand " << chan << " "; dump::dump_op(c); sblog << "\n"; abort(); } } assert(reg >= 0); if (reg >= 0) update_ngpr(reg); c->bc.index_gpr = reg >= 0 ? reg : 0; } } else if (flags & CF_CALL) { update_nstack(c->get_parent_region(), ctx.wavefront_size == 16 ? 2 : 1); } } sel_chan bc_finalizer::translate_kcache(cf_node* alu, value* v) { unsigned sel = v->select.kcache_sel(); unsigned bank = v->select.kcache_bank(); unsigned chan = v->select.chan(); static const unsigned kc_base[] = {128, 160, 256, 288}; sel &= 4095; unsigned line = sel >> 4; for (unsigned k = 0; k < 4; ++k) { bc_kcache &kc = alu->bc.kc[k]; if (kc.mode == KC_LOCK_NONE) break; if (kc.bank == bank && (kc.addr == line || (kc.mode == KC_LOCK_2 && kc.addr + 1 == line))) { sel = kc_base[k] + (sel - (kc.addr << 4)); return sel_chan(sel, chan); } } assert(!"kcache translation error"); return 0; } void bc_finalizer::update_ngpr(unsigned gpr) { if (gpr < MAX_GPR - ctx.alu_temp_gprs && gpr >= ngpr) ngpr = gpr + 1; } unsigned bc_finalizer::get_stack_depth(node *n, unsigned &loops, unsigned &ifs, unsigned add) { unsigned stack_elements = add; bool has_non_wqm_push = (add != 0); region_node *r = n->is_region() ? static_cast(n) : n->get_parent_region(); loops = 0; ifs = 0; while (r) { if (r->is_loop()) { ++loops; } else { ++ifs; has_non_wqm_push = true; } r = r->get_parent_region(); } stack_elements += (loops * ctx.stack_entry_size) + ifs; // reserve additional elements in some cases switch (ctx.hw_class) { case HW_CLASS_R600: case HW_CLASS_R700: // If any non-WQM push is invoked, 2 elements should be reserved. if (has_non_wqm_push) stack_elements += 2; break; case HW_CLASS_CAYMAN: // If any stack operation is invoked, 2 elements should be reserved if (stack_elements) stack_elements += 2; break; case HW_CLASS_EVERGREEN: // According to the docs we need to reserve 1 element for each of the // following cases: // 1) non-WQM push is used with WQM/LOOP frames on stack // 2) ALU_ELSE_AFTER is used at the point of max stack usage // NOTE: // It was found that the conditions above are not sufficient, there are // other cases where we also need to reserve stack space, that's why // we always reserve 1 stack element if we have non-WQM push on stack. // Condition 2 is ignored for now because we don't use this instruction. if (has_non_wqm_push) ++stack_elements; break; case HW_CLASS_UNKNOWN: assert(0); } return stack_elements; } void bc_finalizer::update_nstack(region_node* r, unsigned add) { unsigned loops = 0; unsigned ifs = 0; unsigned elems = r ? get_stack_depth(r, loops, ifs, add) : add; // XXX all chips expect this value to be computed using 4 as entry size, // not the real entry size unsigned stack_entries = (elems + 3) >> 2; if (nstack < stack_entries) nstack = stack_entries; } void bc_finalizer::cf_peephole() { if (ctx.stack_workaround_8xx || ctx.stack_workaround_9xx) { for (node_iterator N, I = sh.root->begin(), E = sh.root->end(); I != E; I = N) { N = I; ++N; cf_node *c = static_cast(*I); if (c->bc.op == CF_OP_ALU_PUSH_BEFORE && (c->flags & NF_ALU_STACK_WORKAROUND)) { cf_node *push = sh.create_cf(CF_OP_PUSH); c->insert_before(push); push->jump(c); c->bc.set_op(CF_OP_ALU); } } } for (node_iterator N, I = sh.root->begin(), E = sh.root->end(); I != E; I = N) { N = I; ++N; cf_node *c = static_cast(*I); if (c->jump_after_target) { if (c->jump_target->next == NULL) { c->jump_target->insert_after(sh.create_cf(CF_OP_NOP)); if (last_cf == c->jump_target) last_cf = static_cast(c->jump_target->next); } c->jump_target = static_cast(c->jump_target->next); c->jump_after_target = false; } if (c->is_cf_op(CF_OP_POP)) { node *p = c->prev; if (p->is_alu_clause()) { cf_node *a = static_cast(p); if (a->bc.op == CF_OP_ALU) { a->bc.set_op(CF_OP_ALU_POP_AFTER); c->remove(); } } } else if (c->is_cf_op(CF_OP_JUMP) && c->jump_target == c->next) { // if JUMP is immediately followed by its jump target, // then JUMP is useless and we can eliminate it c->remove(); } } } } // namespace r600_sb