/*
 * Copyright © 2015 Red Hat
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Rob Clark
 */

#include "nir.h"
#include "nir_builder.h"

/* This pass has two paths.
 *
 * One (nir_lower_idiv_fast) lowers idiv/udiv/umod and is based on
 * NV50LegalizeSSA::handleDIV().
 *
 * Note that this path probably does not have enough precision for
 * compute shaders.  Perhaps we want a second, higher-precision (looping)
 * version of this?  Or perhaps we assume that if you can do compute shaders
 * you can also branch out to a pre-optimized shader library routine.
 *
 * The other path (nir_lower_idiv_precise) is based on code used by LLVM's
 * AMDGPU target.  It should handle 32-bit idiv/irem/imod/udiv/umod exactly.
 */
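/* A rough usage sketch (the driver context here is an assumption, not part
 * of this pass): a backend without native integer division would run this
 * pass over each shader and then let a cleanup pass such as nir_opt_dce()
 * delete the original div/mod ALU instructions, since neither path removes
 * them itself; only their uses are rewritten:
 *
 *    if (nir_lower_idiv(shader, nir_lower_idiv_precise))
 *       nir_opt_dce(shader);
 */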
static bool
convert_instr(nir_builder *bld, nir_alu_instr *alu)
{
   nir_ssa_def *numer, *denom, *af, *bf, *a, *b, *q, *r, *rt;
   nir_op op = alu->op;
   bool is_signed;

   if ((op != nir_op_idiv) &&
       (op != nir_op_udiv) &&
       (op != nir_op_imod) &&
       (op != nir_op_umod) &&
       (op != nir_op_irem))
      return false;

   is_signed = (op == nir_op_idiv ||
                op == nir_op_imod ||
                op == nir_op_irem);

   bld->cursor = nir_before_instr(&alu->instr);

   numer = nir_ssa_for_alu_src(bld, alu, 0);
   denom = nir_ssa_for_alu_src(bld, alu, 1);

   if (is_signed) {
      af = nir_i2f32(bld, numer);
      bf = nir_i2f32(bld, denom);
      af = nir_fabs(bld, af);
      bf = nir_fabs(bld, bf);
      a  = nir_iabs(bld, numer);
      b  = nir_iabs(bld, denom);
   } else {
      af = nir_u2f32(bld, numer);
      bf = nir_u2f32(bld, denom);
      a  = numer;
      b  = denom;
   }

   /* get first result: */
   bf = nir_frcp(bld, bf);
   bf = nir_isub(bld, bf, nir_imm_int(bld, 2));  /* yes, really */
   q  = nir_fmul(bld, af, bf);

   if (is_signed) {
      q = nir_f2i32(bld, q);
   } else {
      q = nir_f2u32(bld, q);
   }

   /* get error of first result: */
   r = nir_imul(bld, q, b);
   r = nir_isub(bld, a, r);
   r = nir_u2f32(bld, r);
   r = nir_fmul(bld, r, bf);
   r = nir_f2u32(bld, r);

   /* add quotients: */
   q = nir_iadd(bld, q, r);

   /* correction: if modulus >= divisor, add 1 */
   r = nir_imul(bld, q, b);
   r = nir_isub(bld, a, r);
   rt = nir_uge(bld, r, b);

   if (op == nir_op_umod) {
      q = nir_bcsel(bld, rt, nir_isub(bld, r, b), r);
   } else {
      r = nir_b2i32(bld, rt);

      q = nir_iadd(bld, q, r);
      if (is_signed) {
         /* fix the sign: */
         r = nir_ixor(bld, numer, denom);
         r = nir_ilt(bld, r, nir_imm_int(bld, 0));
         b = nir_ineg(bld, q);
         q = nir_bcsel(bld, r, b, q);

         if (op == nir_op_imod || op == nir_op_irem) {
            q = nir_imul(bld, q, denom);
            q = nir_isub(bld, numer, q);
            if (op == nir_op_imod) {
               q = nir_bcsel(bld, nir_ieq(bld, q, nir_imm_int(bld, 0)),
                             nir_imm_int(bld, 0),
                             nir_bcsel(bld, r, nir_iadd(bld, q, denom), q));
            }
         }
      }
   }

   assert(alu->dest.dest.is_ssa);
   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(q));

   return true;
}

/* ported from LLVM's AMDGPUTargetLowering::LowerUDIVREM */
static nir_ssa_def *
emit_udiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, bool modulo)
{
   /* 0.32 fixed-point estimate of 1/denom: float reciprocal scaled by 2^32 */
   nir_ssa_def *rcp = nir_frcp(bld, nir_u2f32(bld, denom));
   rcp = nir_f2u32(bld, nir_fmul_imm(bld, rcp, 4294967296.0));

   /* estimate the error in rcp and nudge the estimate by it: */
   nir_ssa_def *rcp_lo = nir_imul(bld, rcp, denom);
   nir_ssa_def *rcp_hi = nir_umul_high(bld, rcp, denom);

   nir_ssa_def *rcp_hi_ne_zero = nir_ine(bld, rcp_hi, nir_imm_int(bld, 0));

   nir_ssa_def *neg_rcp_lo = nir_ineg(bld, rcp_lo);
   nir_ssa_def *abs_rcp_lo = nir_bcsel(bld, rcp_hi_ne_zero, rcp_lo, neg_rcp_lo);

   nir_ssa_def *e = nir_umul_high(bld, abs_rcp_lo, rcp);

   nir_ssa_def *rcp_plus_e = nir_iadd(bld, rcp, e);
   nir_ssa_def *rcp_minus_e = nir_isub(bld, rcp, e);

   nir_ssa_def *tmp0 = nir_bcsel(bld, rcp_hi_ne_zero, rcp_minus_e, rcp_plus_e);

   nir_ssa_def *quotient = nir_umul_high(bld, tmp0, numer);

   nir_ssa_def *num_s_remainder = nir_imul(bld, quotient, denom);
   nir_ssa_def *remainder = nir_isub(bld, numer, num_s_remainder);

   /* the final selects fix up a quotient that is one too small
    * (remainder >= denom) or one too large (remainder went negative): */
   nir_ssa_def *remainder_ge_den = nir_uge(bld, remainder, denom);
   nir_ssa_def *remainder_ge_zero = nir_uge(bld, numer, num_s_remainder);
   nir_ssa_def *tmp1 = nir_iand(bld, remainder_ge_den, remainder_ge_zero);

   if (modulo) {
      nir_ssa_def *rem = nir_bcsel(bld, tmp1,
                                   nir_isub(bld, remainder, denom), remainder);
      return nir_bcsel(bld, remainder_ge_zero, rem,
                       nir_iadd(bld, remainder, denom));
   } else {
      nir_ssa_def *one = nir_imm_int(bld, 1);
      nir_ssa_def *div = nir_bcsel(bld, tmp1,
                                   nir_iadd(bld, quotient, one), quotient);
      return nir_bcsel(bld, remainder_ge_zero, div,
                       nir_isub(bld, quotient, one));
   }
}
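/* emit_idiv() below maps the signed cases onto emit_udiv() using
 * two's-complement identities instead of explicit iabs/ineg: with s == -1
 * when x is negative and s == 0 otherwise, (x + s) ^ s gives |x|, and
 * (x ^ s) - s negates x exactly when s == -1.  For example, x = -7, s = -1:
 * (-7 + -1) ^ -1 = -8 ^ -1 = 7, and (7 ^ -1) - -1 = -8 + 1 = -7.  The imod
 * case then adds denom back when the remainder is non-zero and the operand
 * signs differ, so the result takes the sign of the divisor.
 */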
/* ported from LLVM's AMDGPUTargetLowering::LowerSDIVREM */
static nir_ssa_def *
emit_idiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, nir_op op)
{
   nir_ssa_def *lh_sign = nir_ilt(bld, numer, nir_imm_int(bld, 0));
   nir_ssa_def *rh_sign = nir_ilt(bld, denom, nir_imm_int(bld, 0));
   lh_sign = nir_bcsel(bld, lh_sign, nir_imm_int(bld, -1), nir_imm_int(bld, 0));
   rh_sign = nir_bcsel(bld, rh_sign, nir_imm_int(bld, -1), nir_imm_int(bld, 0));

   nir_ssa_def *lhs = nir_iadd(bld, numer, lh_sign);
   nir_ssa_def *rhs = nir_iadd(bld, denom, rh_sign);

   lhs = nir_ixor(bld, lhs, lh_sign);
   rhs = nir_ixor(bld, rhs, rh_sign);

   if (op == nir_op_idiv) {
      nir_ssa_def *d_sign = nir_ixor(bld, lh_sign, rh_sign);

      nir_ssa_def *res = emit_udiv(bld, lhs, rhs, false);

      res = nir_ixor(bld, res, d_sign);
      return nir_isub(bld, res, d_sign);
   } else {
      nir_ssa_def *res = emit_udiv(bld, lhs, rhs, true);

      res = nir_ixor(bld, res, lh_sign);
      res = nir_isub(bld, res, lh_sign);

      if (op == nir_op_imod) {
         nir_ssa_def *cond = nir_ieq(bld, res, nir_imm_int(bld, 0));
         cond = nir_ior(bld, nir_ieq(bld, lh_sign, rh_sign), cond);
         res = nir_bcsel(bld, cond, res, nir_iadd(bld, res, denom));
      }

      return res;
   }
}

static bool
convert_instr_precise(nir_builder *bld, nir_alu_instr *alu)
{
   nir_op op = alu->op;

   if ((op != nir_op_idiv) &&
       (op != nir_op_imod) &&
       (op != nir_op_irem) &&
       (op != nir_op_udiv) &&
       (op != nir_op_umod))
      return false;

   if (alu->dest.dest.ssa.bit_size != 32)
      return false;

   bld->cursor = nir_before_instr(&alu->instr);

   nir_ssa_def *numer = nir_ssa_for_alu_src(bld, alu, 0);
   nir_ssa_def *denom = nir_ssa_for_alu_src(bld, alu, 1);

   nir_ssa_def *res = NULL;

   if (op == nir_op_udiv || op == nir_op_umod)
      res = emit_udiv(bld, numer, denom, op == nir_op_umod);
   else
      res = emit_idiv(bld, numer, denom, op);

   assert(alu->dest.dest.is_ssa);
   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(res));

   return true;
}

static bool
convert_impl(nir_function_impl *impl, enum nir_lower_idiv_path path)
{
   nir_builder b;
   nir_builder_init(&b, impl);
   bool progress = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type == nir_instr_type_alu &&
             path == nir_lower_idiv_precise)
            progress |= convert_instr_precise(&b, nir_instr_as_alu(instr));
         else if (instr->type == nir_instr_type_alu)
            progress |= convert_instr(&b, nir_instr_as_alu(instr));
      }
   }

   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);

   return progress;
}

bool
nir_lower_idiv(nir_shader *shader, enum nir_lower_idiv_path path)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= convert_impl(function->impl, path);
   }

   return progress;
}