/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *    Alyssa Rosenzweig
 */

#include "compiler.h"
#include "util/u_math.h"
#include "util/u_memory.h"

/* This pass promotes reads from uniforms from load/store ops to uniform
 * registers if it is beneficial to do so. Normally, this saves both
 * instructions and total register pressure, but it does take a toll on the
 * number of work registers that are available, so this is a balance.
 *
 * We use a heuristic to determine the ideal count, implemented by
 * mir_work_heuristic, which returns the ideal number of work registers.
 */

static bool
mir_is_promoteable_ubo(midgard_instruction *ins)
{
        /* TODO: promote unaligned access via swizzle? */

        /* Only 16-byte aligned reads within the first 16 vec4 slots are
         * candidates */

        return (ins->type == TAG_LOAD_STORE_4) &&
                (OP_IS_UBO_READ(ins->load_store.op)) &&
                !(ins->constants.u32[0] & 0xF) &&
                !(ins->load_store.arg_1) &&
                (ins->load_store.arg_2 == 0x1E) &&
                ((ins->constants.u32[0] / 16) < 16);
}

static unsigned
mir_promoteable_uniform_count(compiler_context *ctx)
{
        unsigned count = 0;

        mir_foreach_instr_global(ctx, ins) {
                if (mir_is_promoteable_ubo(ins))
                        count = MAX2(count, ins->constants.u32[0] / 16);
        }

        return count;
}

static unsigned
mir_count_live(uint16_t *live, unsigned temp_count)
{
        unsigned count = 0;

        for (unsigned i = 0; i < temp_count; ++i)
                count += util_bitcount(live[i]);

        return count;
}

static unsigned
mir_estimate_pressure(compiler_context *ctx)
{
        mir_invalidate_liveness(ctx);
        mir_compute_liveness(ctx);

        unsigned max_live = 0;

        mir_foreach_block(ctx, _block) {
                midgard_block *block = (midgard_block *) _block;
                uint16_t *live = mem_dup(block->base.live_out, ctx->temp_count * sizeof(uint16_t));

                mir_foreach_instr_in_block_rev(block, ins) {
                        unsigned count = mir_count_live(live, ctx->temp_count);
                        max_live = MAX2(max_live, count);
                        mir_liveness_ins_update(live, ins, ctx->temp_count);
                }

                free(live);
        }

        return DIV_ROUND_UP(max_live, 16);
}

static unsigned
mir_work_heuristic(compiler_context *ctx)
{
        unsigned uniform_count = mir_promoteable_uniform_count(ctx);

        /* If there are 8 or fewer uniforms, it doesn't matter what we do, so
         * allow as many work registers as needed */

        if (uniform_count <= 8)
                return 16;

        /* Otherwise, estimate the register pressure */

        unsigned pressure = mir_estimate_pressure(ctx);

        /* Prioritize not spilling above all else. The relation between the
         * pressure estimate and the actual register pressure is a little
         * murkier than we might like (due to scheduling, pipeline registers,
         * failure to pack vector registers, load/store registers, texture
         * registers...), hence why this is a heuristic parameter */

        if (pressure > 6)
                return 16;

        /* If there's no chance of spilling, prioritize UBOs and thread count */

        return 8;
}

/* Bitset of indices that will be used as a special register -- inputs to a
 * non-ALU op. We precompute this set so that testing is efficient, otherwise
 * we end up with O(mn) behaviour for n instructions and m uniform reads */

static BITSET_WORD *
mir_special_indices(compiler_context *ctx)
{
        mir_compute_temp_count(ctx);
        BITSET_WORD *bset = calloc(BITSET_WORDS(ctx->temp_count), sizeof(BITSET_WORD));

        mir_foreach_instr_global(ctx, ins) {
                /* Look for special instructions */
                bool is_ldst = ins->type == TAG_LOAD_STORE_4;
                bool is_tex = ins->type == TAG_TEXTURE_4;
                bool is_writeout = ins->compact_branch && ins->writeout;

                if (!(is_ldst || is_tex || is_writeout))
                        continue;

                /* Anything read by a special instruction is itself special */
                mir_foreach_src(ins, i) {
                        unsigned idx = ins->src[i];

                        if (idx < ctx->temp_count)
                                BITSET_SET(bset, idx);
                }
        }

        return bset;
}
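
/* Worked example of the register accounting in midgard_promote_uniforms
 * below, derived purely from the arithmetic in this file rather than from
 * additional hardware documentation: if the heuristic settles on
 * work_count = 8, then promoted_count = 24 - 8 = 16, so uniform slots 0..15
 * are eligible and slot N is assigned fixed register 23 - N (slot 0 -> r23,
 * slot 15 -> r8). With work_count = 16, only slots 0..7 fit, occupying
 * r23..r16. */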
void
midgard_promote_uniforms(compiler_context *ctx)
{
        unsigned work_count = mir_work_heuristic(ctx);
        unsigned promoted_count = 24 - work_count;

        /* First, figure out special indices a priori so we don't recompute a lot */
        BITSET_WORD *special = mir_special_indices(ctx);

        mir_foreach_instr_global_safe(ctx, ins) {
                if (!mir_is_promoteable_ubo(ins)) continue;

                unsigned off = ins->constants.u32[0];
                unsigned address = off / 16;

                /* Check if it's a promotable range */
                unsigned uniform_reg = 23 - address;

                if (address >= promoted_count) continue;

                /* It is, great! Let's promote */

                ctx->uniform_cutoff = MAX2(ctx->uniform_cutoff, address + 1);
                unsigned promoted = SSA_FIXED_REGISTER(uniform_reg);

                /* We do need the move for safety for a non-SSA dest, or if
                 * we're being fed into a special class */

                bool needs_move = (ins->dest & PAN_IS_REG) || ins->dest == ctx->blend_src1;

                if (ins->dest < ctx->temp_count)
                        needs_move |= BITSET_TEST(special, ins->dest);

                if (needs_move) {
                        unsigned type_size = nir_alu_type_get_type_size(ins->dest_type);
                        midgard_instruction mov = v_mov(promoted, ins->dest);
                        mov.dest_type = nir_type_uint | type_size;
                        mov.src_types[0] = mov.dest_type;

                        if (type_size == 64)
                                mov.alu.reg_mode = midgard_reg_mode_64;

                        uint16_t rounded = mir_round_bytemask_up(mir_bytemask(ins), type_size);
                        mir_set_bytemask(&mov, rounded);
                        mir_insert_instruction_before(ctx, ins, mov);
                } else {
                        mir_rewrite_index_src(ctx, ins->dest, promoted);
                }

                mir_remove_instruction(ins);
        }

        free(special);
}