/*
 * Copyright (C) 2020 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"

/* NIR creates vectors as vecN ops, which we represent by a synthetic
 * BI_COMBINE instruction, e.g.:
 *
 *      v = combine x, y, z, w
 *
 * These combines need to be lowered by the pass in this file. Fix a given
 * source at component c.
 *
 * First suppose the source is SSA. If it is also scalar, then we may rewrite
 * the destination of the generating instruction (unique by SSA+scalar) to
 * write to v.c, and rewrite each of its uses to swizzle out .c instead of .x
 * (the original by scalar). If it is a vector, there are two cases. If
 * component c is `x` (so the source accesses v.x) and the succeeding
 * components y, z, ... up to the last component of the vector are accessed
 * sequentially, then we may perform the same rewrite. If this is not the
 * case, rewriting would require a swizzle or writemask (TODO), so we fall
 * back on a move.
 *
 * Otherwise, if the source is not SSA, we also fall back on a move. We could
 * probably do better.
 */

static void
bi_insert_combine_mov(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
{
        unsigned bits = nir_alu_type_get_type_size(parent->dest_type);
        unsigned bytes = bits / 8;

        bi_instruction move = {
                .type = BI_MOV,
                .dest = R,
                .dest_type = parent->dest_type,
                .writemask = ((1 << bytes) - 1) << (bytes * comp),
                .src = { parent->src[comp] },
                .src_types = { parent->dest_type },
                .swizzle = { { parent->swizzle[comp][0] } }
        };

        bi_emit_before(ctx, parent, move);
}

/* Gets the instruction generating a given source. Combine lowering is
 * accidentally O(n^2) right now because this function is O(n) instead of
 * O(1). If this pass is slow, this cost can be avoided in favour of better
 * bookkeeping. */

static bi_instruction *
bi_get_parent(bi_context *ctx, unsigned idx, unsigned mask)
{
        bi_foreach_instr_global(ctx, ins) {
                if (ins->dest == idx)
                        if ((ins->writemask & mask) == mask)
                                return ins;
        }

        return NULL;
}
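/* As a worked illustration of the scalar rewrite described at the top of
 * this file (informal IR syntax for exposition only; the mnemonics are not
 * the compiler's), lowering the combine source at component 1 turns roughly
 *
 *      x = add a, b
 *      v = combine p, x, q, r
 *      use ... x.x ...
 *
 * into
 *
 *      R.y = add a, b
 *      use ... R.y ...
 *
 * i.e. the generating instruction's writemask is shifted onto component 1 of
 * the combine's destination register R and the use is re-swizzled, so no
 * move is needed for that component. */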
/* Rewrites uses of an index. Again, this could be O(n) to the program but is
 * currently O(nc) to the program and number of combines, so the pass becomes
 * effectively O(n^2). Better bookkeeping would bring it down to linear if
 * that's an issue. */

static void
bi_rewrite_uses(bi_context *ctx,
                unsigned old, unsigned oldc,
                unsigned new, unsigned newc)
{
        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_src(ins, s) {
                        if (ins->src[s] != old) continue;

                        /* Offset every swizzle byte, not just the first */
                        for (unsigned i = 0; i < 16; ++i)
                                ins->swizzle[s][i] += (newc - oldc);

                        ins->src[s] = new;
                }
        }
}

/* Shifts the writemask of an instruction by a specified byte count,
 * rotating the sources to compensate. Returns true if successful, and
 * returns false if not (nondestructive in this case). */

static bool
bi_shift_mask(bi_instruction *ins, unsigned shift)
{
        /* No-op, and handles the funny cases */
        if (!shift)
                return true;

        unsigned sz = nir_alu_type_get_type_size(ins->dest_type);
        unsigned bytes = sz / 8;

        /* If things are misaligned, we bail. Check if shift % bytes is
         * nonzero. Note bytes is a power-of-two. */
        if (shift & (bytes - 1))
                return false;

        /* Ensure there are no funny types */
        bi_foreach_src(ins, s) {
                if (ins->src[s] && nir_alu_type_get_type_size(ins->src_types[s]) != sz)
                        return false;
        }

        /* Shift swizzle so the old i'th component is accessed by the new
         * (i + j)'th component, where j is the component shift */
        unsigned component_shift = shift / bytes;

        /* Sanity check to avoid memory corruption */
        if (component_shift >= sizeof(ins->swizzle[0]))
                return false;

        /* Otherwise, shift is divisible by bytes, and all relevant src types
         * are the same size as the dest type. */

        ins->writemask <<= shift;

        bi_foreach_src(ins, s) {
                if (!ins->src[s]) continue;

                size_t overlap = sizeof(ins->swizzle[s]) - component_shift;
                memmove(ins->swizzle[s] + component_shift, ins->swizzle[s], overlap);
        }

        return true;
}

/* Checks if we have a nicely aligned vector prefix */

static bool
bi_is_aligned_vec(bi_instruction *combine, unsigned s, bi_instruction *parent,
                  unsigned *count)
{
        /* We only support prefixes */
        if (s != 0)
                return false;

        /* Is it a contiguous write? */
        unsigned writes = util_bitcount(parent->writemask);
        if (parent->writemask != ((1 << writes) - 1))
                return false;

        /* Okay - how many components? */
        unsigned bytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
        unsigned components = writes / bytes;

        /* Are we contiguous like that? */
        for (unsigned i = 0; i < components; ++i) {
                if (combine->src[i] != parent->dest)
                        return false;

                if (combine->swizzle[i][0] != i)
                        return false;
        }

        /* We're good to go */
        *count = components;
        return true;
}
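/* As a sketch of the aligned-prefix case that bi_is_aligned_vec detects
 * (again, informal IR syntax for exposition only): if the leading combine
 * sources form a contiguous prefix of a single vector write, e.g.
 *
 *      uv = some_vec2_op ...
 *      v  = combine uv.x, uv.y, z, w
 *
 * then the vec2-producing instruction can be retargeted at the combine's
 * destination register R once, and only the trailing components z and w
 * still need moves. */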
/* Tries to lower a given source of a combine to an appropriate rewrite,
 * returning true if successful, and false with no changes otherwise. */

static bool
bi_lower_combine_src(bi_context *ctx, bi_instruction *ins, unsigned s,
                     unsigned R, unsigned *vec_count)
{
        unsigned src = ins->src[s];

        /* We currently only handle SSA */

        if (!src)
                return false;

        if (src & (BIR_SPECIAL | BIR_IS_REG))
                return false;

        /* We are SSA. Look up the generating instruction. */
        unsigned bytes = nir_alu_type_get_type_size(ins->dest_type) / 8;

        bi_instruction *parent = bi_get_parent(ctx, src,
                        0xF << (ins->swizzle[s][0] * bytes));

        if (!parent)
                return false;

        /* We have a parent instruction, sanity check the typesize */
        unsigned pbytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
        if (pbytes != bytes)
                return false;

        bool scalar = (parent->writemask == ((1 << bytes) - 1));
        if (!(scalar || bi_is_aligned_vec(ins, s, parent, vec_count)))
                return false;

        if (!bi_shift_mask(parent, bytes * s))
                return false;

        bi_rewrite_uses(ctx, parent->dest, 0, R, s);
        parent->dest = R;
        return true;
}

void
bi_lower_combine(bi_context *ctx, bi_block *block)
{
        bi_foreach_instr_in_block_safe(block, ins) {
                if (ins->type != BI_COMBINE)
                        continue;

                /* The vector itself can't be shifted */
                assert(ins->writemask & 0x1);

                unsigned R = bi_make_temp_reg(ctx);

                bi_foreach_src(ins, s) {
                        unsigned vec_count = 0;

                        if (bi_lower_combine_src(ctx, ins, s, R, &vec_count)) {
                                /* Skip vectored sources */
                                if (vec_count)
                                        s += (vec_count - 1);
                        } else {
                                bi_insert_combine_mov(ctx, ins, s, R);
                        }
                }

                bi_rewrite_uses(ctx, ins->dest, 0, R, 0);
                bi_remove_instruction(ins);
        }
}
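/* A minimal sketch of how this pass might be driven from the backend,
 * assuming the usual per-block iteration (the actual call site lives in the
 * compiler's main lowering loop, not in this file):
 *
 *      bi_foreach_block(ctx, _block) {
 *              bi_lower_combine(ctx, (bi_block *) _block);
 *      }
 */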