/*
 * Copyright © 2013-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4_surface_builder.h"

using namespace brw;

namespace {
   namespace array_utils {
      /**
       * Rearrange logical components between two register arrays.
       *
       * For each of the \p size components, component \c i is read from
       * logical position \c i * \p src_stride of \p src and written to
       * logical position \c i * \p dst_stride of a newly allocated result.
       * When both strides are one \p src is returned untouched and no
       * instructions are emitted.
       */
      static src_reg
      emit_stride(const vec4_builder &bld, const src_reg &src, unsigned size,
                  unsigned dst_stride, unsigned src_stride)
      {
         /* Unit strides on both sides: the layout already matches. */
         if (dst_stride == 1 && src_stride == 1)
            return src;

         const dst_reg result = bld.vgrf(src.type,
                                         DIV_ROUND_UP(size * dst_stride, 4));

         for (unsigned c = 0; c < size; ++c) {
            /* Logical position of this component on either side. */
            const unsigned dst_pos = c * dst_stride;
            const unsigned src_pos = c * src_stride;

            /* Positions are grouped in fours: the quotient is the offset
             * applied to the register and the remainder picks the channel.
             */
            const dst_reg to = writemask(offset(result, 8, dst_pos / 4),
                                         1 << (dst_pos % 4));
            const src_reg from =
               swizzle(offset(src, 8, src_pos / 4),
                       brw_swizzle_for_mask(1 << (src_pos % 4)));

            bld.MOV(to, from);
         }

         return src_reg(result);
      }

      /**
       * Prepare a VEC4 value for use in a message payload.
       *
       * The first \p n components of \p src are copied into a fresh
       * temporary whose remaining components are set to zero.  The
       * temporary is then passed through emit_stride() with a destination
       * stride of one if \p has_simd4x2 is true (leaving it in SIMD4x2
       * form), or four otherwise (rearranging it into a SIMD8 vector).
       * A null register is returned if \p src is unset or \p n is zero.
       */
      static src_reg
      emit_insert(const vec4_builder &bld, const src_reg &src,
                  unsigned n, bool has_simd4x2)
      {
         /* Nothing to insert. */
         if (n == 0 || src.file == BAD_FILE)
            return src_reg();

         const dst_reg padded = bld.vgrf(src.type);
         const unsigned used = (1 << n) - 1;

         bld.MOV(writemask(padded, used), src);

         /* Zero out whatever components the source did not provide. */
         if (n < 4)
            bld.MOV(writemask(padded, ~used), brw_imm_d(0));

         const unsigned dst_stride = has_simd4x2 ? 1 : 4;
         return emit_stride(bld, src_reg(padded), n, dst_stride, 1);
      }

      /**
       * Turn the register array returned by some shared unit back into a
       * VEC4.
       *
       * The first \p n components are gathered through emit_stride() using
       * a source stride of one if \p has_simd4x2 is true (the argument is
       * returned unmodified), or four otherwise (the argument is taken to be
       * in SIMD8 form).  A null register is returned if \p src is unset or
       * \p n is zero.
       */
      static src_reg
      emit_extract(const vec4_builder &bld, const src_reg src,
                   unsigned n, bool has_simd4x2)
      {
         /* Nothing to extract. */
         if (n == 0 || src.file == BAD_FILE)
            return src_reg();

         const unsigned src_stride = has_simd4x2 ? 1 : 4;
         return emit_stride(bld, src, n, 1, src_stride);
      }
   }
}

namespace brw {
   namespace surface_access {
      namespace {
         using namespace array_utils;

         /**
          * Assemble the payload of a surface message, emit the send
          * instruction \p op and return the register holding its result.
          *
          * The payload consists of the optional \p header (one register,
          * present unless its file is BAD_FILE), followed by \p addr_sz
          * registers taken from \p addr and \p src_sz registers taken from
          * \p src.  \p arg is passed to the instruction as an immediate
          * operand, \p ret_sz is the size of the result in registers and
          * \p pred is the predicate set on the instruction.
          */
         src_reg
         emit_send(const vec4_builder &bld, enum opcode op,
                   const src_reg &header,
                   const src_reg &addr, unsigned addr_sz,
                   const src_reg &src, unsigned src_sz,
                   const src_reg &surface,
                   unsigned arg, unsigned ret_sz,
                   brw_predicate pred = BRW_PREDICATE_NONE)
         {
            /* Total payload length, in registers. */
            const bool has_header = (header.file != BAD_FILE);
            const unsigned header_sz = has_header ? 1 : 0;
            const unsigned payload_sz = header_sz + addr_sz + src_sz;

            /* Fill in the payload one register at a time, \c slot being
             * the next free register.
             */
            const dst_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, payload_sz);
            unsigned slot = 0;

            if (has_header) {
               /* The header copy goes through the exec_all() builder. */
               bld.exec_all().MOV(offset(payload, 8, slot),
                                  retype(header, BRW_REGISTER_TYPE_UD));
               ++slot;
            }

            for (unsigned i = 0; i < addr_sz; ++i, ++slot) {
               bld.MOV(offset(payload, 8, slot),
                       offset(retype(addr, BRW_REGISTER_TYPE_UD), 8, i));
            }

            for (unsigned i = 0; i < src_sz; ++i, ++slot) {
               bld.MOV(offset(payload, 8, slot),
                       offset(retype(src, BRW_REGISTER_TYPE_UD), 8, i));
            }

            /* The surface index is dynamically uniform, reduce it to a
             * single scalar.
             */
            const src_reg usurface = bld.emit_uniformize(surface);

            /* Finally emit the send itself and describe its payload. */
            const dst_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, ret_sz);
            vec4_instruction *const send =
               bld.emit(op, dst, src_reg(payload), usurface, brw_imm_ud(arg));
            send->header_size = header_sz;
            send->mlen = payload_sz;
            send->size_written = ret_sz * REG_SIZE;
            send->predicate = pred;

            return src_reg(dst);
         }
      }

      /**
       * Emit an untyped surface read.
       *
       * \p dims is the number of components of the address \p addr and
       * \p size the number of components of the value read back from
       * \p surface.  The message carries no header and no data sources, and
       * its result takes a single register.
       */
      src_reg
      emit_untyped_read(const vec4_builder &bld,
                        const src_reg &surface, const src_reg &addr,
                        unsigned dims, unsigned size,
                        brw_predicate pred)
      {
         /* The address is always left in SIMD4x2 form for this message,
          * taking a single payload register.
          */
         const src_reg payload_addr = emit_insert(bld, addr, dims, true);

         return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ,
                          src_reg(),
                          payload_addr, 1,
                          src_reg(), 0,
                          surface, size, 1, pred);
      }

      /**
       * Emit an untyped surface write.
       *
       * \p dims is the number of components of the address \p addr and
       * \p size the number of components of the value \p src written to
       * \p surface.  The message carries no header and returns nothing.
       */
      void
      emit_untyped_write(const vec4_builder &bld, const src_reg &surface,
                         const src_reg &addr, const src_reg &src,
                         unsigned dims, unsigned size,
                         brw_predicate pred)
      {
         const bool simd4x2 = (bld.shader->devinfo->is_haswell ||
                               bld.shader->devinfo->gen >= 8);

         /* In SIMD4x2 form each argument fits in one register, otherwise
          * every component takes a register of its own.
          */
         const unsigned addr_regs = simd4x2 ? 1 : dims;
         const unsigned data_regs = simd4x2 ? 1 : size;

         emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE, src_reg(),
                   emit_insert(bld, addr, dims, simd4x2), addr_regs,
                   emit_insert(bld, src, size, simd4x2), data_regs,
                   surface, size, 0, pred);
      }

      /**
       * Emit an untyped surface atomic operation.
       *
       * \p dims is the number of components of the address \p addr, \p op
       * the atomic operation passed on to the message and \p rsize the
       * number of components of the returned value (either zero or one).
       * \p src0 and \p src1 are the optional operands, a register with file
       * BAD_FILE standing for an absent one.
       */
      src_reg
      emit_untyped_atomic(const vec4_builder &bld,
                          const src_reg &surface, const src_reg &addr,
                          const src_reg &src0, const src_reg &src1,
                          unsigned dims, unsigned rsize, unsigned op,
                          brw_predicate pred)
      {
         const bool simd4x2 = (bld.shader->devinfo->is_haswell ||
                               bld.shader->devinfo->gen >= 8);

         /* Count the operands that were actually provided. */
         const unsigned nsrcs = (src0.file != BAD_FILE ? 1 : 0) +
                                (src1.file != BAD_FILE ? 1 : 0);

         /* Pack the X component of each operand into the X and Y
          * components of a single vector.
          */
         const dst_reg packed = bld.vgrf(BRW_REGISTER_TYPE_UD);

         if (nsrcs >= 1)
            bld.MOV(writemask(packed, WRITEMASK_X),
                    swizzle(src0, BRW_SWIZZLE_XXXX));

         if (nsrcs >= 2)
            bld.MOV(writemask(packed, WRITEMASK_Y),
                    swizzle(src1, BRW_SWIZZLE_XXXX));

         /* Payload registers taken by the address and by the operands.
          * No operand register is counted when there are no operands,
          * since emit_insert() returns a null register in that case.
          */
         const unsigned addr_regs = simd4x2 ? 1 : dims;
         const unsigned data_regs = (nsrcs != 0 && simd4x2) ? 1 : nsrcs;

         return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC, src_reg(),
                          emit_insert(bld, addr, dims, simd4x2), addr_regs,
                          emit_insert(bld, src_reg(packed), nsrcs, simd4x2),
                          data_regs,
                          surface, op, rsize, pred);
      }

      namespace {
         /**
          * Build the header register used by typed surface messages.
          *
          * The whole register is zeroed.  On Gen7 parts other than Haswell
          * its W component is additionally set to 0x11.
          */
         src_reg
         emit_typed_message_header(const vec4_builder &bld)
         {
            const dst_reg header = bld.vgrf(BRW_REGISTER_TYPE_UD);
            const vec4_builder ubld = bld.exec_all();
            const bool gen7_not_hsw = (!bld.shader->devinfo->is_haswell &&
                                       bld.shader->devinfo->gen == 7);

            ubld.MOV(header, brw_imm_d(0));

            if (gen7_not_hsw) {
               /* On IVB the SIMD8 messages lacking a SIMD4x2 variant make
                * use of the sample mask.  Only the two X channels are used
                * in that case, so everything else gets masked out.
                */
               ubld.MOV(writemask(header, WRITEMASK_W), brw_imm_d(0x11));
            }

            return src_reg(header);
         }
      }

      /**
       * Emit a typed surface read.
       *
       * \p dims is the number of components of the address \p addr and
       * \p size the number of components of the value read back from
       * \p surface.  The result of the message is converted back into a
       * VEC4 with emit_extract() before being returned.
       */
      src_reg
      emit_typed_read(const vec4_builder &bld, const src_reg &surface,
                      const src_reg &addr, unsigned dims, unsigned size)
      {
         const bool simd4x2 = (bld.shader->devinfo->is_haswell ||
                               bld.shader->devinfo->gen >= 8);

         /* In SIMD4x2 form the address and the result fit in one register
          * each, otherwise every component takes a register of its own.
          */
         const unsigned addr_regs = simd4x2 ? 1 : dims;
         const unsigned ret_regs = simd4x2 ? 1 : size;

         const src_reg raw =
            emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ,
                      emit_typed_message_header(bld),
                      emit_insert(bld, addr, dims, simd4x2), addr_regs,
                      src_reg(), 0,
                      surface, size, ret_regs);

         return emit_extract(bld, raw, size, simd4x2);
      }

      /**
       * Emit a typed surface write.
       *
       * \p dims is the number of components of the address \p addr and
       * \p size the number of components of the value \p src written to
       * \p surface.  The message starts with the header built by
       * emit_typed_message_header() and returns nothing.
       */
      void
      emit_typed_write(const vec4_builder &bld, const src_reg &surface,
                       const src_reg &addr, const src_reg &src,
                       unsigned dims, unsigned size)
      {
         const bool simd4x2 = (bld.shader->devinfo->is_haswell ||
                               bld.shader->devinfo->gen >= 8);

         /* In SIMD4x2 form each argument fits in one register, otherwise
          * every component takes a register of its own.
          */
         const unsigned addr_regs = simd4x2 ? 1 : dims;
         const unsigned data_regs = simd4x2 ? 1 : size;

         emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE,
                   emit_typed_message_header(bld),
                   emit_insert(bld, addr, dims, simd4x2), addr_regs,
                   emit_insert(bld, src, size, simd4x2), data_regs,
                   surface, size, 0);
      }

      /**
       * Emit a typed surface atomic opcode.  \p dims determines the number of
       * components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       *
       * \p src0 and \p src1 are the optional operands of the atomic
       * operation \p op, a register with file BAD_FILE standing for an
       * absent one.  The message starts with the header built by
       * emit_typed_message_header().
       */
      src_reg
      emit_typed_atomic(const vec4_builder &bld,
                        const src_reg &surface, const src_reg &addr,
                        const src_reg &src0, const src_reg &src1,
                        unsigned dims, unsigned rsize, unsigned op,
                        brw_predicate pred)
      {
         const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 ||
                                   bld.shader->devinfo->is_haswell);

         /* Zip the components of both sources, they are represented as the X
          * and Y components of the same vector.
          *
          * NOTE(review): unlike emit_untyped_atomic() the sources are not
          * swizzled to XXXX here, so presumably the caller is expected to
          * provide \p src1 with a swizzle suitable for a write to the Y
          * component -- confirm against callers.
          */
         const unsigned size = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const dst_reg srcs = bld.vgrf(BRW_REGISTER_TYPE_UD);

         if (size >= 1)
            bld.MOV(writemask(srcs, WRITEMASK_X), src0);
         if (size >= 2)
            bld.MOV(writemask(srcs, WRITEMASK_Y), src1);

         /* emit_insert() returns a null register when there are no sources,
          * so no source register may be counted in that case even in
          * SIMD4x2 form.  Otherwise emit_send() would copy from a null
          * register and send a payload that is one register too long.
          * This matches emit_untyped_atomic().
          */
         return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC,
                          emit_typed_message_header(bld),
                          emit_insert(bld, addr, dims, has_simd4x2),
                          has_simd4x2 ? 1 : dims,
                          emit_insert(bld, src_reg(srcs), size, has_simd4x2),
                          has_simd4x2 && size ? 1 : size,
                          surface, op, rsize, pred);
      }
   }
}