/*
 * Copyright (c) 2014 Scott Mansell
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/u_blitter.h"
#include "util/u_prim.h"
#include "util/u_format.h"
#include "util/u_pack_color.h"
#include "util/u_upload_mgr.h"
#include "indices/u_primconvert.h"

#include "vc4_context.h"
#include "vc4_resource.h"

#define VC4_HW_2116_COUNT		0x1ef0

static void
vc4_get_draw_cl_space(struct vc4_job *job, int vert_count)
{
        /* The SW-5891 workaround may cause us to emit multiple shader recs
         * and draw packets.
         */
        int num_draws = DIV_ROUND_UP(vert_count, 65535) + 1;

        /* Binner gets our packet state -- vc4_emit.c contents,
         * and the primitive itself.
         */
        cl_ensure_space(&job->bcl,
                        256 + (VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE +
                               VC4_PACKET_GL_SHADER_STATE_SIZE) * num_draws);

        /* Nothing for rcl -- that's covered by vc4_context.c */

        /* shader_rec gets up to 12 dwords of reloc handles plus a maximally
         * sized shader_rec (104 bytes base for 8 vattrs plus 32 bytes of
         * vattr stride).
         */
        cl_ensure_space(&job->shader_rec,
                        (12 * sizeof(uint32_t) + 104 + 8 * 32) * num_draws);

        /* Uniforms are covered by vc4_write_uniforms(). */

        /* There could be up to 16 textures per stage, plus misc other
         * pointers.
         */
        cl_ensure_space(&job->bo_handles, (2 * 16 + 20) * sizeof(uint32_t));
        cl_ensure_space(&job->bo_pointers,
                        (2 * 16 + 20) * sizeof(struct vc4_bo *));
}

/**
 * Does the initial bining command list setup for drawing to a given FBO.
 */
static void
vc4_start_draw(struct vc4_context *vc4)
{
        struct vc4_job *job = vc4->job;

        if (job->needs_flush)
                return;

        vc4_get_draw_cl_space(job, 0);

        struct vc4_cl_out *bcl = cl_start(&job->bcl);
        //   Tile state data is 48 bytes per tile, I think it can be thrown away
        //   as soon as binning is finished.
        cl_u8(&bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG);
        cl_u32(&bcl, 0); /* tile alloc addr, filled by kernel */
        cl_u32(&bcl, 0); /* tile alloc size, filled by kernel */
        cl_u32(&bcl, 0); /* tile state addr, filled by kernel */
        cl_u8(&bcl, job->draw_tiles_x);
        cl_u8(&bcl, job->draw_tiles_y);
        /* Other flags are filled by kernel. */
        cl_u8(&bcl, job->msaa ? VC4_BIN_CONFIG_MS_MODE_4X : 0);

        /* START_TILE_BINNING resets the statechange counters in the hardware,
         * which are what is used when a primitive is binned to a tile to
         * figure out what new state packets need to be written to that tile's
         * command list.
         */
        cl_u8(&bcl, VC4_PACKET_START_TILE_BINNING);

        /* Reset the current compressed primitives format.  This gets modified
         * by VC4_PACKET_GL_INDEXED_PRIMITIVE and
         * VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start
         * of every tile.
         */
        cl_u8(&bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT);
        cl_u8(&bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX |
                     VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES));

        job->needs_flush = true;
        job->draw_width = vc4->framebuffer.width;
        job->draw_height = vc4->framebuffer.height;

        cl_end(&job->bcl, bcl);
}

static void
vc4_predraw_check_textures(struct pipe_context *pctx,
                           struct vc4_texture_stateobj *stage_tex)
{
        struct vc4_context *vc4 = vc4_context(pctx);

        for (int i = 0; i < stage_tex->num_textures; i++) {
                struct pipe_sampler_view *view = stage_tex->textures[i];
                if (!view)
                        continue;
                struct vc4_resource *rsc = vc4_resource(view->texture);
                if (rsc->shadow_parent)
                        vc4_update_shadow_baselevel_texture(pctx, view);

                vc4_flush_jobs_writing_resource(vc4, view->texture);
        }
}

static void
vc4_emit_gl_shader_state(struct vc4_context *vc4,
                         const struct pipe_draw_info *info,
                         uint32_t extra_index_bias)
{
        struct vc4_job *job = vc4->job;
        /* VC4_DIRTY_VTXSTATE */
        struct vc4_vertex_stateobj *vtx = vc4->vtx;
        /* VC4_DIRTY_VTXBUF */
        struct vc4_vertexbuf_stateobj *vertexbuf = &vc4->vertexbuf;

        /* The simulator throws a fit if VS or CS don't read an attribute, so
         * we emit a dummy read.
         */
        uint32_t num_elements_emit = MAX2(vtx->num_elements, 1);
        /* Emit the shader record. */
        struct vc4_cl_out *shader_rec =
                cl_start_shader_reloc(&job->shader_rec, 3 + num_elements_emit);
        /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */
        cl_u16(&shader_rec,
               VC4_SHADER_FLAG_ENABLE_CLIPPING |
               (vc4->prog.fs->fs_threaded ?
                0 : VC4_SHADER_FLAG_FS_SINGLE_THREAD) |
               ((info->mode == PIPE_PRIM_POINTS &&
                 vc4->rasterizer->base.point_size_per_vertex) ?
                VC4_SHADER_FLAG_VS_POINT_SIZE : 0));

        /* VC4_DIRTY_COMPILED_FS */
        cl_u8(&shader_rec, 0); /* fs num uniforms (unused) */
        cl_u8(&shader_rec, vc4->prog.fs->num_inputs);
        cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.fs->bo, 0);
        cl_u32(&shader_rec, 0); /* UBO offset written by kernel */

        /* VC4_DIRTY_COMPILED_VS */
        cl_u16(&shader_rec, 0); /* vs num uniforms */
        cl_u8(&shader_rec, vc4->prog.vs->vattrs_live);
        cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]);
        cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.vs->bo, 0);
        cl_u32(&shader_rec, 0); /* UBO offset written by kernel */

        /* VC4_DIRTY_COMPILED_CS */
        cl_u16(&shader_rec, 0); /* cs num uniforms */
        cl_u8(&shader_rec, vc4->prog.cs->vattrs_live);
        cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]);
        cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.cs->bo, 0);
        cl_u32(&shader_rec, 0); /* UBO offset written by kernel */

        uint32_t max_index = 0xffff;
        for (int i = 0; i < vtx->num_elements; i++) {
                struct pipe_vertex_element *elem = &vtx->pipe[i];
                struct pipe_vertex_buffer *vb =
                        &vertexbuf->vb[elem->vertex_buffer_index];
                struct vc4_resource *rsc = vc4_resource(vb->buffer);
                /* not vc4->dirty tracked: vc4->last_index_bias */
                uint32_t offset = (vb->buffer_offset +
                                   elem->src_offset +
                                   vb->stride * (info->index_bias +
                                                 extra_index_bias));
                uint32_t vb_size = rsc->bo->size - offset;
                uint32_t elem_size =
                        util_format_get_blocksize(elem->src_format);

                cl_reloc(job, &job->shader_rec, &shader_rec, rsc->bo, offset);
                cl_u8(&shader_rec, elem_size - 1);
                cl_u8(&shader_rec, vb->stride);
                cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[i]);
                cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[i]);

                if (vb->stride > 0) {
                        max_index = MIN2(max_index,
                                         (vb_size - elem_size) / vb->stride);
                }
        }

        if (vtx->num_elements == 0) {
                assert(num_elements_emit == 1);
                struct vc4_bo *bo = vc4_bo_alloc(vc4->screen, 4096, "scratch VBO");
                cl_reloc(job, &job->shader_rec, &shader_rec, bo, 0);
                cl_u8(&shader_rec, 16 - 1); /* element size */
                cl_u8(&shader_rec, 0); /* stride */
                cl_u8(&shader_rec, 0); /* VS VPM offset */
                cl_u8(&shader_rec, 0); /* CS VPM offset */
                vc4_bo_unreference(&bo);
        }
        cl_end(&job->shader_rec, shader_rec);

        struct vc4_cl_out *bcl = cl_start(&job->bcl);
        /* the actual draw call. */
        cl_u8(&bcl, VC4_PACKET_GL_SHADER_STATE);
        assert(vtx->num_elements <= 8);
        /* Note that number of attributes == 0 in the packet means 8
         * attributes.  This field also contains the offset into shader_rec.
         */
        cl_u32(&bcl, num_elements_emit & 0x7);
        cl_end(&job->bcl, bcl);

        vc4_write_uniforms(vc4, vc4->prog.fs,
                           &vc4->constbuf[PIPE_SHADER_FRAGMENT],
                           &vc4->fragtex);
        vc4_write_uniforms(vc4, vc4->prog.vs,
                           &vc4->constbuf[PIPE_SHADER_VERTEX],
                           &vc4->verttex);
        vc4_write_uniforms(vc4, vc4->prog.cs,
                           &vc4->constbuf[PIPE_SHADER_VERTEX],
                           &vc4->verttex);

        vc4->last_index_bias = info->index_bias + extra_index_bias;
        vc4->max_index = max_index;
        job->shader_rec_count++;
}

/**
 * HW-2116 workaround: Flush the batch before triggering the hardware state
 * counter wraparound behavior.
 *
 * State updates are tracked by a global counter which increments at the first
 * state update after a draw or a START_BINNING.  Tiles can then have their
 * state updated at draw time with a set of cheap checks for whether the
 * state's copy of the global counter matches the global counter the last time
 * that state was written to the tile.
 *
 * The state counters are relatively small and wrap around quickly, so you
 * could get false negatives for needing to update a particular state in the
 * tile.  To avoid this, the hardware attempts to write all of the state in
 * the tile at wraparound time.  This apparently is broken, so we just flush
 * everything before that behavior is triggered.  A batch flush is sufficient
 * to get our current contents drawn and reset the counters to 0.
 *
 * Note that we can't just use VC4_PACKET_FLUSH_ALL, because that caps the
 * tiles with VC4_PACKET_RETURN_FROM_LIST.
 */
static void
vc4_hw_2116_workaround(struct pipe_context *pctx, int vert_count)
{
        struct vc4_context *vc4 = vc4_context(pctx);
        struct vc4_job *job = vc4_get_job_for_fbo(vc4);

        if (job->draw_calls_queued + vert_count / 65535 >= VC4_HW_2116_COUNT) {
                perf_debug("Flushing batch due to HW-2116 workaround "
                           "(too many draw calls per scene\n");
                vc4_job_submit(vc4, job);
        }
}

static void
vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
{
        struct vc4_context *vc4 = vc4_context(pctx);

        if (info->mode >= PIPE_PRIM_QUADS) {
                util_primconvert_save_index_buffer(vc4->primconvert, &vc4->indexbuf);
                util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base);
                util_primconvert_draw_vbo(vc4->primconvert, info);
                perf_debug("Fallback conversion for %d %s vertices\n",
                           info->count, u_prim_name(info->mode));
                return;
        }

        /* Before setting up the draw, do any fixup blits necessary. */
        vc4_predraw_check_textures(pctx, &vc4->verttex);
        vc4_predraw_check_textures(pctx, &vc4->fragtex);

        vc4_hw_2116_workaround(pctx, info->count);

        struct vc4_job *job = vc4_get_job_for_fbo(vc4);

        vc4_get_draw_cl_space(job, info->count);

        if (vc4->prim_mode != info->mode) {
                vc4->prim_mode = info->mode;
                vc4->dirty |= VC4_DIRTY_PRIM_MODE;
        }

        vc4_start_draw(vc4);
        if (!vc4_update_compiled_shaders(vc4, info->mode)) {
                debug_warn_once("shader compile failed, skipping draw call.\n");
                return;
        }

        vc4_emit_state(pctx);

        if ((vc4->dirty & (VC4_DIRTY_VTXBUF |
                           VC4_DIRTY_VTXSTATE |
                           VC4_DIRTY_PRIM_MODE |
                           VC4_DIRTY_RASTERIZER |
                           VC4_DIRTY_COMPILED_CS |
                           VC4_DIRTY_COMPILED_VS |
                           VC4_DIRTY_COMPILED_FS |
                           vc4->prog.cs->uniform_dirty_bits |
                           vc4->prog.vs->uniform_dirty_bits |
                           vc4->prog.fs->uniform_dirty_bits)) ||
            vc4->last_index_bias != info->index_bias) {
                vc4_emit_gl_shader_state(vc4, info, 0);
        }

        vc4->dirty = 0;

        /* Note that the primitive type fields match with OpenGL/gallium
         * definitions, up to but not including QUADS.
         */
        struct vc4_cl_out *bcl = cl_start(&job->bcl);
        if (info->indexed) {
                uint32_t offset = vc4->indexbuf.offset;
                uint32_t index_size = vc4->indexbuf.index_size;
                struct pipe_resource *prsc;
                if (vc4->indexbuf.index_size == 4) {
                        prsc = vc4_get_shadow_index_buffer(pctx, &vc4->indexbuf,
                                                           info->count, &offset);
                        index_size = 2;
                } else {
                        if (vc4->indexbuf.user_buffer) {
                                prsc = NULL;
                                u_upload_data(vc4->uploader, 0,
                                              info->count * index_size, 4,
                                              vc4->indexbuf.user_buffer,
                                              &offset, &prsc);
                        } else {
                                prsc = vc4->indexbuf.buffer;
                        }
                }
                struct vc4_resource *rsc = vc4_resource(prsc);

                cl_start_reloc(&job->bcl, &bcl, 1);
                cl_u8(&bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE);
                cl_u8(&bcl,
                      info->mode |
                      (index_size == 2 ?
                       VC4_INDEX_BUFFER_U16:
                       VC4_INDEX_BUFFER_U8));
                cl_u32(&bcl, info->count);
                cl_reloc(job, &job->bcl, &bcl, rsc->bo, offset);
                cl_u32(&bcl, vc4->max_index);
                job->draw_calls_queued++;

                if (vc4->indexbuf.index_size == 4 || vc4->indexbuf.user_buffer)
                        pipe_resource_reference(&prsc, NULL);
        } else {
                uint32_t count = info->count;
                uint32_t start = info->start;
                uint32_t extra_index_bias = 0;

                while (count) {
                        uint32_t this_count = count;
                        uint32_t step = count;
                        static const uint32_t max_verts = 65535;

                        /* GFXH-515 / SW-5891: The binner emits 16 bit indices
                         * for drawarrays, which means that if start + count >
                         * 64k it would truncate the top bits.  Work around
                         * this by emitting a limited number of primitives at
                         * a time and reemitting the shader state pointing
                         * farther down the vertex attribute arrays.
                         *
                         * To do this properly for line loops or trifans, we'd
                         * need to make a new VB containing the first vertex
                         * plus whatever remainder.
                         */
                        if (extra_index_bias) {
                                cl_end(&job->bcl, bcl);
                                vc4_emit_gl_shader_state(vc4, info,
                                                         extra_index_bias);
                                bcl = cl_start(&job->bcl);
                        }

                        if (start + count > max_verts) {
                                switch (info->mode) {
                                case PIPE_PRIM_POINTS:
                                        this_count = step = max_verts;
                                        break;
                                case PIPE_PRIM_LINES:
                                        this_count = step = max_verts - (max_verts % 2);
                                        break;
                                case PIPE_PRIM_LINE_STRIP:
                                        this_count = max_verts;
                                        step = max_verts - 1;
                                        break;
                                case PIPE_PRIM_LINE_LOOP:
                                        this_count = max_verts;
                                        step = max_verts - 1;
                                        debug_warn_once("unhandled line loop "
                                                        "looping behavior with "
                                                        ">65535 verts\n");
                                        break;
                                case PIPE_PRIM_TRIANGLES:
                                        this_count = step = max_verts - (max_verts % 3);
                                        break;
                                case PIPE_PRIM_TRIANGLE_STRIP:
                                        this_count = max_verts;
                                        step = max_verts - 2;
                                        break;
                                default:
                                        debug_warn_once("unhandled primitive "
                                                        "max vert count, truncating\n");
                                        this_count = step = max_verts;
                                }
                        }

                        cl_u8(&bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE);
                        cl_u8(&bcl, info->mode);
                        cl_u32(&bcl, this_count);
                        cl_u32(&bcl, start);
                        job->draw_calls_queued++;

                        count -= step;
                        extra_index_bias += start + step;
                        start = 0;
                }
        }
        cl_end(&job->bcl, bcl);

        /* We shouldn't have tripped the HW_2116 bug with the GFXH-515
         * workaround.
         */
        assert(job->draw_calls_queued <= VC4_HW_2116_COUNT);

        if (vc4->zsa && vc4->framebuffer.zsbuf) {
                struct vc4_resource *rsc =
                        vc4_resource(vc4->framebuffer.zsbuf->texture);

                if (vc4->zsa->base.depth.enabled) {
                        job->resolve |= PIPE_CLEAR_DEPTH;
                        rsc->initialized_buffers = PIPE_CLEAR_DEPTH;
                }

                if (vc4->zsa->base.stencil[0].enabled) {
                        job->resolve |= PIPE_CLEAR_STENCIL;
                        rsc->initialized_buffers |= PIPE_CLEAR_STENCIL;
                }
        }

        job->resolve |= PIPE_CLEAR_COLOR0;

        /* If we've used half of the presumably 256MB CMA area, flush the job
         * so that we don't accumulate a job that will end up not being
         * executable.
         */
        if (job->bo_space > 128 * 1024 * 1024)
                vc4_flush(pctx);

        if (vc4_debug & VC4_DEBUG_ALWAYS_FLUSH)
                vc4_flush(pctx);
}

static uint32_t
pack_rgba(enum pipe_format format, const float *rgba)
{
        union util_color uc;
        util_pack_color(rgba, format, &uc);
        if (util_format_get_blocksize(format) == 2)
                return uc.us;
        else
                return uc.ui[0];
}

static void
vc4_clear(struct pipe_context *pctx, unsigned buffers,
          const union pipe_color_union *color, double depth, unsigned stencil)
{
        struct vc4_context *vc4 = vc4_context(pctx);
        struct vc4_job *job = vc4_get_job_for_fbo(vc4);

        /* We can't flag new buffers for clearing once we've queued draws.  We
         * could avoid this by using the 3d engine to clear.
         */
        if (job->draw_calls_queued) {
                perf_debug("Flushing rendering to process new clear.\n");
                vc4_job_submit(vc4, job);
                job = vc4_get_job_for_fbo(vc4);
        }

        if (buffers & PIPE_CLEAR_COLOR0) {
                struct vc4_resource *rsc =
                        vc4_resource(vc4->framebuffer.cbufs[0]->texture);
                uint32_t clear_color;

                if (vc4_rt_format_is_565(vc4->framebuffer.cbufs[0]->format)) {
                        /* In 565 mode, the hardware will be packing our color
                         * for us.
                         */
                        clear_color = pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM,
                                                color->f);
                } else {
                        /* Otherwise, we need to do this packing because we
                         * support multiple swizzlings of RGBA8888.
                         */
                        clear_color =
                                pack_rgba(vc4->framebuffer.cbufs[0]->format,
                                          color->f);
                }
                job->clear_color[0] = job->clear_color[1] = clear_color;
                rsc->initialized_buffers |= (buffers & PIPE_CLEAR_COLOR0);
        }

        if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
                struct vc4_resource *rsc =
                        vc4_resource(vc4->framebuffer.zsbuf->texture);
                unsigned zsclear = buffers & PIPE_CLEAR_DEPTHSTENCIL;

                /* Clearing ZS will clear both Z and stencil, so if we're
                 * trying to clear just one then we need to draw a quad to do
                 * it instead.
                 */
                if ((zsclear == PIPE_CLEAR_DEPTH ||
                     zsclear == PIPE_CLEAR_STENCIL) &&
                    (rsc->initialized_buffers & ~(zsclear | job->cleared)) &&
                    util_format_is_depth_and_stencil(vc4->framebuffer.zsbuf->format)) {
                        perf_debug("Partial clear of Z+stencil buffer, "
                                   "drawing a quad instead of fast clearing\n");
                        vc4_blitter_save(vc4);
                        util_blitter_clear(vc4->blitter,
                                           vc4->framebuffer.width,
                                           vc4->framebuffer.height,
                                           1,
                                           zsclear,
                                           NULL, depth, stencil);
                        buffers &= ~zsclear;
                        if (!buffers)
                                return;
                }

                /* Though the depth buffer is stored with Z in the high 24,
                 * for this field we just need to store it in the low 24.
                 */
                if (buffers & PIPE_CLEAR_DEPTH) {
                        job->clear_depth = util_pack_z(PIPE_FORMAT_Z24X8_UNORM,
                                                       depth);
                }
                if (buffers & PIPE_CLEAR_STENCIL)
                        job->clear_stencil = stencil;

                rsc->initialized_buffers |= zsclear;
        }

        job->draw_min_x = 0;
        job->draw_min_y = 0;
        job->draw_max_x = vc4->framebuffer.width;
        job->draw_max_y = vc4->framebuffer.height;
        job->cleared |= buffers;
        job->resolve |= buffers;

        vc4_start_draw(vc4);
}

static void
vc4_clear_render_target(struct pipe_context *pctx, struct pipe_surface *ps,
                        const union pipe_color_union *color,
                        unsigned x, unsigned y, unsigned w, unsigned h,
			bool render_condition_enabled)
{
        fprintf(stderr, "unimpl: clear RT\n");
}

static void
vc4_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *ps,
                        unsigned buffers, double depth, unsigned stencil,
                        unsigned x, unsigned y, unsigned w, unsigned h,
			bool render_condition_enabled)
{
        fprintf(stderr, "unimpl: clear DS\n");
}

void
vc4_draw_init(struct pipe_context *pctx)
{
        pctx->draw_vbo = vc4_draw_vbo;
        pctx->clear = vc4_clear;
        pctx->clear_render_target = vc4_clear_render_target;
        pctx->clear_depth_stencil = vc4_clear_depth_stencil;
}