From cb2322c7c0f95d6d1a2b90494cf5f6fd55f55638 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Tue, 18 Dec 2018 23:33:54 -0500 Subject: freedreno: a2xx: a20x hw binning Signed-off-by: Jonathan Marek --- src/gallium/drivers/freedreno/a2xx/fd2_draw.c | 32 ++++- src/gallium/drivers/freedreno/a2xx/fd2_emit.c | 52 ++++++++ src/gallium/drivers/freedreno/a2xx/fd2_emit.h | 3 +- src/gallium/drivers/freedreno/a2xx/fd2_gmem.c | 163 +++++++++++++++++++++++ src/gallium/drivers/freedreno/a2xx/fd2_program.c | 11 +- 5 files changed, 253 insertions(+), 8 deletions(-) (limited to 'src/gallium/drivers/freedreno/a2xx') diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c index f15d57cf0e0..c857c118d91 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c @@ -75,11 +75,12 @@ emit_vertexbufs(struct fd_context *ctx) // CONST(20,0) (or CONST(26,0) in soliv_vp) fd2_emit_vertex_bufs(ctx->batch->draw, 0x78, bufs, vtx->num_elements); + fd2_emit_vertex_bufs(ctx->batch->binning, 0x78, bufs, vtx->num_elements); } static void draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info, - struct fd_ringbuffer *ring, unsigned index_offset) + struct fd_ringbuffer *ring, unsigned index_offset, bool binning) { OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); @@ -119,8 +120,22 @@ draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info, OUT_RING(ring, info->min_index); /* VGT_MIN_VTX_INDX */ } + /* binning shader will take offset from C64 */ + if (binning && is_a20x(ctx->screen)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, 0x00000180); + OUT_RING(ring, fui(ctx->batch->num_vertices)); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(0.0f)); + } + + enum pc_di_vis_cull_mode vismode = USE_VISIBILITY; + if (binning || info->mode == PIPE_PRIM_POINTS) + vismode = IGNORE_VISIBILITY; + 
fd_draw_emit(ctx->batch, ring, ctx->primtypes[info->mode], - IGNORE_VISIBILITY, info, index_offset); + vismode, info, index_offset); if (is_a20x(ctx->screen)) { /* not sure why this is required, but it fixes some hangs */ @@ -145,6 +160,9 @@ fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *pinfo, if (ctx->dirty & FD_DIRTY_VTXBUF) emit_vertexbufs(ctx); + if (fd_binning_enabled) + fd2_emit_state_binning(ctx, ctx->dirty); + fd2_emit_state(ctx, ctx->dirty); /* a2xx can draw only 65535 vertices at once @@ -166,17 +184,23 @@ fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *pinfo, struct pipe_draw_info info = *pinfo; unsigned count = info.count; unsigned step = step_tbl[info.mode]; + unsigned num_vertices = ctx->batch->num_vertices; if (!step) return false; for (; count + step > 32766; count -= step) { info.count = MIN2(count, 32766); - draw_impl(ctx, &info, ctx->batch->draw, index_offset); + draw_impl(ctx, &info, ctx->batch->draw, index_offset, false); + draw_impl(ctx, &info, ctx->batch->binning, index_offset, true); info.start += step; + ctx->batch->num_vertices += step; } + /* changing this value is a hack, restore it */ + ctx->batch->num_vertices = num_vertices; } else { - draw_impl(ctx, pinfo, ctx->batch->draw, index_offset); + draw_impl(ctx, pinfo, ctx->batch->draw, index_offset, false); + draw_impl(ctx, pinfo, ctx->batch->binning, index_offset, true); } fd_context_all_clean(ctx); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c index e98f86a8257..18d69444d12 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c @@ -185,6 +185,58 @@ fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val, } } +void +fd2_emit_state_binning(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) +{ + struct fd2_blend_stateobj *blend = fd2_blend_stateobj(ctx->blend); + struct fd_ringbuffer *ring = ctx->batch->binning; + + /* 
subset of fd2_emit_state needed for hw binning on a20x */ + + if (dirty & (FD_DIRTY_PROG | FD_DIRTY_VTXSTATE)) + fd2_program_emit(ctx, ring, &ctx->prog); + + if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONST)) { + emit_constants(ring, VS_CONST_BASE * 4, + &ctx->constbuf[PIPE_SHADER_VERTEX], + (dirty & FD_DIRTY_PROG) ? ctx->prog.vp : NULL); + } + + if (dirty & FD_DIRTY_VIEWPORT) { + OUT_PKT3(ring, CP_SET_CONSTANT, 9); + OUT_RING(ring, 0x00000184); + OUT_RING(ring, fui(ctx->viewport.translate[0])); + OUT_RING(ring, fui(ctx->viewport.translate[1])); + OUT_RING(ring, fui(ctx->viewport.translate[2])); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(ctx->viewport.scale[0])); + OUT_RING(ring, fui(ctx->viewport.scale[1])); + OUT_RING(ring, fui(ctx->viewport.scale[2])); + OUT_RING(ring, fui(0.0f)); + } + + /* not sure why this is needed */ + if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) { + enum pipe_format format = + pipe_surface_format(ctx->batch->framebuffer.cbufs[0]); + bool has_alpha = util_format_has_alpha(format); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL)); + OUT_RING(ring, blend->rb_blendcontrol_alpha | + COND(has_alpha, blend->rb_blendcontrol_rgb) | + COND(!has_alpha, blend->rb_blendcontrol_no_alpha_rgb)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK)); + OUT_RING(ring, blend->rb_colormask); + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_SC_MODE_CNTL)); + OUT_RING(ring, A2XX_PA_SU_SC_MODE_CNTL_FACE_KILL_ENABLE); +} + void fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) { diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h index 5e4bddd1faa..891ed91e5a8 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h @@ -40,7 +40,8 @@ struct fd2_vertex_buf { void fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, 
uint32_t val, struct fd2_vertex_buf *vbufs, uint32_t n); -void fd2_emit_state(struct fd_context *ctx, enum fd_dirty_3d_state dirty); +void fd2_emit_state_binning(struct fd_context *ctx, const enum fd_dirty_3d_state dirty); +void fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty); void fd2_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring); void fd2_emit_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c index 56db5608c28..6a066a63730 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c @@ -39,6 +39,7 @@ #include "fd2_program.h" #include "fd2_util.h" #include "fd2_zsa.h" +#include "instr-a2xx.h" static uint32_t fmt2swap(enum pipe_format format) { @@ -366,6 +367,41 @@ fd2_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile) /* TODO blob driver seems to toss in a CACHE_FLUSH after each DRAW_INDX.. 
*/ } +static void +patch_draws(struct fd_batch *batch, enum pc_di_vis_cull_mode vismode) +{ + unsigned i; + + if (!is_a20x(batch->ctx->screen)) { + /* identical to a3xx */ + for (i = 0; i < fd_patch_num_elements(&batch->draw_patches); i++) { + struct fd_cs_patch *patch = fd_patch_element(&batch->draw_patches, i); + *patch->cs = patch->val | DRAW(0, 0, 0, vismode, 0); + } + util_dynarray_resize(&batch->draw_patches, 0); + return; + } + + if (vismode == USE_VISIBILITY) + return; + + for (i = 0; i < batch->draw_patches.size / sizeof(uint32_t*); i++) { + uint32_t *ptr = *util_dynarray_element(&batch->draw_patches, uint32_t*, i); + unsigned cnt = ptr[0] >> 16 & 0xfff; /* 5 with idx buffer, 3 without */ + + /* convert CP_DRAW_INDX_BIN to a CP_DRAW_INDX + * replace first two DWORDS with NOP and move the rest down + * (we don't want to have to move the idx buffer reloc) + */ + ptr[0] = CP_TYPE3_PKT | (CP_NOP << 8); + ptr[1] = 0x00000000; + + ptr[4] = ptr[2] & ~(1 << 14 | 1 << 15); /* remove cull_enable bits */ + ptr[2] = CP_TYPE3_PKT | ((cnt-2) << 16) | (CP_DRAW_INDX << 8); + ptr[3] = 0x00000000; + } +} + static void fd2_emit_sysmem_prep(struct fd_batch *batch) { @@ -408,6 +444,10 @@ fd2_emit_sysmem_prep(struct fd_batch *batch) OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET)); OUT_RING(ring, A2XX_PA_SC_WINDOW_OFFSET_X(0) | A2XX_PA_SC_WINDOW_OFFSET_Y(0)); + + patch_draws(batch, IGNORE_VISIBILITY); + util_dynarray_resize(&batch->draw_patches, 0); + util_dynarray_resize(&batch->shader_patches, 0); } /* before first tile */ @@ -432,6 +472,112 @@ fd2_emit_tile_init(struct fd_batch *batch) if (pfb->zsbuf) reg |= A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd_pipe2depth(pfb->zsbuf->format)); OUT_RING(ring, reg); /* RB_DEPTH_INFO */ + + /* set to zero, for some reason hardware doesn't like certain values */ + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN)); + OUT_RING(ring, 0); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, 
CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX)); + OUT_RING(ring, 0); + + if (is_a20x(ctx->screen) && fd_binning_enabled && gmem->num_vsc_pipes) { + /* patch out unneeded memory exports by changing EXEC CF to EXEC_END + * + * in the shader compiler, we guarantee that the shader ends with + * a specific pattern of ALLOC/EXEC CF pairs for the hw binning exports + * + * since the patches point only to dwords and CFs are 1.5 dwords, + * the patch is aligned and might point to an ALLOC CF + */ + for (int i = 0; i < batch->shader_patches.size / sizeof(void*); i++) { + instr_cf_t *cf = + *util_dynarray_element(&batch->shader_patches, instr_cf_t*, i); + if (cf->opc == ALLOC) + cf++; + assert(cf->opc == EXEC); + assert(cf[ctx->screen->num_vsc_pipes*2-2].opc == EXEC_END); + cf[2*(gmem->num_vsc_pipes-1)].opc = EXEC_END; + } + + patch_draws(batch, USE_VISIBILITY); + + /* initialize shader constants for the binning memexport */ + OUT_PKT3(ring, CP_SET_CONSTANT, 1 + gmem->num_vsc_pipes * 4); + OUT_RING(ring, 0x0000000C); + + for (int i = 0; i < gmem->num_vsc_pipes; i++) { + struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; + + /* XXX we know how large this needs to be.. + * should do some sort of realloc + * it should be ctx->batch->num_vertices bytes large + * with this size it will break with more than 256k vertices.. + */ + if (!pipe->bo) { + pipe->bo = fd_bo_new(ctx->dev, 0x40000, + DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_pipe[%u]", i); + } + + /* memory export address (export32): + * .x: (base_address >> 2) | 0x40000000 (?) + * .y: index (float) - set by shader + * .z: 0x4B00D000 (?) + * .w: 0x4B000000 (?) | max_index (?) 
+ */ + OUT_RELOCW(ring, pipe->bo, 0, 0x40000000, -2); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x4B00D000); + OUT_RING(ring, 0x4B000000 | 0x40000); + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 1 + gmem->num_vsc_pipes * 8); + OUT_RING(ring, 0x0000018C); + + for (int i = 0; i < gmem->num_vsc_pipes; i++) { + struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; + float off_x, off_y, mul_x, mul_y; + + /* const to transform from [-1,1] to bin coordinates for this pipe + * for x/y, [0,256/2040] = 0, [256/2040,512/2040] = 1, etc + * 8 possible values on x/y axis, + * to clip at binning stage: only use center 6x6 + * TODO: set the z parameters too so that hw binning + * can clip primitives in Z too + */ + + mul_x = 1.0f / (float) (gmem->bin_w * 8); + mul_y = 1.0f / (float) (gmem->bin_h * 8); + off_x = -pipe->x * (1.0/8.0f) + 0.125f - mul_x * gmem->minx; + off_y = -pipe->y * (1.0/8.0f) + 0.125f - mul_y * gmem->miny; + + OUT_RING(ring, fui(off_x * (256.0f/255.0f))); + OUT_RING(ring, fui(off_y * (256.0f/255.0f))); + OUT_RING(ring, 0x3f000000); + OUT_RING(ring, fui(0.0f)); + + OUT_RING(ring, fui(mul_x * (256.0f/255.0f))); + OUT_RING(ring, fui(mul_y * (256.0f/255.0f))); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(0.0f)); + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + OUT_RING(ring, 0); + + ctx->emit_ib(ring, batch->binning); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + OUT_RING(ring, 0x00000002); + } else { + patch_draws(batch, IGNORE_VISIBILITY); + } + + util_dynarray_resize(&batch->draw_patches, 0); + util_dynarray_resize(&batch->shader_patches, 0); } /* before mem2gmem */ @@ -460,6 +606,7 @@ fd2_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile) static void fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) { + struct fd_context *ctx = batch->ctx; struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = 
&batch->framebuffer; enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); @@ -486,6 +633,22 @@ fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) OUT_RING(ring, fui(0.0f)); OUT_RING(ring, fui(0.0f)); } + + if (is_a20x(ctx->screen) && fd_binning_enabled) { + struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p]; + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN)); + OUT_RING(ring, tile->n); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX)); + OUT_RING(ring, tile->n); + + /* TODO only emit this when tile->p changes */ + OUT_PKT3(ring, CP_SET_DRAW_INIT_FLAGS, 1); + OUT_RELOC(ring, pipe->bo, 0, 0, 0); + } } void diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.c b/src/gallium/drivers/freedreno/a2xx/fd2_program.c index cbb18f84c4c..84b54cf56b7 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_program.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.c @@ -65,7 +65,7 @@ delete_shader(struct fd2_shader_stateobj *so) static void emit(struct fd_ringbuffer *ring, gl_shader_stage type, - struct ir2_shader_info *info) + struct ir2_shader_info *info, struct util_dynarray *patches) { unsigned i; @@ -74,6 +74,10 @@ emit(struct fd_ringbuffer *ring, gl_shader_stage type, OUT_PKT3(ring, CP_IM_LOAD_IMMEDIATE, 2 + info->sizedwords); OUT_RING(ring, type == MESA_SHADER_FRAGMENT); OUT_RING(ring, info->sizedwords); + + if (patches) + util_dynarray_append(patches, uint32_t*, &ring->cur[info->mem_export_ptr]); + for (i = 0; i < info->sizedwords; i++) OUT_RING(ring, info->dwords[i]); } @@ -261,10 +265,11 @@ fd2_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, patch_fetches(ctx, fpi, NULL, &ctx->tex[PIPE_SHADER_FRAGMENT]); } - emit(ring, MESA_SHADER_VERTEX, vpi); + emit(ring, MESA_SHADER_VERTEX, vpi, + binning ? 
&ctx->batch->shader_patches : NULL); if (fp) { - emit(ring, MESA_SHADER_FRAGMENT, fpi); + emit(ring, MESA_SHADER_FRAGMENT, fpi, NULL); fs_gprs = (fpi->max_reg < 0) ? 0x80 : fpi->max_reg; vs_export = MAX2(1, f->inputs_count) - 1; } -- cgit v1.2.3