diff options
author | Rob Clark <[email protected]> | 2014-01-07 10:55:07 -0500 |
---|---|---|
committer | Rob Clark <[email protected]> | 2014-01-08 16:30:18 -0500 |
commit | c0766528baaef48902c87bbdaa4f5926c472269b (patch) | |
tree | f825706059f50c37a9fda7961ec596b0fb6a65da /src/gallium/drivers/freedreno/a3xx | |
parent | bfb44c24bc1eff850d47984b2cb60c957ffc143d (diff) |
freedreno/a3xx: support for hw binning pass
The binning pass sorts vertices into the bins/tiles that they apply to.
The visibility information generated during the binning pass can be
used to speed up the rendering pass by filtering out vertices which
do not apply to the current tile. See:
https://github.com/freedreno/freedreno/wiki/Adreno-tiling#optimized-approach
This brings a significant fps boost. A rough assortment of tests
(supertuxkart, etracer, tremulous, glmark2 'build' test, etc) seems
to yield a ~35-45% fps improvement.
For now, to be conservative, the binning pass is not yet enabled by
default. To enable it, set:
FD_MESA_DEBUG=binning
So far I haven't found anything that breaks with binning enabled,
but I'd like a bit more testing before I enable it by default.
Signed-off-by: Rob Clark <[email protected]>
Diffstat (limited to 'src/gallium/drivers/freedreno/a3xx')
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_draw.c | 81 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_emit.c | 26 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_emit.h | 3 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_gmem.c | 360 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_program.c | 136 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_program.h | 2 |
6 files changed, 508 insertions, 100 deletions
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c index c5d8b774552..4c90d984955 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c @@ -43,7 +43,7 @@ static void -emit_vertexbufs(struct fd_context *ctx) +emit_vertexbufs(struct fd_context *ctx, struct fd_ringbuffer *ring) { struct fd_vertex_stateobj *vtx = ctx->vtx; struct fd_vertexbuf_stateobj *vertexbuf = &ctx->vertexbuf; @@ -63,19 +63,17 @@ emit_vertexbufs(struct fd_context *ctx) bufs[i].format = elem->src_format; } - fd3_emit_vertex_bufs(ctx->ring, &ctx->prog, bufs, vtx->num_elements); + fd3_emit_vertex_bufs(ring, &ctx->prog, bufs, vtx->num_elements); } static void -fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info) +draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info, + struct fd_ringbuffer *ring, unsigned dirty, bool binning) { - struct fd_ringbuffer *ring = ctx->ring; - unsigned dirty = ctx->dirty; - - fd3_emit_state(ctx, dirty); + fd3_emit_state(ctx, ring, dirty, binning); if (dirty & FD_DIRTY_VTXBUF) - emit_vertexbufs(ctx); + emit_vertexbufs(ctx, ring); OUT_PKT0(ring, REG_A3XX_PC_VERTEX_REUSE_BLOCK_CNTL, 1); OUT_RING(ring, 0x0000000b); /* PC_VERTEX_REUSE_BLOCK_CNTL */ @@ -90,7 +88,59 @@ fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info) OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */ info->restart_index : 0xffffffff); - fd_draw_emit(ctx, info); + fd_draw_emit(ctx, ring, binning ? IGNORE_VISIBILITY : USE_VISIBILITY, info); +} + +static void +fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info) +{ + unsigned dirty = ctx->dirty; + draw_impl(ctx, info, ctx->binning_ring, + dirty & ~(FD_DIRTY_BLEND), true); + draw_impl(ctx, info, ctx->ring, dirty, false); +} + +/* binning pass cmds for a clear: + * NOTE: newer blob drivers don't use binning for clear, which is probably + * preferable since it is low vtx count. 
However that doesn't seem to + * actually work for me. Not sure if it is depending on support for + * clear pass (rather than using solid-fill shader), or something else + * that newer blob is doing differently. Once that is figured out, we + * can remove fd3_clear_binning(). + */ +static void +fd3_clear_binning(struct fd_context *ctx, unsigned dirty) +{ + struct fd3_context *fd3_ctx = fd3_context(ctx); + struct fd_ringbuffer *ring = ctx->binning_ring; + + fd3_emit_state(ctx, ring, dirty & (FD_DIRTY_VIEWPORT | + FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR), true); + + fd3_program_emit(ring, &ctx->solid_prog, true); + + fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) { + { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT }, + }, 1); + + OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1); + OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); + OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4); + OUT_RING(ring, 0); /* VFD_INDEX_MIN */ + OUT_RING(ring, 2); /* VFD_INDEX_MAX */ + OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ + OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ + OUT_PKT0(ring, REG_A3XX_PC_RESTART_INDEX, 1); + OUT_RING(ring, 0xffffffff); /* PC_RESTART_INDEX */ + + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, PERFCOUNTER_STOP); + + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL); } static void @@ -99,11 +149,14 @@ fd3_clear(struct fd_context *ctx, unsigned buffers, { struct fd3_context *fd3_ctx = fd3_context(ctx); struct fd_ringbuffer *ring = ctx->ring; + unsigned dirty = ctx->dirty; unsigned ce, i; + fd3_clear_binning(ctx, dirty); + /* emit generic state now: */ - fd3_emit_state(ctx, ctx->dirty & (FD_DIRTY_VIEWPORT | - FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR)); + fd3_emit_state(ctx, 
ring, dirty & (FD_DIRTY_VIEWPORT | + FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR), false); OUT_PKT0(ring, REG_A3XX_RB_BLEND_ALPHA, 1); OUT_RING(ring, A3XX_RB_BLEND_ALPHA_UINT(0xff) | @@ -192,7 +245,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers, OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1); OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0)); - fd3_program_emit(ring, &ctx->solid_prog); + fd3_program_emit(ring, &ctx->solid_prog, false); fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) { { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT }, @@ -216,8 +269,8 @@ fd3_clear(struct fd_context *ctx, unsigned buffers, OUT_PKT3(ring, CP_EVENT_WRITE, 1); OUT_RING(ring, PERFCOUNTER_STOP); - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, USE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL); } void diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index 91993725ea6..9cfe4ddb662 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -337,10 +337,9 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, } void -fd3_emit_state(struct fd_context *ctx, uint32_t dirty) +fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, + uint32_t dirty, bool binning) { - struct fd_ringbuffer *ring = ctx->ring; - emit_marker(ring, 5); if (dirty & FD_DIRTY_SAMPLE_MASK) { @@ -354,7 +353,8 @@ fd3_emit_state(struct fd_context *ctx, uint32_t dirty) struct fd3_zsa_stateobj *zsa = fd3_zsa_stateobj(ctx->zsa); struct pipe_stencil_ref *sr = &ctx->stencil_ref; - fd3_emit_rbrc_draw_state(ctx, ring, zsa->rb_render_control); + if (!binning) + fd3_emit_rbrc_draw_state(ctx, ring, zsa->rb_render_control); OUT_PKT0(ring, REG_A3XX_RB_ALPHA_REF, 1); OUT_RING(ring, zsa->rb_alpha_ref); @@ -432,7 +432,10 @@ fd3_emit_state(struct 
fd_context *ctx, uint32_t dirty) } if (dirty & FD_DIRTY_PROG) - fd3_program_emit(ring, &ctx->prog); + fd3_program_emit(ring, &ctx->prog, binning); + + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, HLSQ_FLUSH); if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) { struct fd_program_stateobj *prog = &ctx->prog; @@ -566,11 +569,11 @@ fd3_emit_restore(struct fd_context *ctx) OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0) | A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0)); - OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_MODE_CONTROL_REG, 1); - OUT_RING(ring, 0x00000001); /* UCHE_CACHE_MODE_CONTROL_REG */ - - OUT_PKT0(ring, REG_A3XX_VSC_SIZE_ADDRESS, 1); - OUT_RELOC(ring, fd3_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */ + OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2); + OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0)); + OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) | + A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) | + A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE); OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); OUT_RING(ring, 0x00000000); /* GRAS_CL_CLIP_CNTL */ @@ -604,6 +607,9 @@ fd3_emit_restore(struct fd_context *ctx) OUT_RING(ring, 0x00000000); /* GRAS_CL_USER_PLANE[i].W */ } + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, 0x00000000); + emit_cache_flush(ring); fd_rmw_wfi(ctx, ring); } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h index bf7787ab6f7..50559d10d22 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h @@ -58,7 +58,8 @@ struct fd3_vertex_buf { void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd_program_stateobj *prog, struct fd3_vertex_buf *vbufs, uint32_t n); -void fd3_emit_state(struct fd_context *ctx, uint32_t dirty); +void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, + uint32_t dirty, bool binning); void fd3_emit_restore(struct fd_context 
*ctx); diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c index 3d0a607ed28..8720e087b7b 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c @@ -106,6 +106,159 @@ depth_base(struct fd_gmem_stateobj *gmem) return align(gmem->bin_w * gmem->bin_h, 0x4000); } +static bool +use_hw_binning(struct fd_context *ctx) +{ + struct fd_gmem_stateobj *gmem = &ctx->gmem; + return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2); +} + +/* workaround for (hlsq?) lockup with hw binning on a3xx patchlevel 0 */ +static void update_vsc_pipe(struct fd_context *ctx); +static void +emit_binning_workaround(struct fd_context *ctx) +{ + struct fd3_context *fd3_ctx = fd3_context(ctx); + struct fd_gmem_stateobj *gmem = &ctx->gmem; + struct fd_ringbuffer *ring = ctx->ring; + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE); + OUT_RING(ring, A3XX_RB_RENDER_CONTROL_BIN_WIDTH(32) | + A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | + A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER)); + + OUT_PKT0(ring, REG_A3XX_RB_COPY_CONTROL, 4); + OUT_RING(ring, A3XX_RB_COPY_CONTROL_MSAA_RESOLVE(MSAA_ONE) | + A3XX_RB_COPY_CONTROL_MODE(0) | + A3XX_RB_COPY_CONTROL_GMEM_BASE(0)); + OUT_RELOC(ring, fd_resource(fd3_ctx->solid_vbuf)->bo, 0x20, 0, -1); /* RB_COPY_DEST_BASE */ + OUT_RING(ring, A3XX_RB_COPY_DEST_PITCH_PITCH(128)); + OUT_RING(ring, A3XX_RB_COPY_DEST_INFO_TILE(LINEAR) | + A3XX_RB_COPY_DEST_INFO_FORMAT(RB_R8G8B8A8_UNORM) | + A3XX_RB_COPY_DEST_INFO_SWAP(WZYX) | + A3XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) | + A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(1)); + + 
fd3_program_emit(ring, &ctx->solid_prog, false); + + fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) { + { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT }, + }, 1); + + OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 4); + OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) | + A3XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE | + A3XX_HLSQ_CONTROL_0_REG_RESERVED2 | + A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE); + OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) | + A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE); + OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31)); + OUT_RING(ring, 0); /* HLSQ_CONTROL_3_REG */ + + OUT_PKT0(ring, REG_A3XX_HLSQ_CONST_FSPRESV_RANGE_REG, 1); + OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0x20) | + A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0x20)); + + OUT_PKT0(ring, REG_A3XX_RB_MSAA_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MSAA_CONTROL_DISABLE | + A3XX_RB_MSAA_CONTROL_SAMPLES(MSAA_ONE) | + A3XX_RB_MSAA_CONTROL_SAMPLE_MASK(0xffff)); + + OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1); + OUT_RING(ring, A3XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_NEVER)); + + OUT_PKT0(ring, REG_A3XX_RB_STENCIL_CONTROL, 1); + OUT_RING(ring, A3XX_RB_STENCIL_CONTROL_FUNC(FUNC_NEVER) | + A3XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_NEVER) | + A3XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0.0)); + + OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4); + OUT_RING(ring, 0); /* VFD_INDEX_MIN */ + OUT_RING(ring, 2); /* VFD_INDEX_MAX */ + OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ + OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ + + OUT_PKT0(ring, 
REG_A3XX_PC_PRIM_VTX_CNTL, 1); + OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(1)); + OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(0) | + A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(1)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(31) | + A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 6); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_XOFFSET(0.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE(1.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_YOFFSET(0.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_YSCALE(1.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZOFFSET(0.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(1.0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, A3XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE | + A3XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE | + A3XX_GRAS_CL_CLIP_CNTL_VP_CLIP_CODE_IGNORE | + A3XX_GRAS_CL_CLIP_CNTL_VP_XFORM_DISABLE | + A3XX_GRAS_CL_CLIP_CNTL_PERSP_DIVISION_DISABLE); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_GB_CLIP_ADJ, 1); + OUT_RING(ring, A3XX_GRAS_CL_GB_CLIP_ADJ_HORZ(0) | + A3XX_GRAS_CL_GB_CLIP_ADJ_VERT(0)); + + OUT_PKT3(ring, CP_DRAW_INDX_2, 5); + OUT_RING(ring, 0x00000000); /* viz query info. 
*/ + OUT_RING(ring, DRAW(DI_PT_RECTLIST, DI_SRC_SEL_IMMEDIATE, + INDEX_SIZE_32_BIT, IGNORE_VISIBILITY)); + OUT_RING(ring, 2); /* NumIndices */ + OUT_RING(ring, 2); + OUT_RING(ring, 1); + + OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 1); + OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(TWO_QUADS)); + + OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); + OUT_RING(ring, 0x00000000); + + OUT_WFI(ring); + + OUT_PKT0(ring, REG_A3XX_VSC_BIN_SIZE, 1); + OUT_RING(ring, A3XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | + A3XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, 0x00000000); +} + /* transfer from gmem to system memory (ie. normal RAM) */ static void @@ -129,8 +282,8 @@ emit_gmem2mem_surf(struct fd_context *ctx, A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE) | A3XX_RB_COPY_DEST_INFO_SWAP(fd3_pipe2swap(psurf->format))); - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL); } static void @@ -210,7 +363,7 @@ fd3_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ - fd3_program_emit(ring, &ctx->solid_prog); + fd3_program_emit(ring, &ctx->solid_prog, false); fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) { { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT }, @@ -252,8 +405,8 @@ emit_mem2gmem_surf(struct fd_context *ctx, uint32_t base, fd3_emit_gmem_restore_tex(ring, psurf); - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + 
DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL); } static void @@ -355,7 +508,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ - fd3_program_emit(ring, &ctx->blit_prog); + fd3_program_emit(ring, &ctx->blit_prog, false); fd3_emit_vertex_bufs(ring, &ctx->blit_prog, (struct fd3_vertex_buf[]) { { .prsc = fd3_ctx->blit_texcoord_vbuf, .stride = 8, .format = PIPE_FORMAT_R32G32_FLOAT }, @@ -381,11 +534,68 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) } static void +patch_draws(struct fd_context *ctx, enum pc_di_vis_cull_mode vismode) +{ + unsigned i; + for (i = 0; i < fd_patch_num_elements(&ctx->draw_patches); i++) { + struct fd_cs_patch *patch = fd_patch_element(&ctx->draw_patches, i); + *patch->cs = patch->val | DRAW(0, 0, 0, vismode); + } + util_dynarray_resize(&ctx->draw_patches, 0); +} + +/* for rendering directly to system memory: */ +static void +fd3_emit_sysmem_prep(struct fd_context *ctx) +{ + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + struct fd_ringbuffer *ring = ctx->ring; + uint32_t pitch = 0; + + if (pfb->cbufs[0]) + pitch = fd_resource(pfb->cbufs[0]->texture)->slices[0].pitch; + + fd3_emit_restore(ctx); + + OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1); + OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | + A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); + + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0); + + OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(pitch)); + + /* setup scissor/offset for current tile: */ + OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1); + OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(0) | + A3XX_RB_WINDOW_OFFSET_Y(0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | + 
A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(pfb->width - 1) | + A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1)); + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_RB_MODE_CONTROL_GMEM_BYPASS | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE); + + patch_draws(ctx, IGNORE_VISIBILITY); +} + +static void update_vsc_pipe(struct fd_context *ctx) { + struct fd3_context *fd3_ctx = fd3_context(ctx); struct fd_ringbuffer *ring = ctx->ring; int i; + OUT_PKT0(ring, REG_A3XX_VSC_SIZE_ADDRESS, 1); + OUT_RELOC(ring, fd3_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */ + for (i = 0; i < 8; i++) { struct fd_vsc_pipe *pipe = &ctx->pipe[i]; @@ -394,7 +604,7 @@ update_vsc_pipe(struct fd_context *ctx) DRM_FREEDRENO_GEM_TYPE_KMEM); } - OUT_PKT0(ring, REG_A3XX_VSC_PIPE(0), 3); + OUT_PKT0(ring, REG_A3XX_VSC_PIPE(i), 3); OUT_RING(ring, A3XX_VSC_PIPE_CONFIG_X(pipe->x) | A3XX_VSC_PIPE_CONFIG_Y(pipe->y) | A3XX_VSC_PIPE_CONFIG_W(pipe->w) | @@ -404,34 +614,45 @@ update_vsc_pipe(struct fd_context *ctx) } } -/* for rendering directly to system memory: */ static void -fd3_emit_sysmem_prep(struct fd_context *ctx) +emit_binning_pass(struct fd_context *ctx) { struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct fd_ringbuffer *ring = ctx->ring; - uint32_t pitch = 0; + int i; - if (pfb->cbufs[0]) - pitch = fd_resource(pfb->cbufs[0]->texture)->slices[0].pitch; + if (ctx->screen->gpu_id == 320) { + emit_binning_workaround(ctx); - fd3_emit_restore(ctx); + OUT_PKT3(ring, CP_INVALIDATE_STATE, 1); + OUT_RING(ring, 0x00007fff); + } + + OUT_PKT0(ring, REG_A3XX_VSC_BIN_CONTROL, 1); + OUT_RING(ring, A3XX_VSC_BIN_CONTROL_BINNING_ENABLE); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_TILING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); OUT_PKT0(ring, 
REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1); OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0); - OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | - A3XX_RB_RENDER_CONTROL_BIN_WIDTH(pitch)); + A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(ctx->gmem.bin_w)); - /* setup scissor/offset for current tile: */ + /* setup scissor/offset for whole screen: */ OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1); OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(0) | A3XX_RB_WINDOW_OFFSET_Y(0)); + OUT_PKT0(ring, REG_A3XX_RB_LRZ_VSC_CONTROL, 1); + OUT_RING(ring, A3XX_RB_LRZ_VSC_CONTROL_BINNING_ENABLE); + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); @@ -439,9 +660,72 @@ fd3_emit_sysmem_prep(struct fd_context *ctx) A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1)); OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_TILING_PASS) | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE); + + for (i = 0; i < 4; i++) { + OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1); + OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(0) | + A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_DISABLE) | + A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0)); + } + + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, A3XX_PC_VSTREAM_CONTROL_SIZE(1) | + A3XX_PC_VSTREAM_CONTROL_N(0)); + + /* emit IB to binning drawcmds: */ + OUT_IB(ring, ctx->binning_start, ctx->binning_end); + + /* and then put stuff back the way it was: */ + + OUT_PKT0(ring, REG_A3XX_VSC_BIN_CONTROL, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1); + OUT_RING(ring, A3XX_SP_SP_CTRL_REG_RESOLVE | + A3XX_SP_SP_CTRL_REG_CONSTMODE(1) | + A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) | + 
A3XX_SP_SP_CTRL_REG_L0MODE(0)); + + OUT_PKT0(ring, REG_A3XX_RB_LRZ_VSC_CONTROL, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2); OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_RB_MODE_CONTROL_GMEM_BYPASS | A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE); + OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ENABLE_GMEM | + A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(ctx->gmem.bin_w)); + + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, CACHE_FLUSH); + + if (ctx->screen->gpu_id == 320) { + /* dummy-draw workaround: */ + OUT_PKT3(ring, CP_DRAW_INDX, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX, + INDEX_SIZE_IGN, IGNORE_VISIBILITY)); + OUT_RING(ring, 0); /* NumIndices */ + } + + OUT_PKT3(ring, CP_NOP, 4); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_WFI(ring); + + if (ctx->screen->gpu_id == 320) { + emit_binning_workaround(ctx); + } } /* before first tile */ @@ -461,6 +745,18 @@ fd3_emit_tile_init(struct fd_context *ctx) A3XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); update_vsc_pipe(ctx); + + if (use_hw_binning(ctx)) { + /* mark the end of the binning cmds: */ + fd_ringmarker_mark(ctx->binning_end); + + /* emit hw binning pass: */ + emit_binning_pass(ctx); + + patch_draws(ctx, USE_VISIBILITY); + } else { + patch_draws(ctx, IGNORE_VISIBILITY); + } } /* before mem2gmem */ @@ -472,7 +768,6 @@ fd3_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile) struct fd_gmem_stateobj *gmem = &ctx->gmem; uint32_t reg; - OUT_PKT0(ring, REG_A3XX_RB_DEPTH_INFO, 2); reg = A3XX_RB_DEPTH_INFO_DEPTH_BASE(depth_base(gmem)); if (pfb->zsbuf) { @@ -499,6 +794,7 @@ 
fd3_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile) static void fd3_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) { + struct fd3_context *fd3_ctx = fd3_context(ctx); struct fd_ringbuffer *ring = ctx->ring; struct fd_gmem_stateobj *gmem = &ctx->gmem; struct pipe_framebuffer_state *pfb = &ctx->framebuffer; @@ -508,6 +804,32 @@ fd3_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) uint32_t x2 = tile->xoff + tile->bin_w - 1; uint32_t y2 = tile->yoff + tile->bin_h - 1; + if (use_hw_binning(ctx)) { + struct fd_vsc_pipe *pipe = &ctx->pipe[tile->p]; + + assert(pipe->w * pipe->h); + + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, HLSQ_FLUSH); + + OUT_WFI(ring); + + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, A3XX_PC_VSTREAM_CONTROL_SIZE(pipe->w * pipe->h) | + A3XX_PC_VSTREAM_CONTROL_N(tile->n)); + + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, CACHE_FLUSH); + + OUT_PKT3(ring, CP_SET_BIN_DATA, 2); + OUT_RELOC(ring, pipe->bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ + OUT_RELOC(ring, fd3_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */ + (tile->p * 4), 0, 0); + } else { + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, 0x00000000); + } + OUT_PKT3(ring, CP_SET_BIN, 3); OUT_RING(ring, 0x00000000); OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1)); diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index c02b14cba39..2622006ff09 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -36,6 +36,7 @@ #include "fd3_program.h" #include "fd3_compiler.h" +#include "fd3_emit.h" #include "fd3_texture.h" #include "fd3_util.h" @@ -175,9 +176,9 @@ fd3_vp_state_bind(struct pipe_context *pctx, void *hwcso) } static void -emit_shader(struct fd_ringbuffer *ring, struct fd3_shader_stateobj *so) +emit_shader(struct 
fd_ringbuffer *ring, const struct fd3_shader_stateobj *so) { - struct ir3_shader_info *si = &so->info; + const struct ir3_shader_info *si = &so->info; enum adreno_state_block sb; enum adreno_state_src src; uint32_t i, sz, *bin; @@ -216,7 +217,7 @@ emit_shader(struct fd_ringbuffer *ring, struct fd3_shader_stateobj *so) } static int -find_output(struct fd3_shader_stateobj *so, fd3_semantic semantic) +find_output(const struct fd3_shader_stateobj *so, fd3_semantic semantic) { int j; for (j = 0; j < so->outputs_count; j++) @@ -227,14 +228,21 @@ find_output(struct fd3_shader_stateobj *so, fd3_semantic semantic) void fd3_program_emit(struct fd_ringbuffer *ring, - struct fd_program_stateobj *prog) + struct fd_program_stateobj *prog, bool binning) { - struct fd3_shader_stateobj *vp = prog->vp; - struct fd3_shader_stateobj *fp = prog->fp; - struct ir3_shader_info *vsi = &vp->info; - struct ir3_shader_info *fsi = &fp->info; + const struct fd3_shader_stateobj *vp = prog->vp; + const struct fd3_shader_stateobj *fp = prog->fp; + const struct ir3_shader_info *vsi = &vp->info; + const struct ir3_shader_info *fsi = &fp->info; int i; + if (binning) { + /* use dummy stateobj to simplify binning vs non-binning: */ + static const struct fd3_shader_stateobj binning_fp = {}; + fp = &binning_fp; + fsi = &fp->info; + } + /* we could probably divide this up into things that need to be * emitted if frag-prog is dirty vs if vert-prog is dirty.. */ @@ -260,11 +268,9 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1); OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(0) | + COND(binning, A3XX_SP_SP_CTRL_REG_BINNING) | A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) | - // XXX "resolve" (?) bit set on gmem->mem pass.. 
-// COND(!uniforms, A3XX_SP_SP_CTRL_REG_RESOLVE) | - // XXX sometimes 0, sometimes 1: - A3XX_SP_SP_CTRL_REG_LOMODE(1)); + A3XX_SP_SP_CTRL_REG_L0MODE(0)); OUT_PKT0(ring, REG_A3XX_SP_VS_LENGTH_REG, 1); OUT_RING(ring, A3XX_SP_VS_LENGTH_REG_SHADERLENGTH(vp->instrlen)); @@ -272,6 +278,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3); OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) | A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | + A3XX_SP_VS_CTRL_REG0_CACHEINVALID | A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) | A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) | A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) | @@ -323,28 +330,38 @@ fd3_program_emit(struct fd_ringbuffer *ring, A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); OUT_RELOC(ring, vp->bo, 0, 0, 0); /* SP_VS_OBJ_START_REG */ - OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); - OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen)); - - OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); - OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | - A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | - A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) | - A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) | - A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | - A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | - A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | - COND(fp->samplers_count > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) | - A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen)); - OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) | - A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) | - A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fsi->max_const, 0)) | - A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63)); - - OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2); - OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) | - A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); - OUT_RELOC(ring, fp->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ + if (binning) { + 
OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER)); + OUT_RING(ring, 0x00000000); + } else { + OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen)); + + OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | + A3XX_SP_FS_CTRL_REG0_CACHEINVALID | + A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) | + A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) | + A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | + A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | + A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | + COND(fp->samplers_count > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) | + A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen)); + OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) | + A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) | + A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fsi->max_const, 0)) | + A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63)); + OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2); + OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) | + A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); + OUT_RELOC(ring, fp->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ + } OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2); OUT_RING(ring, 0x00000000); /* SP_FS_FLAT_SHAD_MODE_REG_0 */ @@ -360,24 +377,31 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0)); OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0)); - OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); - OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) | - A3XX_VPC_ATTR_THRDASSIGN(1) | - A3XX_VPC_ATTR_LMSIZE(1)); - OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp->total_in) | - A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in)); - - OUT_PKT0(ring, 
REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4); - OUT_RING(ring, fp->vinterp[0]); /* VPC_VARYING_INTERP[0].MODE */ - OUT_RING(ring, fp->vinterp[1]); /* VPC_VARYING_INTERP[1].MODE */ - OUT_RING(ring, fp->vinterp[2]); /* VPC_VARYING_INTERP[2].MODE */ - OUT_RING(ring, fp->vinterp[3]); /* VPC_VARYING_INTERP[3].MODE */ - - OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4); - OUT_RING(ring, fp->vpsrepl[0]); /* VPC_VARYING_PS_REPL[0].MODE */ - OUT_RING(ring, fp->vpsrepl[1]); /* VPC_VARYING_PS_REPL[1].MODE */ - OUT_RING(ring, fp->vpsrepl[2]); /* VPC_VARYING_PS_REPL[2].MODE */ - OUT_RING(ring, fp->vpsrepl[3]); /* VPC_VARYING_PS_REPL[3].MODE */ + if (binning) { + OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); + OUT_RING(ring, A3XX_VPC_ATTR_THRDASSIGN(1) | + A3XX_VPC_ATTR_LMSIZE(1)); + OUT_RING(ring, 0x00000000); + } else { + OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); + OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) | + A3XX_VPC_ATTR_THRDASSIGN(1) | + A3XX_VPC_ATTR_LMSIZE(1)); + OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp->total_in) | + A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in)); + + OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4); + OUT_RING(ring, fp->vinterp[0]); /* VPC_VARYING_INTERP[0].MODE */ + OUT_RING(ring, fp->vinterp[1]); /* VPC_VARYING_INTERP[1].MODE */ + OUT_RING(ring, fp->vinterp[2]); /* VPC_VARYING_INTERP[2].MODE */ + OUT_RING(ring, fp->vinterp[3]); /* VPC_VARYING_INTERP[3].MODE */ + + OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4); + OUT_RING(ring, fp->vpsrepl[0]); /* VPC_VARYING_PS_REPL[0].MODE */ + OUT_RING(ring, fp->vpsrepl[1]); /* VPC_VARYING_PS_REPL[1].MODE */ + OUT_RING(ring, fp->vpsrepl[2]); /* VPC_VARYING_PS_REPL[2].MODE */ + OUT_RING(ring, fp->vpsrepl[3]); /* VPC_VARYING_PS_REPL[3].MODE */ + } OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1); OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) | @@ -388,10 +412,12 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); 
OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ - emit_shader(ring, fp); + if (!binning) { + emit_shader(ring, fp); - OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); - OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ + OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); + OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ + } OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2); OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(vp->total_in) | diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.h b/src/gallium/drivers/freedreno/a3xx/fd3_program.h index 85c22a54cf7..bd6483ff42c 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.h @@ -117,7 +117,7 @@ struct fd3_shader_stateobj { }; void fd3_program_emit(struct fd_ringbuffer *ring, - struct fd_program_stateobj *prog); + struct fd_program_stateobj *prog, bool binning); void fd3_prog_init(struct pipe_context *pctx); void fd3_prog_fini(struct pipe_context *pctx); |