diff options
-rw-r--r-- | configure.ac | 2 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a2xx/fd2_draw.c | 6 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a2xx/fd2_gmem.c | 8 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_draw.c | 81 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_emit.c | 26 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_emit.h | 3 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_gmem.c | 360 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_program.c | 136 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_program.h | 2 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/freedreno_context.c | 45 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/freedreno_context.h | 12 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/freedreno_draw.c | 9 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/freedreno_draw.h | 23 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/freedreno_gmem.c | 100 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/freedreno_screen.c | 9 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/freedreno_util.h | 44 |
16 files changed, 707 insertions, 159 deletions
diff --git a/configure.ac b/configure.ac index f75325d33da..4b55140d299 100644 --- a/configure.ac +++ b/configure.ac @@ -32,7 +32,7 @@ LIBDRM_RADEON_REQUIRED=2.4.50 LIBDRM_INTEL_REQUIRED=2.4.49 LIBDRM_NVVIEUX_REQUIRED=2.4.33 LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41" -LIBDRM_FREEDRENO_REQUIRED=2.4.39 +LIBDRM_FREEDRENO_REQUIRED=2.4.51 DRI2PROTO_REQUIRED=2.6 DRI3PROTO_REQUIRED=1.0 PRESENTPROTO_REQUIRED=1.0 diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c index 300ce2e51c1..d6e42b668a8 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c @@ -108,7 +108,7 @@ fd2_draw(struct fd_context *ctx, const struct pipe_draw_info *info) OUT_RING(ring, info->max_index); /* VGT_MAX_VTX_INDX */ OUT_RING(ring, info->min_index); /* VGT_MIN_VTX_INDX */ - fd_draw_emit(ctx, info); + fd_draw_emit(ctx, ring, IGNORE_VISIBILITY, info); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_UNKNOWN_2010)); @@ -269,8 +269,8 @@ fd2_clear(struct fd_context *ctx, unsigned buffers, OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */ OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */ - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 3, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 3, INDEX_SIZE_IGN, 0, 0, NULL); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL)); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c index c494bf153e0..274b6145fde 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c @@ -90,8 +90,8 @@ emit_gmem2mem_surf(struct fd_context *ctx, uint32_t base, OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */ OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */ - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 3, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 3, INDEX_SIZE_IGN, 0, 0, NULL); } static void @@ -212,8 +212,8 @@ emit_mem2gmem_surf(struct fd_context *ctx, uint32_t base, OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */ OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */ - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 3, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 3, INDEX_SIZE_IGN, 0, 0, NULL); } static void diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c index c5d8b774552..4c90d984955 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c @@ -43,7 +43,7 @@ static void -emit_vertexbufs(struct fd_context *ctx) +emit_vertexbufs(struct fd_context *ctx, struct fd_ringbuffer *ring) { struct fd_vertex_stateobj *vtx = ctx->vtx; struct fd_vertexbuf_stateobj *vertexbuf = &ctx->vertexbuf; @@ -63,19 +63,17 @@ emit_vertexbufs(struct fd_context *ctx) bufs[i].format = elem->src_format; } - fd3_emit_vertex_bufs(ctx->ring, &ctx->prog, bufs, vtx->num_elements); + fd3_emit_vertex_bufs(ring, &ctx->prog, bufs, vtx->num_elements); } static void -fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info) +draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info, + struct fd_ringbuffer *ring, unsigned dirty, bool binning) { - struct fd_ringbuffer *ring = ctx->ring; - unsigned dirty = ctx->dirty; - - fd3_emit_state(ctx, dirty); + fd3_emit_state(ctx, ring, dirty, binning); if (dirty & FD_DIRTY_VTXBUF) - emit_vertexbufs(ctx); + emit_vertexbufs(ctx, ring); OUT_PKT0(ring, REG_A3XX_PC_VERTEX_REUSE_BLOCK_CNTL, 1); OUT_RING(ring, 0x0000000b); /* PC_VERTEX_REUSE_BLOCK_CNTL */ @@ -90,7 +88,59 @@ fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info) OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */ info->restart_index : 0xffffffff); - fd_draw_emit(ctx, info); + fd_draw_emit(ctx, ring, binning ? IGNORE_VISIBILITY : USE_VISIBILITY, info); +} + +static void +fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info) +{ + unsigned dirty = ctx->dirty; + draw_impl(ctx, info, ctx->binning_ring, + dirty & ~(FD_DIRTY_BLEND), true); + draw_impl(ctx, info, ctx->ring, dirty, false); +} + +/* binning pass cmds for a clear: + * NOTE: newer blob drivers don't use binning for clear, which is probably + * preferable since it is low vtx count. However that doesn't seem to + * actually work for me. Not sure if it is depending on support for + * clear pass (rather than using solid-fill shader), or something else + * that newer blob is doing differently. Once that is figured out, we + * can remove fd3_clear_binning(). + */ +static void +fd3_clear_binning(struct fd_context *ctx, unsigned dirty) +{ + struct fd3_context *fd3_ctx = fd3_context(ctx); + struct fd_ringbuffer *ring = ctx->binning_ring; + + fd3_emit_state(ctx, ring, dirty & (FD_DIRTY_VIEWPORT | + FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR), true); + + fd3_program_emit(ring, &ctx->solid_prog, true); + + fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) { + { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT }, + }, 1); + + OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1); + OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); + OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4); + OUT_RING(ring, 0); /* VFD_INDEX_MIN */ + OUT_RING(ring, 2); /* VFD_INDEX_MAX */ + OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ + OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ + OUT_PKT0(ring, REG_A3XX_PC_RESTART_INDEX, 1); + OUT_RING(ring, 0xffffffff); /* PC_RESTART_INDEX */ + + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, PERFCOUNTER_STOP); + + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL); } static void @@ -99,11 +149,14 @@ fd3_clear(struct fd_context *ctx, unsigned buffers, { struct fd3_context *fd3_ctx = fd3_context(ctx); struct fd_ringbuffer *ring = ctx->ring; + unsigned dirty = ctx->dirty; unsigned ce, i; + fd3_clear_binning(ctx, dirty); + /* emit generic state now: */ - fd3_emit_state(ctx, ctx->dirty & (FD_DIRTY_VIEWPORT | - FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR)); + fd3_emit_state(ctx, ring, dirty & (FD_DIRTY_VIEWPORT | + FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR), false); OUT_PKT0(ring, REG_A3XX_RB_BLEND_ALPHA, 1); OUT_RING(ring, A3XX_RB_BLEND_ALPHA_UINT(0xff) | @@ -192,7 +245,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers, OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1); OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0)); - fd3_program_emit(ring, &ctx->solid_prog); + fd3_program_emit(ring, &ctx->solid_prog, false); fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) { { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT }, @@ -216,8 +269,8 @@ fd3_clear(struct fd_context *ctx, unsigned buffers, OUT_PKT3(ring, CP_EVENT_WRITE, 1); OUT_RING(ring, PERFCOUNTER_STOP); - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, USE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL); } void diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index 91993725ea6..9cfe4ddb662 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -337,10 +337,9 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, } void -fd3_emit_state(struct fd_context *ctx, uint32_t dirty) +fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, + uint32_t dirty, bool binning) { - struct fd_ringbuffer *ring = ctx->ring; - emit_marker(ring, 5); if (dirty & FD_DIRTY_SAMPLE_MASK) { @@ -354,7 +353,8 @@ fd3_emit_state(struct fd_context *ctx, uint32_t dirty) struct fd3_zsa_stateobj *zsa = fd3_zsa_stateobj(ctx->zsa); struct pipe_stencil_ref *sr = &ctx->stencil_ref; - fd3_emit_rbrc_draw_state(ctx, ring, zsa->rb_render_control); + if (!binning) + fd3_emit_rbrc_draw_state(ctx, ring, zsa->rb_render_control); OUT_PKT0(ring, REG_A3XX_RB_ALPHA_REF, 1); OUT_RING(ring, zsa->rb_alpha_ref); @@ -432,7 +432,10 @@ fd3_emit_state(struct fd_context *ctx, uint32_t dirty) } if (dirty & FD_DIRTY_PROG) - fd3_program_emit(ring, &ctx->prog); + fd3_program_emit(ring, &ctx->prog, binning); + + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, HLSQ_FLUSH); if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) { struct fd_program_stateobj *prog = &ctx->prog; @@ -566,11 +569,11 @@ fd3_emit_restore(struct fd_context *ctx) OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0) | A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0)); - OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_MODE_CONTROL_REG, 1); - OUT_RING(ring, 0x00000001); /* UCHE_CACHE_MODE_CONTROL_REG */ - - OUT_PKT0(ring, REG_A3XX_VSC_SIZE_ADDRESS, 1); - OUT_RELOC(ring, fd3_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */ + OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2); + OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0)); + OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) | + A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) | + A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE); OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); OUT_RING(ring, 0x00000000); /* GRAS_CL_CLIP_CNTL */ @@ -604,6 +607,9 @@ fd3_emit_restore(struct fd_context *ctx) OUT_RING(ring, 0x00000000); /* GRAS_CL_USER_PLANE[i].W */ } + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, 0x00000000); + emit_cache_flush(ring); fd_rmw_wfi(ctx, ring); } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h index bf7787ab6f7..50559d10d22 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h @@ -58,7 +58,8 @@ struct fd3_vertex_buf { void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd_program_stateobj *prog, struct fd3_vertex_buf *vbufs, uint32_t n); -void fd3_emit_state(struct fd_context *ctx, uint32_t dirty); +void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, + uint32_t dirty, bool binning); void fd3_emit_restore(struct fd_context *ctx); diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c index 3d0a607ed28..8720e087b7b 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c @@ -106,6 +106,159 @@ depth_base(struct fd_gmem_stateobj *gmem) return align(gmem->bin_w * gmem->bin_h, 0x4000); } +static bool +use_hw_binning(struct fd_context *ctx) +{ + struct fd_gmem_stateobj *gmem = &ctx->gmem; + return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2); +} + +/* workaround for (hlsq?) lockup with hw binning on a3xx patchlevel 0 */ +static void update_vsc_pipe(struct fd_context *ctx); +static void +emit_binning_workaround(struct fd_context *ctx) +{ + struct fd3_context *fd3_ctx = fd3_context(ctx); + struct fd_gmem_stateobj *gmem = &ctx->gmem; + struct fd_ringbuffer *ring = ctx->ring; + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE); + OUT_RING(ring, A3XX_RB_RENDER_CONTROL_BIN_WIDTH(32) | + A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | + A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER)); + + OUT_PKT0(ring, REG_A3XX_RB_COPY_CONTROL, 4); + OUT_RING(ring, A3XX_RB_COPY_CONTROL_MSAA_RESOLVE(MSAA_ONE) | + A3XX_RB_COPY_CONTROL_MODE(0) | + A3XX_RB_COPY_CONTROL_GMEM_BASE(0)); + OUT_RELOC(ring, fd_resource(fd3_ctx->solid_vbuf)->bo, 0x20, 0, -1); /* RB_COPY_DEST_BASE */ + OUT_RING(ring, A3XX_RB_COPY_DEST_PITCH_PITCH(128)); + OUT_RING(ring, A3XX_RB_COPY_DEST_INFO_TILE(LINEAR) | + A3XX_RB_COPY_DEST_INFO_FORMAT(RB_R8G8B8A8_UNORM) | + A3XX_RB_COPY_DEST_INFO_SWAP(WZYX) | + A3XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) | + A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(1)); + + fd3_program_emit(ring, &ctx->solid_prog, false); + + fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) { + { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT }, + }, 1); + + OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 4); + OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) | + A3XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE | + A3XX_HLSQ_CONTROL_0_REG_RESERVED2 | + A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE); + OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) | + A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE); + OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31)); + OUT_RING(ring, 0); /* HLSQ_CONTROL_3_REG */ + + OUT_PKT0(ring, REG_A3XX_HLSQ_CONST_FSPRESV_RANGE_REG, 1); + OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0x20) | + A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0x20)); + + OUT_PKT0(ring, REG_A3XX_RB_MSAA_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MSAA_CONTROL_DISABLE | + A3XX_RB_MSAA_CONTROL_SAMPLES(MSAA_ONE) | + A3XX_RB_MSAA_CONTROL_SAMPLE_MASK(0xffff)); + + OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1); + OUT_RING(ring, A3XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_NEVER)); + + OUT_PKT0(ring, REG_A3XX_RB_STENCIL_CONTROL, 1); + OUT_RING(ring, A3XX_RB_STENCIL_CONTROL_FUNC(FUNC_NEVER) | + A3XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_NEVER) | + A3XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0.0)); + + OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4); + OUT_RING(ring, 0); /* VFD_INDEX_MIN */ + OUT_RING(ring, 2); /* VFD_INDEX_MAX */ + OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ + OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ + + OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1); + OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(1)); + OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(0) | + A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(1)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(31) | + A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 6); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_XOFFSET(0.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE(1.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_YOFFSET(0.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_YSCALE(1.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZOFFSET(0.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(1.0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, A3XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE | + A3XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE | + A3XX_GRAS_CL_CLIP_CNTL_VP_CLIP_CODE_IGNORE | + A3XX_GRAS_CL_CLIP_CNTL_VP_XFORM_DISABLE | + A3XX_GRAS_CL_CLIP_CNTL_PERSP_DIVISION_DISABLE); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_GB_CLIP_ADJ, 1); + OUT_RING(ring, A3XX_GRAS_CL_GB_CLIP_ADJ_HORZ(0) | + A3XX_GRAS_CL_GB_CLIP_ADJ_VERT(0)); + + OUT_PKT3(ring, CP_DRAW_INDX_2, 5); + OUT_RING(ring, 0x00000000); /* viz query info. */ + OUT_RING(ring, DRAW(DI_PT_RECTLIST, DI_SRC_SEL_IMMEDIATE, + INDEX_SIZE_32_BIT, IGNORE_VISIBILITY)); + OUT_RING(ring, 2); /* NumIndices */ + OUT_RING(ring, 2); + OUT_RING(ring, 1); + + OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 1); + OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(TWO_QUADS)); + + OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); + OUT_RING(ring, 0x00000000); + + OUT_WFI(ring); + + OUT_PKT0(ring, REG_A3XX_VSC_BIN_SIZE, 1); + OUT_RING(ring, A3XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | + A3XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, 0x00000000); +} + /* transfer from gmem to system memory (ie. normal RAM) */ static void @@ -129,8 +282,8 @@ emit_gmem2mem_surf(struct fd_context *ctx, A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE) | A3XX_RB_COPY_DEST_INFO_SWAP(fd3_pipe2swap(psurf->format))); - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL); } static void @@ -210,7 +363,7 @@ fd3_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ - fd3_program_emit(ring, &ctx->solid_prog); + fd3_program_emit(ring, &ctx->solid_prog, false); fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) { { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT }, @@ -252,8 +405,8 @@ emit_mem2gmem_surf(struct fd_context *ctx, uint32_t base, fd3_emit_gmem_restore_tex(ring, psurf); - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL); } static void @@ -355,7 +508,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ - fd3_program_emit(ring, &ctx->blit_prog); + fd3_program_emit(ring, &ctx->blit_prog, false); fd3_emit_vertex_bufs(ring, &ctx->blit_prog, (struct fd3_vertex_buf[]) { { .prsc = fd3_ctx->blit_texcoord_vbuf, .stride = 8, .format = PIPE_FORMAT_R32G32_FLOAT }, @@ -381,11 +534,68 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) } static void +patch_draws(struct fd_context *ctx, enum pc_di_vis_cull_mode vismode) +{ + unsigned i; + for (i = 0; i < fd_patch_num_elements(&ctx->draw_patches); i++) { + struct fd_cs_patch *patch = fd_patch_element(&ctx->draw_patches, i); + *patch->cs = patch->val | DRAW(0, 0, 0, vismode); + } + util_dynarray_resize(&ctx->draw_patches, 0); +} + +/* for rendering directly to system memory: */ +static void +fd3_emit_sysmem_prep(struct fd_context *ctx) +{ + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + struct fd_ringbuffer *ring = ctx->ring; + uint32_t pitch = 0; + + if (pfb->cbufs[0]) + pitch = fd_resource(pfb->cbufs[0]->texture)->slices[0].pitch; + + fd3_emit_restore(ctx); + + OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1); + OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | + A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); + + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0); + + OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(pitch)); + + /* setup scissor/offset for current tile: */ + OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1); + OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(0) | + A3XX_RB_WINDOW_OFFSET_Y(0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(pfb->width - 1) | + A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1)); + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_RB_MODE_CONTROL_GMEM_BYPASS | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE); + + patch_draws(ctx, IGNORE_VISIBILITY); +} + +static void update_vsc_pipe(struct fd_context *ctx) { + struct fd3_context *fd3_ctx = fd3_context(ctx); struct fd_ringbuffer *ring = ctx->ring; int i; + OUT_PKT0(ring, REG_A3XX_VSC_SIZE_ADDRESS, 1); + OUT_RELOC(ring, fd3_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */ + for (i = 0; i < 8; i++) { struct fd_vsc_pipe *pipe = &ctx->pipe[i]; @@ -394,7 +604,7 @@ update_vsc_pipe(struct fd_context *ctx) DRM_FREEDRENO_GEM_TYPE_KMEM); } - OUT_PKT0(ring, REG_A3XX_VSC_PIPE(0), 3); + OUT_PKT0(ring, REG_A3XX_VSC_PIPE(i), 3); OUT_RING(ring, A3XX_VSC_PIPE_CONFIG_X(pipe->x) | A3XX_VSC_PIPE_CONFIG_Y(pipe->y) | A3XX_VSC_PIPE_CONFIG_W(pipe->w) | @@ -404,34 +614,45 @@ update_vsc_pipe(struct fd_context *ctx) } } -/* for rendering directly to system memory: */ static void -fd3_emit_sysmem_prep(struct fd_context *ctx) +emit_binning_pass(struct fd_context *ctx) { struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct fd_ringbuffer *ring = ctx->ring; - uint32_t pitch = 0; + int i; - if (pfb->cbufs[0]) - pitch = fd_resource(pfb->cbufs[0]->texture)->slices[0].pitch; + if (ctx->screen->gpu_id == 320) { + emit_binning_workaround(ctx); - fd3_emit_restore(ctx); + OUT_PKT3(ring, CP_INVALIDATE_STATE, 1); + OUT_RING(ring, 0x00007fff); + } + + OUT_PKT0(ring, REG_A3XX_VSC_BIN_CONTROL, 1); + OUT_RING(ring, A3XX_VSC_BIN_CONTROL_BINNING_ENABLE); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_TILING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1); OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0); - OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | - A3XX_RB_RENDER_CONTROL_BIN_WIDTH(pitch)); + A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(ctx->gmem.bin_w)); - /* setup scissor/offset for current tile: */ + /* setup scissor/offset for whole screen: */ OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1); OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(0) | A3XX_RB_WINDOW_OFFSET_Y(0)); + OUT_PKT0(ring, REG_A3XX_RB_LRZ_VSC_CONTROL, 1); + OUT_RING(ring, A3XX_RB_LRZ_VSC_CONTROL_BINNING_ENABLE); + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); @@ -439,9 +660,72 @@ fd3_emit_sysmem_prep(struct fd_context *ctx) A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1)); OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_TILING_PASS) | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE); + + for (i = 0; i < 4; i++) { + OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1); + OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(0) | + A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_DISABLE) | + A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0)); + } + + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, A3XX_PC_VSTREAM_CONTROL_SIZE(1) | + A3XX_PC_VSTREAM_CONTROL_N(0)); + + /* emit IB to binning drawcmds: */ + OUT_IB(ring, ctx->binning_start, ctx->binning_end); + + /* and then put stuff back the way it was: */ + + OUT_PKT0(ring, REG_A3XX_VSC_BIN_CONTROL, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1); + OUT_RING(ring, A3XX_SP_SP_CTRL_REG_RESOLVE | + A3XX_SP_SP_CTRL_REG_CONSTMODE(1) | + A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) | + A3XX_SP_SP_CTRL_REG_L0MODE(0)); + + OUT_PKT0(ring, REG_A3XX_RB_LRZ_VSC_CONTROL, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2); OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_RB_MODE_CONTROL_GMEM_BYPASS | A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE); + OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ENABLE_GMEM | + A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(ctx->gmem.bin_w)); + + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, CACHE_FLUSH); + + if (ctx->screen->gpu_id == 320) { + /* dummy-draw workaround: */ + OUT_PKT3(ring, CP_DRAW_INDX, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX, + INDEX_SIZE_IGN, IGNORE_VISIBILITY)); + OUT_RING(ring, 0); /* NumIndices */ + } + + OUT_PKT3(ring, CP_NOP, 4); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_WFI(ring); + + if (ctx->screen->gpu_id == 320) { + emit_binning_workaround(ctx); + } } /* before first tile */ @@ -461,6 +745,18 @@ fd3_emit_tile_init(struct fd_context *ctx) A3XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); update_vsc_pipe(ctx); + + if (use_hw_binning(ctx)) { + /* mark the end of the binning cmds: */ + fd_ringmarker_mark(ctx->binning_end); + + /* emit hw binning pass: */ + emit_binning_pass(ctx); + + patch_draws(ctx, USE_VISIBILITY); + } else { + patch_draws(ctx, IGNORE_VISIBILITY); + } } /* before mem2gmem */ @@ -472,7 +768,6 @@ fd3_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile) struct fd_gmem_stateobj *gmem = &ctx->gmem; uint32_t reg; - OUT_PKT0(ring, REG_A3XX_RB_DEPTH_INFO, 2); reg = A3XX_RB_DEPTH_INFO_DEPTH_BASE(depth_base(gmem)); if (pfb->zsbuf) { @@ -499,6 +794,7 @@ fd3_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile) static void fd3_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) { + struct fd3_context *fd3_ctx = fd3_context(ctx); struct fd_ringbuffer *ring = ctx->ring; struct fd_gmem_stateobj *gmem = &ctx->gmem; struct pipe_framebuffer_state *pfb = &ctx->framebuffer; @@ -508,6 +804,32 @@ fd3_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) uint32_t x2 = tile->xoff + tile->bin_w - 1; uint32_t y2 = tile->yoff + tile->bin_h - 1; + if (use_hw_binning(ctx)) { + struct fd_vsc_pipe *pipe = &ctx->pipe[tile->p]; + + assert(pipe->w * pipe->h); + + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, HLSQ_FLUSH); + + OUT_WFI(ring); + + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, A3XX_PC_VSTREAM_CONTROL_SIZE(pipe->w * pipe->h) | + A3XX_PC_VSTREAM_CONTROL_N(tile->n)); + + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, CACHE_FLUSH); + + OUT_PKT3(ring, CP_SET_BIN_DATA, 2); + OUT_RELOC(ring, pipe->bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ + OUT_RELOC(ring, fd3_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */ + (tile->p * 4), 0, 0); + } else { + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, 0x00000000); + } + OUT_PKT3(ring, CP_SET_BIN, 3); OUT_RING(ring, 0x00000000); OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1)); diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index c02b14cba39..2622006ff09 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -36,6 +36,7 @@ #include "fd3_program.h" #include "fd3_compiler.h" +#include "fd3_emit.h" #include "fd3_texture.h" #include "fd3_util.h" @@ -175,9 +176,9 @@ fd3_vp_state_bind(struct pipe_context *pctx, void *hwcso) } static void -emit_shader(struct fd_ringbuffer *ring, struct fd3_shader_stateobj *so) +emit_shader(struct fd_ringbuffer *ring, const struct fd3_shader_stateobj *so) { - struct ir3_shader_info *si = &so->info; + const struct ir3_shader_info *si = &so->info; enum adreno_state_block sb; enum adreno_state_src src; uint32_t i, sz, *bin; @@ -216,7 +217,7 @@ emit_shader(struct fd_ringbuffer *ring, struct fd3_shader_stateobj *so) } static int -find_output(struct fd3_shader_stateobj *so, fd3_semantic semantic) +find_output(const struct fd3_shader_stateobj *so, fd3_semantic semantic) { int j; for (j = 0; j < so->outputs_count; j++) @@ -227,14 +228,21 @@ find_output(struct fd3_shader_stateobj *so, fd3_semantic semantic) void fd3_program_emit(struct fd_ringbuffer *ring, - struct fd_program_stateobj *prog) + struct fd_program_stateobj *prog, bool binning) { - struct fd3_shader_stateobj *vp = prog->vp; - struct fd3_shader_stateobj *fp = prog->fp; - struct ir3_shader_info *vsi = &vp->info; - struct ir3_shader_info *fsi = &fp->info; + const struct fd3_shader_stateobj *vp = prog->vp; + const struct fd3_shader_stateobj *fp = prog->fp; + const struct ir3_shader_info *vsi = &vp->info; + const struct ir3_shader_info *fsi = &fp->info; int i; + if (binning) { + /* use dummy stateobj to simplify binning vs non-binning: */ + static const struct fd3_shader_stateobj binning_fp = {}; + fp = &binning_fp; + fsi = &fp->info; + } + /* we could probably divide this up into things that need to be * emitted if frag-prog is dirty vs if vert-prog is dirty.. */ @@ -260,11 +268,9 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1); OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(0) | + COND(binning, A3XX_SP_SP_CTRL_REG_BINNING) | A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) | - // XXX "resolve" (?) bit set on gmem->mem pass.. -// COND(!uniforms, A3XX_SP_SP_CTRL_REG_RESOLVE) | - // XXX sometimes 0, sometimes 1: - A3XX_SP_SP_CTRL_REG_LOMODE(1)); + A3XX_SP_SP_CTRL_REG_L0MODE(0)); OUT_PKT0(ring, REG_A3XX_SP_VS_LENGTH_REG, 1); OUT_RING(ring, A3XX_SP_VS_LENGTH_REG_SHADERLENGTH(vp->instrlen)); @@ -272,6 +278,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3); OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) | A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | + A3XX_SP_VS_CTRL_REG0_CACHEINVALID | A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) | A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) | A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) | @@ -323,28 +330,38 @@ fd3_program_emit(struct fd_ringbuffer *ring, A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); OUT_RELOC(ring, vp->bo, 0, 0, 0); /* SP_VS_OBJ_START_REG */ - OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); - OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen)); - - OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); - OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | - A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | - A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) | - A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) | - A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | - A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | - A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | - COND(fp->samplers_count > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) | - A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen)); - OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) | - A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) | - A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fsi->max_const, 0)) | - A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63)); - - OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2); - OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) | - A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); - OUT_RELOC(ring, fp->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ + if (binning) { + OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER)); + OUT_RING(ring, 0x00000000); + } else { + OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen)); + + OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | + A3XX_SP_FS_CTRL_REG0_CACHEINVALID | + A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) | + A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) | + A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | + A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | + A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | + COND(fp->samplers_count > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) | + A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen)); + OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) | + A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) | + A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fsi->max_const, 0)) | + A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63)); + OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2); + OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) | + A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); + OUT_RELOC(ring, fp->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ + } OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2); OUT_RING(ring, 0x00000000); /* SP_FS_FLAT_SHAD_MODE_REG_0 */ @@ -360,24 +377,31 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0)); OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0)); - OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); - OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) | - A3XX_VPC_ATTR_THRDASSIGN(1) | - A3XX_VPC_ATTR_LMSIZE(1)); - OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp->total_in) | - A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in)); - - OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4); - OUT_RING(ring, fp->vinterp[0]); /* VPC_VARYING_INTERP[0].MODE */ - OUT_RING(ring, fp->vinterp[1]); /* VPC_VARYING_INTERP[1].MODE */ - OUT_RING(ring, fp->vinterp[2]); /* VPC_VARYING_INTERP[2].MODE */ - OUT_RING(ring, fp->vinterp[3]); /* VPC_VARYING_INTERP[3].MODE */ - - OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4); - OUT_RING(ring, fp->vpsrepl[0]); /* VPC_VARYING_PS_REPL[0].MODE */ - OUT_RING(ring, fp->vpsrepl[1]); /* VPC_VARYING_PS_REPL[1].MODE */ - OUT_RING(ring, fp->vpsrepl[2]); /* VPC_VARYING_PS_REPL[2].MODE */ - OUT_RING(ring, fp->vpsrepl[3]); /* VPC_VARYING_PS_REPL[3].MODE */ + if (binning) { + OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); + OUT_RING(ring, A3XX_VPC_ATTR_THRDASSIGN(1) | + A3XX_VPC_ATTR_LMSIZE(1)); + OUT_RING(ring, 0x00000000); + } else { + OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); + OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) | + A3XX_VPC_ATTR_THRDASSIGN(1) | + A3XX_VPC_ATTR_LMSIZE(1)); + OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp->total_in) | + A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in)); + + OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4); + OUT_RING(ring, fp->vinterp[0]); /* VPC_VARYING_INTERP[0].MODE */ + OUT_RING(ring, fp->vinterp[1]); /* VPC_VARYING_INTERP[1].MODE */ + OUT_RING(ring, fp->vinterp[2]); /* VPC_VARYING_INTERP[2].MODE */ + OUT_RING(ring, fp->vinterp[3]); /* VPC_VARYING_INTERP[3].MODE */ + + OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4); + OUT_RING(ring, fp->vpsrepl[0]); /* VPC_VARYING_PS_REPL[0].MODE */ + OUT_RING(ring, fp->vpsrepl[1]); /* VPC_VARYING_PS_REPL[1].MODE */ + OUT_RING(ring, fp->vpsrepl[2]); /* VPC_VARYING_PS_REPL[2].MODE */ + OUT_RING(ring, fp->vpsrepl[3]); /* VPC_VARYING_PS_REPL[3].MODE */ + } OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1); OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) | @@ -388,10 +412,12 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ - emit_shader(ring, fp); + if (!binning) { + emit_shader(ring, fp); - OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); - OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ + OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); + OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ + } OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2); OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(vp->total_in) | diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.h b/src/gallium/drivers/freedreno/a3xx/fd3_program.h index 85c22a54cf7..bd6483ff42c 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.h @@ -117,7 +117,7 @@ struct fd3_shader_stateobj { }; void fd3_program_emit(struct fd_ringbuffer *ring, - struct fd_program_stateobj *prog); + struct fd_program_stateobj *prog, bool binning); void fd3_prog_init(struct pipe_context *pctx); void fd3_prog_fini(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c index 28be508e329..23f6a67734d 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.c +++ b/src/gallium/drivers/freedreno/freedreno_context.c @@ -34,16 +34,11 @@ #include "freedreno_gmem.h" #include "freedreno_util.h" -static void -fd_context_next_rb(struct pipe_context *pctx) +static struct fd_ringbuffer *next_rb(struct fd_context *ctx) { - struct fd_context *ctx = fd_context(pctx); struct fd_ringbuffer *ring; uint32_t ts; - fd_ringmarker_del(ctx->draw_start); - fd_ringmarker_del(ctx->draw_end); - /* grab next ringbuffer: */ ring = ctx->rings[(ctx->rings_idx++) % ARRAY_SIZE(ctx->rings)]; @@ -56,10 +51,36 @@ fd_context_next_rb(struct pipe_context *pctx) fd_ringbuffer_reset(ring); + return ring; +} + +static void +fd_context_next_rb(struct pipe_context *pctx) +{ + struct fd_context *ctx = fd_context(pctx); + struct fd_ringbuffer *ring; + + fd_ringmarker_del(ctx->draw_start); + fd_ringmarker_del(ctx->draw_end); + + ring = next_rb(ctx); + ctx->draw_start = fd_ringmarker_new(ring); ctx->draw_end = fd_ringmarker_new(ring); + fd_ringbuffer_set_parent(ring, NULL); ctx->ring = ring; + + fd_ringmarker_del(ctx->binning_start); + fd_ringmarker_del(ctx->binning_end); + + ring = next_rb(ctx); + + ctx->binning_start = fd_ringmarker_new(ring); + ctx->binning_end = fd_ringmarker_new(ring); + + fd_ringbuffer_set_parent(ring, ctx->ring); + ctx->binning_ring = ring; } /* emit accumulated render cmds, needed for example if render target has @@ -121,6 +142,10 @@ fd_context_destroy(struct pipe_context *pctx) DBG(""); + util_slab_destroy(&ctx->transfer_pool); + + util_dynarray_fini(&ctx->draw_patches); + if (ctx->blitter) util_blitter_destroy(ctx->blitter); @@ -129,7 +154,11 @@ fd_context_destroy(struct pipe_context *pctx) fd_ringmarker_del(ctx->draw_start); fd_ringmarker_del(ctx->draw_end); - fd_ringbuffer_del(ctx->ring); + fd_ringmarker_del(ctx->binning_start); + fd_ringmarker_del(ctx->binning_end); + + for (i = 0; i < ARRAY_SIZE(ctx->rings); i++) + fd_ringbuffer_del(ctx->rings[i]); for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) { struct fd_vsc_pipe *pipe = &ctx->pipe[i]; @@ -176,6 +205,8 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, fd_context_next_rb(pctx); fd_reset_rmw_state(ctx); + util_dynarray_init(&ctx->draw_patches); + util_slab_create(&ctx->transfer_pool, sizeof(struct pipe_transfer), 16, UTIL_SLAB_SINGLETHREADED); diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index a8abbca7a62..a0227e49c03 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -111,7 +111,7 @@ struct fd_context { */ enum { /* align bitmask values w/ PIPE_CLEAR_*.. since that is convenient.. */ - FD_BUFFER_COLOR = PIPE_CLEAR_COLOR, + FD_BUFFER_COLOR = PIPE_CLEAR_COLOR0, FD_BUFFER_DEPTH = PIPE_CLEAR_DEPTH, FD_BUFFER_STENCIL = PIPE_CLEAR_STENCIL, FD_BUFFER_ALL = FD_BUFFER_COLOR | FD_BUFFER_DEPTH | FD_BUFFER_STENCIL, @@ -148,9 +148,14 @@ struct fd_context { struct fd_ringbuffer *rings[4]; unsigned rings_idx; + /* normal draw/clear cmds: */ struct fd_ringbuffer *ring; struct fd_ringmarker *draw_start, *draw_end; + /* binning pass draw/clear cmds: */ + struct fd_ringbuffer *binning_ring; + struct fd_ringmarker *binning_start, *binning_end; + /* Keep track if WAIT_FOR_IDLE is needed for registers we need * to update via RMW: */ @@ -165,6 +170,11 @@ struct fd_context { uint32_t rbrc_draw; } rmw; + /* Keep track of DRAW initiators that need to be patched up depending + * on whether we using binning or not: + */ + struct util_dynarray draw_patches; + struct pipe_scissor_state scissor; /* we don't have a disable/enable bit for scissor, so instead we keep diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c index 0069438c87d..d80f3565614 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/src/gallium/drivers/freedreno/freedreno_draw.c @@ -54,7 +54,9 @@ size2indextype(unsigned index_size) /* this is same for a2xx/a3xx, so split into helper: */ void -fd_draw_emit(struct fd_context *ctx, const struct pipe_draw_info *info) +fd_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, + enum pc_di_vis_cull_mode vismode, + const struct pipe_draw_info *info) { struct pipe_index_buffer *idx = &ctx->indexbuf; struct fd_bo *idx_bo = NULL; @@ -78,8 +80,8 @@ fd_draw_emit(struct fd_context *ctx, const struct pipe_draw_info *info) src_sel = DI_SRC_SEL_AUTO_INDEX; } - fd_draw(ctx, ctx->primtypes[info->mode], src_sel, info->count, - idx_type, idx_size, idx_offset, idx_bo); + fd_draw(ctx, ring, ctx->primtypes[info->mode], vismode, src_sel, + info->count, idx_type, idx_size, idx_offset, idx_bo); } static void @@ -180,6 +182,7 @@ fd_clear(struct pipe_context *pctx, unsigned buffers, ctx->clear(ctx, buffers, color, depth, stencil); ctx->dirty |= FD_DIRTY_ZSA | + FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER | FD_DIRTY_SAMPLE_MASK | FD_DIRTY_PROG | diff --git a/src/gallium/drivers/freedreno/freedreno_draw.h b/src/gallium/drivers/freedreno/freedreno_draw.h index 190c0e52d24..e8bb420889e 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.h +++ b/src/gallium/drivers/freedreno/freedreno_draw.h @@ -38,19 +38,21 @@ struct fd_ringbuffer; -void fd_draw_emit(struct fd_context *ctx, const struct pipe_draw_info *info); +void fd_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, + enum pc_di_vis_cull_mode vismode, + const struct pipe_draw_info *info); void fd_draw_init(struct pipe_context *pctx); static inline void -fd_draw(struct fd_context *ctx, enum pc_di_primtype primtype, +fd_draw(struct fd_context *ctx, struct fd_ringbuffer *ring, + enum pc_di_primtype primtype, + enum pc_di_vis_cull_mode vismode, enum pc_di_src_sel src_sel, uint32_t count, enum pc_di_index_size idx_type, uint32_t idx_size, uint32_t idx_offset, struct fd_bo *idx_bo) { - struct fd_ringbuffer *ring = ctx->ring; - /* for debug after a lock up, write a unique counter value * to scratch7 for each draw, to make it easier to match up * register dumps to cmdstream. The combination of IB @@ -64,7 +66,7 @@ fd_draw(struct fd_context *ctx, enum pc_di_primtype primtype, OUT_PKT3(ring, CP_DRAW_INDX, 3); OUT_RING(ring, 0x00000000); OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX, - INDEX_SIZE_IGN, IGNORE_VISIBILITY)); + INDEX_SIZE_IGN, USE_VISIBILITY)); OUT_RING(ring, 0); /* NumIndices */ /* ugg, hard-code register offset to avoid pulling in the @@ -76,8 +78,15 @@ fd_draw(struct fd_context *ctx, enum pc_di_primtype primtype, OUT_PKT3(ring, CP_DRAW_INDX, idx_bo ? 5 : 3); OUT_RING(ring, 0x00000000); /* viz query info. */ - OUT_RING(ring, DRAW(primtype, src_sel, - idx_type, IGNORE_VISIBILITY)); + if (vismode == USE_VISIBILITY) { + /* leave vis mode blank for now, it will be patched up when + * we know if we are binning or not + */ + OUT_RINGP(ring, DRAW(primtype, src_sel, idx_type, 0), + &ctx->draw_patches); + } else { + OUT_RING(ring, DRAW(primtype, src_sel, idx_type, vismode)); + } OUT_RING(ring, count); /* NumIndices */ if (idx_bo) { OUT_RELOC(ring, idx_bo, idx_offset, 0, 0); diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c index 47f7a310e8c..0270538a3d0 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.c +++ b/src/gallium/drivers/freedreno/freedreno_gmem.c @@ -85,7 +85,8 @@ calculate_tiles(struct fd_context *ctx) uint32_t bin_w, bin_h; uint32_t max_width = bin_width(ctx); uint32_t cpp = 4; - uint32_t i, j, t, p, n, xoff, yoff; + uint32_t i, j, t, xoff, yoff; + uint32_t tpp_x, tpp_y; bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)); if (pfb->cbufs[0]) @@ -145,20 +146,65 @@ calculate_tiles(struct fd_context *ctx) gmem->width = width; gmem->height = height; - /* Assign tiles and pipes: - * NOTE we currently take a rather simplistic approach of - * mapping rows of tiles to a pipe. At some point it might - * be worth playing with different strategies and seeing if - * that makes much impact on performance. + /* + * Assign tiles and pipes: + * + * At some point it might be worth playing with different + * strategies and seeing if that makes much impact on + * performance. */ - t = p = n = 0; + +#define div_round_up(v, a) (((v) + (a) - 1) / (a)) + /* figure out number of tiles per pipe: */ + tpp_x = tpp_y = 1; + while (div_round_up(nbins_y, tpp_y) > 8) + tpp_y += 2; + while ((div_round_up(nbins_y, tpp_y) * + div_round_up(nbins_x, tpp_x)) > 8) + tpp_x += 1; + + /* configure pipes: */ + xoff = yoff = 0; + for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) { + struct fd_vsc_pipe *pipe = &ctx->pipe[i]; + + if (xoff >= nbins_x) { + xoff = 0; + yoff += tpp_y; + } + + if (yoff >= nbins_y) { + break; + } + + pipe->x = xoff; + pipe->y = yoff; + pipe->w = MIN2(tpp_x, nbins_x - xoff); + pipe->h = MIN2(tpp_y, nbins_y - yoff); + + xoff += tpp_x; + } + + for (; i < ARRAY_SIZE(ctx->pipe); i++) { + struct fd_vsc_pipe *pipe = &ctx->pipe[i]; + pipe->x = pipe->y = pipe->w = pipe->h = 0; + } + +#if 0 /* debug */ + printf("%dx%d ... tpp=%dx%d\n", nbins_x, nbins_y, tpp_x, tpp_y); + for (i = 0; i < 8; i++) { + struct fd_vsc_pipe *pipe = &ctx->pipe[i]; + printf("pipe[%d]: %ux%u @ %u,%u\n", i, + pipe->w, pipe->h, pipe->x, pipe->y); + } +#endif + + /* configure tiles: */ + t = 0; yoff = miny; for (i = 0; i < nbins_y; i++) { - struct fd_vsc_pipe *pipe = &ctx->pipe[p]; uint32_t bw, bh; - assert(p < ARRAY_SIZE(ctx->pipe)); - xoff = minx; /* clip bin height: */ @@ -166,13 +212,20 @@ calculate_tiles(struct fd_context *ctx) for (j = 0; j < nbins_x; j++) { struct fd_tile *tile = &ctx->tile[t]; + uint32_t n, p; assert(t < ARRAY_SIZE(ctx->tile)); + /* pipe number: */ + p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x); + + /* slot number: */ + n = ((i % tpp_y) * tpp_x) + (j % tpp_x); + /* clip bin width: */ bw = MIN2(bin_w, minx + width - xoff); - tile->n = n++; + tile->n = n; tile->p = p; tile->bin_w = bw; tile->bin_h = bh; @@ -184,22 +237,19 @@ calculate_tiles(struct fd_context *ctx) xoff += bw; } - /* one pipe per row: */ - pipe->x = 0; - pipe->y = i; - pipe->w = nbins_x; - pipe->h = 1; - - p++; - n = 0; - yoff += bh; } - for (; p < ARRAY_SIZE(ctx->pipe); p++) { - struct fd_vsc_pipe *pipe = &ctx->pipe[p]; - pipe->x = pipe->y = pipe->w = pipe->h = 0; +#if 0 /* debug */ + t = 0; + for (i = 0; i < nbins_y; i++) { + for (j = 0; j < nbins_x; j++) { + struct fd_tile *tile = &ctx->tile[t++]; + printf("|p:%u n:%u|", tile->p, tile->n); + } + printf("\n"); } +#endif } static void @@ -259,6 +309,7 @@ fd_gmem_render_tiles(struct pipe_context *pctx) /* mark the end of the clear/draw cmds before emitting per-tile cmds: */ fd_ringmarker_mark(ctx->draw_end); + fd_ringmarker_mark(ctx->binning_end); if (sysmem) { DBG("rendering sysmem (%s/%s)", @@ -277,8 +328,9 @@ fd_gmem_render_tiles(struct pipe_context *pctx) /* GPU executes starting from tile cmds, which IB back to draw cmds: */ fd_ringmarker_flush(ctx->draw_end); - /* mark start for next draw cmds: */ + /* mark start for next draw/binning cmds: */ fd_ringmarker_mark(ctx->draw_start); + fd_ringmarker_mark(ctx->binning_start); fd_reset_rmw_state(ctx); diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 319e29f3ada..28a09166acd 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -64,12 +64,15 @@ static const struct debug_named_value debug_options[] = { {"direct", FD_DBG_DIRECT, "Force inline (SS_DIRECT) state loads"}, {"dbypass", FD_DBG_DBYPASS,"Disable GMEM bypass"}, {"fraghalf", FD_DBG_FRAGHALF, "Use half-precision in fragment shader"}, + {"binning", FD_DBG_BINNING, "Enable hw binning"}, + {"dbinning", FD_DBG_DBINNING, "Disable hw binning"}, DEBUG_NAMED_VALUE_END }; DEBUG_GET_ONCE_FLAGS_OPTION(fd_mesa_debug, "FD_MESA_DEBUG", debug_options, 0) int fd_mesa_debug = 0; +bool fd_binning_enabled = false; /* default to off for now */ static const char * fd_screen_get_name(struct pipe_screen *pscreen) @@ -386,6 +389,12 @@ fd_screen_create(struct fd_device *dev) fd_mesa_debug = debug_get_option_fd_mesa_debug(); + if (fd_mesa_debug & FD_DBG_BINNING) + fd_binning_enabled = true; + + if (fd_mesa_debug & FD_DBG_DBINNING) + fd_binning_enabled = false; + if (!screen) return NULL; diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h index 48d346eb35b..fae5ba06b1d 100644 --- a/src/gallium/drivers/freedreno/freedreno_util.h +++ b/src/gallium/drivers/freedreno/freedreno_util.h @@ -37,6 +37,7 @@ #include "util/u_debug.h" #include "util/u_math.h" #include "util/u_half.h" +#include "util/u_dynarray.h" #include "adreno_common.xml.h" #include "adreno_pm4.xml.h" @@ -52,16 +53,19 @@ enum adreno_stencil_op fd_stencil_op(unsigned op); /* TBD if it is same on a2xx, but for now: */ #define MAX_MIP_LEVELS A3XX_MAX_MIP_LEVELS -#define FD_DBG_MSGS 0x01 -#define FD_DBG_DISASM 0x02 -#define FD_DBG_DCLEAR 0x04 -#define FD_DBG_DGMEM 0x08 -#define FD_DBG_DSCIS 0x10 -#define FD_DBG_DIRECT 0x20 -#define FD_DBG_DBYPASS 0x40 -#define FD_DBG_FRAGHALF 0x80 +#define FD_DBG_MSGS 0x0001 +#define FD_DBG_DISASM 0x0002 +#define FD_DBG_DCLEAR 0x0004 +#define FD_DBG_DGMEM 0x0008 +#define FD_DBG_DSCIS 0x0010 +#define FD_DBG_DIRECT 0x0020 +#define FD_DBG_DBYPASS 0x0040 +#define FD_DBG_FRAGHALF 0x0080 +#define FD_DBG_BINNING 0x0100 +#define FD_DBG_DBINNING 0x0200 extern int fd_mesa_debug; +extern bool fd_binning_enabled; #define DBG(fmt, ...) \ do { if (fd_mesa_debug & FD_DBG_MSGS) \ @@ -87,6 +91,13 @@ static inline uint32_t DRAW(enum pc_di_primtype prim_type, (1 << 14); } +/* for tracking cmdstream positions that need to be patched: */ +struct fd_cs_patch { + uint32_t *cs; + uint32_t val; +}; +#define fd_patch_num_elements(buf) ((buf)->size / sizeof(struct fd_cs_patch)) +#define fd_patch_element(buf, i) util_dynarray_element(buf, struct fd_cs_patch, i) static inline enum pipe_format pipe_surface_format(struct pipe_surface *psurf) @@ -110,6 +121,21 @@ OUT_RING(struct fd_ringbuffer *ring, uint32_t data) *(ring->cur++) = data; } +/* like OUT_RING() but appends a cmdstream patch point to 'buf' */ +static inline void +OUT_RINGP(struct fd_ringbuffer *ring, uint32_t data, + struct util_dynarray *buf) +{ + if (LOG_DWORDS) { + DBG("ring[%p]: OUT_RINGP %04x: %08x", ring, + (uint32_t)(ring->cur - ring->last_start), data); + } + util_dynarray_append(buf, struct fd_cs_patch, ((struct fd_cs_patch){ + .cs = ring->cur++, + .val = data, + })); +} + static inline void OUT_RELOC(struct fd_ringbuffer *ring, struct fd_bo *bo, uint32_t offset, uint32_t or, int32_t shift) @@ -132,7 +158,7 @@ OUT_RELOCW(struct fd_ringbuffer *ring, struct fd_bo *bo, uint32_t offset, uint32_t or, int32_t shift) { if (LOG_DWORDS) { - DBG("ring[%p]: OUT_RELOC %04x: %p+%u << %d", ring, + DBG("ring[%p]: OUT_RELOCW %04x: %p+%u << %d", ring, (uint32_t)(ring->cur - ring->last_start), bo, offset, shift); } fd_ringbuffer_reloc(ring, &(struct fd_reloc){ |