freedreno/a3xx: support for hw binning pass

The binning pass sorts vertices into the bins/tiles they apply to. The visibility information generated during the binning pass can be used to speed up the rendering pass by filtering out vertices which do not apply to the current tile. See:

  https://github.com/freedreno/freedreno/wiki/Adreno-tiling#optimized-approach

This brings a significant fps boost. A rough assortment of tests (supertuxkart, etracer, tremulous, glmark2 'build' test, etc) seems to yield a ~35-45% fps improvement.

For now, to be conservative, the binning pass is not yet enabled by default. To enable it, use:

  FD_MESA_DEBUG=binning

So far I haven't found anything that breaks with binning enabled, but I'd like a bit more testing before I enable it by default.

Signed-off-by: Rob Clark <[email protected]>
author: Rob Clark <[email protected]> 2014-01-07 10:55:07 -0500
committer: Rob Clark <[email protected]> 2014-01-08 16:30:18 -0500
commit: c0766528baaef48902c87bbdaa4f5926c472269b (patch)
tree: f825706059f50c37a9fda7961ec596b0fb6a65da /src/gallium/drivers/freedreno/freedreno_gmem.c
parent: bfb44c24bc1eff850d47984b2cb60c957ffc143d (diff)
1 files changed, 76 insertions, 24 deletions
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c
index 47f7a310e8c..0270538a3d0 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -85,7 +85,8 @@ calculate_tiles(struct fd_context *ctx)
 	uint32_t bin_w, bin_h;
 	uint32_t max_width = bin_width(ctx);
 	uint32_t cpp = 4;
-	uint32_t i, j, t, p, n, xoff, yoff;
+	uint32_t i, j, t, xoff, yoff;
+	uint32_t tpp_x, tpp_y;
 	bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL));
 
 	if (pfb->cbufs[0])
@@ -145,20 +146,65 @@ calculate_tiles(struct fd_context *ctx)
 	gmem->width = width;
 	gmem->height = height;
 
-	/* Assign tiles and pipes:
-	 * NOTE we currently take a rather simplistic approach of
-	 * mapping rows of tiles to a pipe.  At some point it might
-	 * be worth playing with different strategies and seeing if
-	 * that makes much impact on performance.
+	/*
+	 * Assign tiles and pipes:
+	 *
+	 * At some point it might be worth playing with different
+	 * strategies and seeing if that makes much impact on
+	 * performance.
 	 */
-	t = p = n = 0;
+
+#define div_round_up(v, a)  (((v) + (a) - 1) / (a))
+	/* figure out number of tiles per pipe: */
+	tpp_x = tpp_y = 1;
+	while (div_round_up(nbins_y, tpp_y) > 8)
+		tpp_y += 2;
+	while ((div_round_up(nbins_y, tpp_y) *
+			div_round_up(nbins_x, tpp_x)) > 8)
+		tpp_x += 1;
+
+	/* configure pipes: */
+	xoff = yoff = 0;
+	for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) {
+		struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+
+		if (xoff >= nbins_x) {
+			xoff = 0;
+			yoff += tpp_y;
+		}
+
+		if (yoff >= nbins_y) {
+			break;
+		}
+
+		pipe->x = xoff;
+		pipe->y = yoff;
+		pipe->w = MIN2(tpp_x, nbins_x - xoff);
+		pipe->h = MIN2(tpp_y, nbins_y - yoff);
+
+		xoff += tpp_x;
+	}
+
+	for (; i < ARRAY_SIZE(ctx->pipe); i++) {
+		struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+		pipe->x = pipe->y = pipe->w = pipe->h = 0;
+	}
+
+#if 0 /* debug */
+	printf("%dx%d ... tpp=%dx%d\n", nbins_x, nbins_y, tpp_x, tpp_y);
+	for (i = 0; i < 8; i++) {
+		struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+		printf("pipe[%d]: %ux%u @ %u,%u\n", i,
+				pipe->w, pipe->h, pipe->x, pipe->y);
+	}
+#endif
+
+	/* configure tiles: */
+	t = 0;
 	yoff = miny;
 	for (i = 0; i < nbins_y; i++) {
-		struct fd_vsc_pipe *pipe = &ctx->pipe[p];
 		uint32_t bw, bh;
 
-		assert(p < ARRAY_SIZE(ctx->pipe));
-
 		xoff = minx;
 
 		/* clip bin height: */
@@ -166,13 +212,20 @@ calculate_tiles(struct fd_context *ctx)
 
 		for (j = 0; j < nbins_x; j++) {
 			struct fd_tile *tile = &ctx->tile[t];
+			uint32_t n, p;
 
 			assert(t < ARRAY_SIZE(ctx->tile));
 
+			/* pipe number: */
+			p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x);
+
+			/* slot number: */
+			n = ((i % tpp_y) * tpp_x) + (j % tpp_x);
+
 			/* clip bin width: */
 			bw = MIN2(bin_w, minx + width - xoff);
 
-			tile->n = n++;
+			tile->n = n;
 			tile->p = p;
 			tile->bin_w = bw;
 			tile->bin_h = bh;
@@ -184,22 +237,19 @@ calculate_tiles(struct fd_context *ctx)
 			xoff += bw;
 		}
 
-		/* one pipe per row: */
-		pipe->x = 0;
-		pipe->y = i;
-		pipe->w = nbins_x;
-		pipe->h = 1;
-
-		p++;
-		n = 0;
-
 		yoff += bh;
 	}
 
-	for (; p < ARRAY_SIZE(ctx->pipe); p++) {
-		struct fd_vsc_pipe *pipe = &ctx->pipe[p];
-		pipe->x = pipe->y = pipe->w = pipe->h = 0;
+#if 0 /* debug */
+	t = 0;
+	for (i = 0; i < nbins_y; i++) {
+		for (j = 0; j < nbins_x; j++) {
+			struct fd_tile *tile = &ctx->tile[t++];
+			printf("|p:%u n:%u|", tile->p, tile->n);
+		}
+		printf("\n");
 	}
+#endif
 }
 
 static void
@@ -259,6 +309,7 @@ fd_gmem_render_tiles(struct pipe_context *pctx)
 
 	/* mark the end of the clear/draw cmds before emitting per-tile cmds: */
 	fd_ringmarker_mark(ctx->draw_end);
+	fd_ringmarker_mark(ctx->binning_end);
 
 	if (sysmem) {
 		DBG("rendering sysmem (%s/%s)",
@@ -277,8 +328,9 @@ fd_gmem_render_tiles(struct pipe_context *pctx)
 	/* GPU executes starting from tile cmds, which IB back to draw cmds: */
 	fd_ringmarker_flush(ctx->draw_end);
 
-	/* mark start for next draw cmds: */
+	/* mark start for next draw/binning cmds: */
 	fd_ringmarker_mark(ctx->draw_start);
+	fd_ringmarker_mark(ctx->binning_start);
 
 	fd_reset_rmw_state(ctx);
author	Rob Clark <[email protected]>	2014-01-07 10:55:07 -0500
committer	Rob Clark <[email protected]>	2014-01-08 16:30:18 -0500
commit	c0766528baaef48902c87bbdaa4f5926c472269b (patch)
tree	f825706059f50c37a9fda7961ec596b0fb6a65da /src/gallium/drivers/freedreno/freedreno_gmem.c
parent	bfb44c24bc1eff850d47984b2cb60c957ffc143d (diff)