freedreno/a6xx: move const emit to state group

Eventually we want to move nearly everything, but no other state depends on const state, so this is the easiest one to move first. For webgl aquarium, this reduces GPU load by about 10%, since for each fish it does a uniform upload plus draw. Fish are frequently visible in only a single tile, so this skips the uniform uploads for other tiles. The additional step of avoiding WFIs when using CP_SET_DRAW_STATE seems to be worth an additional 10% gain for aquarium. Signed-off-by: Rob Clark <[email protected]>
author: Rob Clark <[email protected]> 2018-10-07 13:59:27 -0400
committer: Rob Clark <[email protected]> 2018-10-17 12:44:48 -0400
commit: abcdf5627a29b7f1856b86bce4ff9bd0029a3099 (patch)
tree: f28d2aaa8e5eb6aec707217d9bd61b641aa991b2 /src/gallium
parent: a398d26fd2cb1ef075a07fa91d2c74613982a66f (diff)
4 files changed, 70 insertions, 15 deletions
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
index fc4a53f8651..93f6a267fa9 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
@@ -359,7 +359,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 	if (tex->num_samplers > 0) {
 		struct fd_ringbuffer *state =
-			fd_ringbuffer_new_object(ctx->pipe, tex->num_samplers * 4);
+			fd_ringbuffer_new_flags(ctx->pipe, tex->num_samplers * 4 * 4,
+					FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
 		for (unsigned i = 0; i < tex->num_samplers; i++) {
 			static const struct fd6_sampler_stateobj dummy_sampler = {};
 			const struct fd6_sampler_stateobj *sampler = tex->samplers[i] ?
@@ -389,7 +390,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 	if (tex->num_textures > 0) {
 		struct fd_ringbuffer *state =
-			fd_ringbuffer_new_object(ctx->pipe, tex->num_textures * 16);
+			fd_ringbuffer_new_flags(ctx->pipe, tex->num_textures * 16 * 4,
+					FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
 		for (unsigned i = 0; i < tex->num_textures; i++) {
 			static const struct fd6_pipe_sampler_view dummy_view = {};
 			const struct fd6_pipe_sampler_view *view = tex->textures[i] ?
@@ -791,9 +793,29 @@ fd6_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr));
 	}
 
-	ir3_emit_vs_consts(vp, ring, ctx, emit->info);
-	if (!emit->key.binning_pass)
-		ir3_emit_fs_consts(fp, ring, ctx);
+#define DIRTY_CONST (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST | \
+					 FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)
+
+	if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & DIRTY_CONST) {
+		struct fd_ringbuffer *vsconstobj =
+			fd_ringbuffer_new_flags(ctx->pipe, 0x1000,
+					FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
+
+		ir3_emit_vs_consts(vp, vsconstobj, ctx, emit->info);
+		fd6_emit_add_group(emit, vsconstobj, FD6_GROUP_VS_CONST, 0x7);
+		fd_ringbuffer_del(vsconstobj);
+	}
+
+	if ((ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & DIRTY_CONST) &&
+			!emit->key.binning_pass) {
+		struct fd_ringbuffer *fsconstobj =
+			fd_ringbuffer_new_flags(ctx->pipe, 0x1000,
+					FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
+
+		ir3_emit_fs_consts(fp, fsconstobj, ctx);
+		fd6_emit_add_group(emit, fsconstobj, FD6_GROUP_FS_CONST, 0x7);
+		fd_ringbuffer_del(fsconstobj);
+	}
 
 	struct pipe_stream_output_info *info = &vp->shader->stream_output;
 	if (info->num_outputs) {
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h
index a2117a1b244..4e27597a70b 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h
@@ -43,7 +43,8 @@ struct fd_ringbuffer;
  * need to be emit'd.
  */
 enum fd6_state_id {
-	FD6_GROUP_CONST,
+	FD6_GROUP_VS_CONST,
+	FD6_GROUP_FS_CONST,
 };
 
 struct fd6_state_group {
@@ -116,7 +117,7 @@ fd6_emit_add_group(struct fd6_emit *emit, struct fd_ringbuffer *stateobj,
 	if (fd_ringbuffer_size(stateobj) == 0)
 		return;
 	struct fd6_state_group *g = &emit->groups[emit->num_groups++];
-	g->stateobj = stateobj;
+	g->stateobj = fd_ringbuffer_ref(stateobj);
 	g->group_id = group_id;
 	g->enable_mask = enable_mask;
 }
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c
index 0c96250f974..11673992959 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c
@@ -751,6 +751,13 @@ fd6_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile)
 		OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(0x5) | 0x10);
 	}
 
+	OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
+	OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
+			CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
+			CP_SET_DRAW_STATE__0_GROUP_ID(0));
+	OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
+	OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));
+
 	OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
 	OUT_RING(ring, 0x0);
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 5532a7f3467..ee063f84d73 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -552,6 +552,18 @@ ir3_shader_outputs(const struct ir3_shader *so)
 
 #include "freedreno_resource.h"
 
+static inline void
+ring_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring)
+{
+	/* when we emit const state via ring (IB2) we need a WFI, but when
+	 * it is emit'd via stateobj, we don't
+	 */
+	if (ring->flags & FD_RINGBUFFER_OBJECT)
+		return;
+
+	fd_wfi(batch, ring);
+}
+
 static void
 emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
 		struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
@@ -579,7 +591,7 @@ emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
 		size = MIN2(size, 4 * max_const);
 
 		if (size > 0) {
-			fd_wfi(ctx->batch, ring);
+			ring_wfi(ctx->batch, ring);
 			ctx->emit_const(ring, v->type, 0,
 					cb->buffer_offset, size,
 					cb->user_buffer, cb->buffer);
@@ -611,7 +623,7 @@ emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
 			}
 		}
 
-		fd_wfi(ctx->batch, ring);
+		ring_wfi(ctx->batch, ring);
 		ctx->emit_const_bo(ring, v->type, false, offset * 4, params, prscs, offsets);
 	}
 }
@@ -631,7 +643,7 @@ emit_ssbo_sizes(struct fd_context *ctx, const struct ir3_shader_variant *v,
 			sizes[off] = sb->sb[index].buffer_size;
 		}
 
-		fd_wfi(ctx->batch, ring);
+		ring_wfi(ctx->batch, ring);
 		ctx->emit_const(ring, v->type, offset * 4,
 			0, ARRAY_SIZE(sizes), sizes, NULL);
 	}
@@ -673,7 +685,7 @@ emit_image_dims(struct fd_context *ctx, const struct ir3_shader_variant *v,
 			}
 		}
 
-		fd_wfi(ctx->batch, ring);
+		ring_wfi(ctx->batch, ring);
 		ctx->emit_const(ring, v->type, offset * 4,
 			0, ARRAY_SIZE(dims), dims, NULL);
 	}
@@ -696,7 +708,7 @@ emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v,
 	size *= 4;
 
 	if (size > 0) {
-		fd_wfi(ctx->batch, ring);
+		ring_wfi(ctx->batch, ring);
 		ctx->emit_const(ring, v->type, base,
 			0, size, v->immediates[0].val, NULL);
 	}
@@ -729,7 +741,7 @@ emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v,
 			}
 		}
 
-		fd_wfi(ctx->batch, ring);
+		ring_wfi(ctx->batch, ring);
 		ctx->emit_const_bo(ring, v->type, true, offset * 4, params, prscs, offsets);
 	}
 }
@@ -787,6 +799,19 @@ emit_common_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
 {
 	enum fd_dirty_shader_state dirty = ctx->dirty_shader[t];
 
+	/* When we use CP_SET_DRAW_STATE objects to emit constant state,
+	 * if we emit any of it we need to emit all.  This is because
+	 * we are using the same state-group-id each time for uniform
+	 * state, and if previous update is never evaluated (due to no
+	 * visible primitives in the current tile) then the new stateobj
+	 * completely replaces the old one.
+	 *
+	 * Possibly if we split up different parts of the const state to
+	 * different state-objects we could avoid this.
+	 */
+	if (dirty && (ring->flags & FD_RINGBUFFER_OBJECT))
+		dirty = ~0;
+
 	if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) {
 		struct fd_constbuf_stateobj *constbuf;
 		bool shader_dirty;
@@ -846,7 +871,7 @@ ir3_emit_vs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
 				vertex_params_size = ARRAY_SIZE(vertex_params);
 			}
 
-			fd_wfi(ctx->batch, ring);
+			ring_wfi(ctx->batch, ring);
 
 			bool needs_vtxid_base =
 				ir3_find_sysval_regid(v, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) != regid(63, 0);
@@ -918,7 +943,7 @@ ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
 	/* emit compute-shader driver-params: */
 	uint32_t offset = v->constbase.driver_param;
 	if (v->constlen > offset) {
-		fd_wfi(ctx->batch, ring);
+		ring_wfi(ctx->batch, ring);
 
 		if (info->indirect) {
 			struct pipe_resource *indirect = NULL;
author	Rob Clark <[email protected]>	2018-10-07 13:59:27 -0400
committer	Rob Clark <[email protected]>	2018-10-17 12:44:48 -0400
commit	abcdf5627a29b7f1856b86bce4ff9bd0029a3099 (patch)
tree	f28d2aaa8e5eb6aec707217d9bd61b641aa991b2 /src/gallium
parent	a398d26fd2cb1ef075a07fa91d2c74613982a66f (diff)