From 30ab3e39fd4d9cceb8a1a7d06dc3d99b0f477a72 Mon Sep 17 00:00:00 2001
From: "Kristian H. Kristensen" <hoegsberg@google.com>
Date: Thu, 5 Sep 2019 15:12:23 -0700
Subject: freedreno/a6xx: Implement primitive count queries on GPU

The driver can't compute PIPE_QUERY_PRIMITIVES_GENERATED or
PIPE_QUERY_PRIMITIVES_EMITTED on the CPU once we support geometry or
tessellation shaders, since these stages add primitives at runtime.
Use the WRITE_PRIMITIVE_COUNTS event to have the GPU write back the
primitive counts, and implement both queries as hw queries on top of it.

Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 src/freedreno/registers/a6xx.xml                   |  4 +
 src/freedreno/registers/adreno_pm4.xml             |  1 +
 src/gallium/drivers/freedreno/a2xx/fd2_query.c     |  2 +-
 src/gallium/drivers/freedreno/a5xx/fd5_query.c     |  2 +-
 src/gallium/drivers/freedreno/a6xx/fd6_query.c     | 95 +++++++++++++++++++++-
 src/gallium/drivers/freedreno/freedreno_batch.h    |  2 +-
 src/gallium/drivers/freedreno/freedreno_context.h  |  2 +-
 src/gallium/drivers/freedreno/freedreno_query.c    |  9 +-
 src/gallium/drivers/freedreno/freedreno_query.h    |  7 ++
 .../drivers/freedreno/freedreno_query_acc.c        |  8 +-
 .../drivers/freedreno/freedreno_query_acc.h        |  5 +-
 src/gallium/drivers/freedreno/freedreno_query_hw.c |  3 +-
 src/gallium/drivers/freedreno/freedreno_query_hw.h |  2 +-
 src/gallium/drivers/freedreno/freedreno_query_sw.c |  2 +-
 src/gallium/drivers/freedreno/freedreno_query_sw.h |  2 +-
 15 files changed, 128 insertions(+), 18 deletions(-)

diff --git a/src/freedreno/registers/a6xx.xml b/src/freedreno/registers/a6xx.xml
index 6868ed5528b..13c3dcd5a01 100644
--- a/src/freedreno/registers/a6xx.xml
+++ b/src/freedreno/registers/a6xx.xml
@@ -2394,6 +2394,10 @@ to upconvert to 32b float internally?
 		<bitfield name="B_OFF" low="14" high="22" shr="2" type="uint"/>
 		<bitfield name="B_EN" pos="23" type="boolean"/>
 	</reg32>
+
+	<reg32 offset="0x9218" name="VPC_SO_STREAM_COUNTS_LO"/>
+	<reg32 offset="0x9219" name="VPC_SO_STREAM_COUNTS_HI"/>
+
 	<array offset="0x921a" name="VPC_SO" stride="7" length="4">
 		<reg32 offset="0" name="BUFFER_BASE_LO"/>
 		<reg32 offset="1" name="BUFFER_BASE_HI"/>
diff --git a/src/freedreno/registers/adreno_pm4.xml b/src/freedreno/registers/adreno_pm4.xml
index 06175d3e1ad..efadcd9a3fa 100644
--- a/src/freedreno/registers/adreno_pm4.xml
+++ b/src/freedreno/registers/adreno_pm4.xml
@@ -15,6 +15,7 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd">
 	<value name="VIZQUERY_START" value="7"/> <!-- on a2xx (??) -->
 	<value name="VIZQUERY_END" value="8"/>
 	<value name="SC_WAIT_WC" value="9"/>
+	<value name="WRITE_PRIMITIVE_COUNTS" value="9" variants="A6XX"/>
 	<value name="RST_PIX_CNT" value="13"/>
 	<value name="RST_VTX_CNT" value="14"/>
 	<value name="TILE_FLUSH" value="15"/>
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_query.c b/src/gallium/drivers/freedreno/a2xx/fd2_query.c
index 9e5bb450cd4..b80096c2c81 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_query.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_query.c
@@ -218,7 +218,7 @@ fd2_create_batch_query(struct pipe_context *pctx,
 		counters_per_group[entry->gid]++;
 	}
 
-	q = fd_acc_create_query2(ctx, 0, &perfcntr);
+	q = fd_acc_create_query2(ctx, 0, 0, &perfcntr);
 	aq = fd_acc_query(q);
 
 	/* sample buffer size is based on # of queries: */
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_query.c b/src/gallium/drivers/freedreno/a5xx/fd5_query.c
index b438c7a5634..28f296d57cf 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_query.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_query.c
@@ -433,7 +433,7 @@ fd5_create_batch_query(struct pipe_context *pctx,
 		counters_per_group[entry->gid]++;
 	}
 
-	q = fd_acc_create_query2(ctx, 0, &perfcntr);
+	q = fd_acc_create_query2(ctx, 0, 0, &perfcntr);
 	aq = fd_acc_query(q);
 
 	/* sample buffer size is based on # of queries: */
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_query.c b/src/gallium/drivers/freedreno/a6xx/fd6_query.c
index cdd2778757e..d950144bc7e 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_query.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_query.c
@@ -252,6 +252,96 @@ static const struct fd_acc_sample_provider timestamp = {
 		.result = timestamp_accumulate_result,
 };
 
+struct PACKED fd6_primitives_sample { /* layout of the primitive-count query's sample buffer */
+	struct { /* pair of 64-bit counters; NOTE(review): the 4 slots are presumably one per stream -- confirm */
+		uint64_t generated, emitted;
+	} start[4], stop[4], result; /* result accumulates stop[index] - start[index] on every pause */
+};
+
+/* Emit a reloc to `field` of the query's fd6_primitives_sample buffer. NOTE(review): expansions already end in ';'. */
+#define primitives_relocw(ring, aq, field) \
+	OUT_RELOCW(ring, fd_resource((aq)->prsc)->bo, offsetof(struct fd6_primitives_sample, field), 0, 0);
+#define primitives_reloc(ring, aq, field) \
+	OUT_RELOC(ring, fd_resource((aq)->prsc)->bo, offsetof(struct fd6_primitives_sample, field), 0, 0);
+
+static void
+primitive_counts_resume(struct fd_acc_query *aq, struct fd_batch *batch)
+{
+	struct fd_ringbuffer *ring = batch->draw;
+	/* VPC_SO_STREAM_COUNTS_LO/HI <- address of the start[] snapshots: */
+	OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS_LO, 2);
+	primitives_relocw(ring, aq, start[0]);
+	/* hw writes back the primitive counts (presumably to the address programmed above): */
+	fd6_event_write(batch, ring, WRITE_PRIMITIVE_COUNTS, false);
+}
+
+static void
+primitive_counts_pause(struct fd_acc_query *aq, struct fd_batch *batch)
+{
+	struct fd_ringbuffer *ring = batch->draw;
+	/* VPC_SO_STREAM_COUNTS_LO/HI <- address of the stop[] snapshots: */
+	OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS_LO, 2);
+	primitives_relocw(ring, aq, stop[0]);
+
+	fd6_event_write(batch, ring, WRITE_PRIMITIVE_COUNTS, false);
+	/* NOTE(review): presumably orders the count write-back before the CP reads below -- confirm */
+	fd6_event_write(batch, batch->draw, CACHE_FLUSH_TS, true);
+
+	/* result.emitted += stop[index].emitted - start[index].emitted: */
+	OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
+	OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
+			CP_MEM_TO_MEM_0_NEG_C | 0x80000000);
+	primitives_relocw(ring, aq, result.emitted);
+	primitives_reloc(ring, aq, result.emitted);
+	primitives_reloc(ring, aq, stop[aq->base.index].emitted);
+	primitives_reloc(ring, aq, start[aq->base.index].emitted);
+
+	/* result.generated += stop[index].generated - start[index].generated: */
+	OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
+	OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
+			CP_MEM_TO_MEM_0_NEG_C | 0x80000000);
+	primitives_relocw(ring, aq, result.generated);
+	primitives_reloc(ring, aq, result.generated);
+	primitives_reloc(ring, aq, stop[aq->base.index].generated);
+	primitives_reloc(ring, aq, start[aq->base.index].generated);
+}
+
+static void
+primitives_generated_result(struct fd_acc_query *aq, void *buf,
+		union pipe_query_result *result)
+{
+	struct fd6_primitives_sample *ps = buf;
+	/* report the accumulated "generated" count as the query result: */
+	result->u64 = ps->result.generated;
+}
+
+static const struct fd_acc_sample_provider primitives_generated = {
+	.query_type = PIPE_QUERY_PRIMITIVES_GENERATED,
+	.active = FD_STAGE_DRAW,
+	.size = sizeof(struct fd6_primitives_sample),
+	.resume = primitive_counts_resume, /* resume/pause hooks are shared with primitives_emitted */
+	.pause = primitive_counts_pause,
+	.result = primitives_generated_result, /* reads result.generated */
+};
+
+static void
+primitives_emitted_result(struct fd_acc_query *aq, void *buf,
+		union pipe_query_result *result)
+{
+	struct fd6_primitives_sample *ps = buf;
+	/* report the accumulated "emitted" count as the query result: */
+	result->u64 = ps->result.emitted;
+}
+
+static const struct fd_acc_sample_provider primitives_emitted = {
+	.query_type = PIPE_QUERY_PRIMITIVES_EMITTED,
+	.active = FD_STAGE_DRAW,
+	.size = sizeof(struct fd6_primitives_sample),
+	.resume = primitive_counts_resume, /* resume/pause hooks are shared with primitives_generated */
+	.pause = primitive_counts_pause,
+	.result = primitives_emitted_result, /* reads result.emitted */
+};
+
 /*
  * Performance Counter (batch) queries:
  *
@@ -433,7 +523,7 @@ fd6_create_batch_query(struct pipe_context *pctx,
 		counters_per_group[entry->gid]++;
 	}
 
-	q = fd_acc_create_query2(ctx, 0, &perfcntr);
+	q = fd_acc_create_query2(ctx, 0, 0, &perfcntr);
 	aq = fd_acc_query(q);
 
 	/* sample buffer size is based on # of queries: */
@@ -463,4 +553,7 @@ fd6_query_context_init(struct pipe_context *pctx)
 
 	fd_acc_query_register_provider(pctx, &time_elapsed);
 	fd_acc_query_register_provider(pctx, &timestamp);
+
+	fd_acc_query_register_provider(pctx, &primitives_generated);
+	fd_acc_query_register_provider(pctx, &primitives_emitted);
 }
diff --git a/src/gallium/drivers/freedreno/freedreno_batch.h b/src/gallium/drivers/freedreno/freedreno_batch.h
index edf0840825b..f5ae07eb892 100644
--- a/src/gallium/drivers/freedreno/freedreno_batch.h
+++ b/src/gallium/drivers/freedreno/freedreno_batch.h
@@ -55,7 +55,7 @@ enum fd_render_stage {
 	FD_STAGE_ALL      = 0xff,
 };
 
-#define MAX_HW_SAMPLE_PROVIDERS 5
+#define MAX_HW_SAMPLE_PROVIDERS 7
 struct fd_hw_sample_provider;
 struct fd_hw_sample;
 
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index af3506fd685..60318cfe9e2 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -331,7 +331,7 @@ struct fd_context {
 	void (*launch_grid)(struct fd_context *ctx, const struct pipe_grid_info *info);
 
 	/* query: */
-	struct fd_query * (*create_query)(struct fd_context *ctx, unsigned query_type);
+	struct fd_query * (*create_query)(struct fd_context *ctx, unsigned query_type, unsigned index);
 	void (*query_prepare)(struct fd_batch *batch, uint32_t num_tiles);
 	void (*query_prepare_tile)(struct fd_batch *batch, uint32_t n,
 			struct fd_ringbuffer *ring);
diff --git a/src/gallium/drivers/freedreno/freedreno_query.c b/src/gallium/drivers/freedreno/freedreno_query.c
index def6166479b..1c98f4db156 100644
--- a/src/gallium/drivers/freedreno/freedreno_query.c
+++ b/src/gallium/drivers/freedreno/freedreno_query.c
@@ -41,11 +41,12 @@ static struct pipe_query *
 fd_create_query(struct pipe_context *pctx, unsigned query_type, unsigned index)
 {
 	struct fd_context *ctx = fd_context(pctx);
-	struct fd_query *q;
+	struct fd_query *q = NULL;
 
-	q = fd_sw_create_query(ctx, query_type);
-	if (!q && ctx->create_query)
-		q = ctx->create_query(ctx, query_type);
+	if (ctx->create_query)
+		q = ctx->create_query(ctx, query_type, index);
+	if (!q)
+		q = fd_sw_create_query(ctx, query_type, index);
 
 	return (struct pipe_query *) q;
 }
diff --git a/src/gallium/drivers/freedreno/freedreno_query.h b/src/gallium/drivers/freedreno/freedreno_query.h
index 15e9ce450de..e69ff7a88a1 100644
--- a/src/gallium/drivers/freedreno/freedreno_query.h
+++ b/src/gallium/drivers/freedreno/freedreno_query.h
@@ -46,6 +46,7 @@ struct fd_query {
 	const struct fd_query_funcs *funcs;
 	bool active;
 	int type;
+	unsigned index;
 };
 
 static inline struct fd_query *
@@ -102,6 +103,12 @@ int pidx(unsigned query_type)
 		return 3;
 	case PIPE_QUERY_TIMESTAMP:
 		return 4;
+
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+		return 5;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+		return 6;
+
 	default:
 		return -1;
 	}
diff --git a/src/gallium/drivers/freedreno/freedreno_query_acc.c b/src/gallium/drivers/freedreno/freedreno_query_acc.c
index 2d4d79925bc..b0be16c233e 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_acc.c
+++ b/src/gallium/drivers/freedreno/freedreno_query_acc.c
@@ -173,7 +173,7 @@ static const struct fd_query_funcs acc_query_funcs = {
 
 struct fd_query *
 fd_acc_create_query2(struct fd_context *ctx, unsigned query_type,
-		const struct fd_acc_sample_provider *provider)
+		unsigned index, const struct fd_acc_sample_provider *provider)
 {
 	struct fd_acc_query *aq;
 	struct fd_query *q;
@@ -192,19 +192,21 @@ fd_acc_create_query2(struct fd_context *ctx, unsigned query_type,
 	q = &aq->base;
 	q->funcs = &acc_query_funcs;
 	q->type = query_type;
+	q->index = index;
 
 	return q;
 }
 
 struct fd_query *
-fd_acc_create_query(struct fd_context *ctx, unsigned query_type)
+fd_acc_create_query(struct fd_context *ctx, unsigned query_type,
+		unsigned index)
 {
 	int idx = pidx(query_type);
 
 	if ((idx < 0) || !ctx->acc_sample_providers[idx])
 		return NULL;
 
-	return fd_acc_create_query2(ctx, query_type,
+	return fd_acc_create_query2(ctx, query_type, index,
 			ctx->acc_sample_providers[idx]);
 }
 
diff --git a/src/gallium/drivers/freedreno/freedreno_query_acc.h b/src/gallium/drivers/freedreno/freedreno_query_acc.h
index 3bbffe4436f..fdde07fcc4f 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_acc.h
+++ b/src/gallium/drivers/freedreno/freedreno_query_acc.h
@@ -97,9 +97,10 @@ fd_acc_query(struct fd_query *q)
 	return (struct fd_acc_query *)q;
 }
 
-struct fd_query * fd_acc_create_query(struct fd_context *ctx, unsigned query_type);
+struct fd_query * fd_acc_create_query(struct fd_context *ctx, unsigned query_type,
+	unsigned index);
 struct fd_query * fd_acc_create_query2(struct fd_context *ctx, unsigned query_type,
-		const struct fd_acc_sample_provider *provider);
+		unsigned index, const struct fd_acc_sample_provider *provider);
 void fd_acc_query_set_stage(struct fd_batch *batch, enum fd_render_stage stage);
 void fd_acc_query_register_provider(struct pipe_context *pctx,
 		const struct fd_acc_sample_provider *provider);
diff --git a/src/gallium/drivers/freedreno/freedreno_query_hw.c b/src/gallium/drivers/freedreno/freedreno_query_hw.c
index 1c7048d3f28..b37126dc0da 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_hw.c
+++ b/src/gallium/drivers/freedreno/freedreno_query_hw.c
@@ -266,7 +266,7 @@ static const struct fd_query_funcs hw_query_funcs = {
 };
 
 struct fd_query *
-fd_hw_create_query(struct fd_context *ctx, unsigned query_type)
+fd_hw_create_query(struct fd_context *ctx, unsigned query_type, unsigned index)
 {
 	struct fd_hw_query *hq;
 	struct fd_query *q;
@@ -289,6 +289,7 @@ fd_hw_create_query(struct fd_context *ctx, unsigned query_type)
 	q = &hq->base;
 	q->funcs = &hw_query_funcs;
 	q->type = query_type;
+	q->index = index;
 
 	return q;
 }
diff --git a/src/gallium/drivers/freedreno/freedreno_query_hw.h b/src/gallium/drivers/freedreno/freedreno_query_hw.h
index e711b837905..c9723ed16c0 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_hw.h
+++ b/src/gallium/drivers/freedreno/freedreno_query_hw.h
@@ -136,7 +136,7 @@ fd_hw_query(struct fd_query *q)
 	return (struct fd_hw_query *)q;
 }
 
-struct fd_query * fd_hw_create_query(struct fd_context *ctx, unsigned query_type);
+struct fd_query * fd_hw_create_query(struct fd_context *ctx, unsigned query_type, unsigned index);
 /* helper for sample providers: */
 struct fd_hw_sample * fd_hw_sample_init(struct fd_batch *batch, uint32_t size);
 /* don't call directly, use fd_hw_sample_reference() */
diff --git a/src/gallium/drivers/freedreno/freedreno_query_sw.c b/src/gallium/drivers/freedreno/freedreno_query_sw.c
index 7a610d3b21b..96bc814ccb1 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_sw.c
+++ b/src/gallium/drivers/freedreno/freedreno_query_sw.c
@@ -162,7 +162,7 @@ static const struct fd_query_funcs sw_query_funcs = {
 };
 
 struct fd_query *
-fd_sw_create_query(struct fd_context *ctx, unsigned query_type)
+fd_sw_create_query(struct fd_context *ctx, unsigned query_type, unsigned index)
 {
 	struct fd_sw_query *sq;
 	struct fd_query *q;
diff --git a/src/gallium/drivers/freedreno/freedreno_query_sw.h b/src/gallium/drivers/freedreno/freedreno_query_sw.h
index 8b754e08783..967e4af2117 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_sw.h
+++ b/src/gallium/drivers/freedreno/freedreno_query_sw.h
@@ -48,6 +48,6 @@ fd_sw_query(struct fd_query *q)
 }
 
 struct fd_query * fd_sw_create_query(struct fd_context *ctx,
-		unsigned query_type);
+		unsigned query_type, unsigned index);
 
 #endif /* FREEDRENO_QUERY_SW_H_ */
-- 
cgit v1.2.3