aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorRhys Perry <[email protected]>2018-06-27 00:04:41 +0100
committerIlia Mirkin <[email protected]>2019-02-06 19:35:57 -0500
commit5b6f522fc29f4c8bbadd0466b6f61c1876c95807 (patch)
treec96fbad2e2f83bb077b95c2a0618d6da31c06bfd /src
parentcce495572136a606dd2a35e79f45080c3796e2cc (diff)
nvc0: add compute invocation counter
The strategy is to keep a CPU-side counter of the direct invocations, and a GPU-side counter of the indirect invocations, and then add them together for queries. The specific technique is a macro which multiplies a list of integers together and accumulates the product into SCRATCH registers held inside of the context. Another macro will read those values out and add them to the passed-in cpu-side counter to be stored in a query buffer the same way that all the other statistics are stored. Original implementation by Rhys Perry, redone by Ilia Mirkin to use the SCRATCH temporaries. Signed-off-by: Ilia Mirkin <[email protected]>
Diffstat (limited to 'src')
-rw-r--r--src/gallium/drivers/nouveau/nvc0/mme/com9097.mme90
-rw-r--r--src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h54
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_compute.c32
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_context.h4
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_macros.h4
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c23
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_screen.c2
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nve4_compute.c2
8 files changed, 207 insertions, 4 deletions
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
index 38c2e868431..d6af8221b65 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
@@ -580,3 +580,93 @@ crs_loop:
/* Enable */
exit maddr 0x1452 /* CONSERVATIVE_RASTER */
send 0x1
+
+/* NVC0_3D_MACRO_COMPUTE_COUNTER
+ *
+ * This macro takes 6 values, num_groups_* and group_size_*, and adds their
+ * product to the current value
+ *
+ * It's used for keeping track of the number of executed indirect
+ * compute invocations for statistics.
+ *
+ * SCRATCH[4] = current counter [low]
+ * SCRATCH[5] = current counter [high]
+ *
+ * arg = number of parameters to multiply together, ideally 6
+ * parm[0] = num_groups_x
+ * parm[1] = num_groups_y
+ * parm[2] = num_groups_z
+ * parm[3] = group_size_x
+ * parm[4] = group_size_y
+ * parm[5] = group_size_z
+ *
+ * The parameters are only ever multiplied together, so the order in which
+ * they arrive does not affect the result.
+ *
+ * Register usage inside the loops:
+ * $r1, $r2 = running product [low, high]; $r1 holds arg on entry
+ * $r3 = parameter currently being multiplied in
+ * $r4, $r5 = shift-and-add accumulator [low, high]
+ * $r6 = lowest bit of $r3
+ * $r7 = number of parameters still to process
+ *
+ * NOTE(review): several instructions sit directly after a branch and are
+ * needed on both paths, so they presumably execute in a branch delay slot;
+ * "annul" presumably suppresses that slot - confirm against the MME docs.
+ */
+.section #mme9097_compute_counter
+ mov $r7 $r1 /* loop counter = arg (number of parameters) */
+ mov $r1 1 /* low result */
+ mov $r2 0 /* high result */
+iic_loop_start:
+ parm $r3 /* val, next integer to multiply in */
+ /* multiplication start - look at low bit, add if set, shift right/left */
+ mov $r4 0 /* low temp */
+ mov $r5 0 /* high temp */
+iic_mul_start: /* temp = result * val */
+ braz annul $r3 #iic_mul_done /* val == 0: no set bits left, product is complete */
+iic_mul_body:
+ mov $r6 (extrinsrt 0x0 $r3 0 1 0) /* val & 1 - check low bit */
+ braz $r6 #iic_mul_cont /* bit not set */
+ mov $r3 (extrinsrt 0x0 $r3 1 31 0) /* val >>= 1 - shift right; needed whether or not the branch above is taken */
+
+ mov $r4 (add $r4 $r1) /* temp += result */
+ mov $r5 (adc $r5 $r2) /* high word of temp += result (adc: presumably add with the carry of the low add) */
+iic_mul_cont:
+ mov $r1 (add $r1 $r1) /* shift left, part 1 (result *= 2) */
+ bra #iic_mul_start /* test the next bit of val */
+ mov $r2 (adc $r2 $r2) /* shift left, part 2 */
+iic_mul_done:
+ /* decrease loop counter, keep going if necessary */
+ mov $r7 (add $r7 -1)
+ /* result = temp ( = result * val ) */
+ mov $r1 $r4
+ branz $r7 #iic_loop_start /* more parameters left to multiply in */
+ mov $r2 $r5 /* high word of result = temp; needed on both the loop and the exit path */
+
+ /* increment current value by newly-calculated invocation count */
+ read $r3 0xd04 /* SCRATCH[4] */
+ read $r4 0xd05 /* SCRATCH[5] */
+ maddr 0x1d04 /* SCRATCH[4] */
+ exit send (add $r3 $r1) /* new counter low */
+ send (adc $r4 $r2) /* new counter high; presumably lands in SCRATCH[5], the method after SCRATCH[4] */
+
+/* NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY
+ *
+ * This macro writes out the indirect counter plus a direct value to
+ * the given address using QUERY_GET (64-bit value).
+ *
+ * arg = direct counter low
+ * parm[0] = direct counter high
+ * parm[1] = query address high
+ * parm[2] = query address low
+ *
+ * The 64-bit sum is emitted as two separate query writes: the low word at
+ * the given address and the high word at that address plus 4.
+ * SCRATCH[4] and SCRATCH[5] are only read here, never modified.
+ */
+.section #mme9097_compute_counter_to_query
+ parm $r2 /* counter high */
+ read $r3 0xd04 /* SCRATCH[4] */
+ read $r4 0xd05 /* SCRATCH[5] */
+ mov $r1 (add $r1 $r3) /* sum low = direct low (arg, in $r1 on entry) + indirect low */
+ mov $r2 (adc $r2 $r4) /* sum high = direct high + indirect high (adc: presumably adds the carry of the low add) */
+
+ parm $r3 maddr 0x16c0 /* QUERY_ADDRESS_HIGH */
+ parm $r4 send $r3 /* fetch addr low while writing addr high */
+ send $r4 /* r3 = addr high, r4 = addr low */
+ send $r1 /* sum low */
+ mov $r5 0x1000
+ send (extrinsrt 0x0 $r5 0x0 0x10 0x10) /* GET_SHORT */
+
+ /* add 4 to the address */
+ mov $r1 0x4 /* $r1 is free again: sum low has already been sent */
+ mov $r4 (add $r4 $r1) /* addr low */
+ mov $r3 (adc $r3 0x0) /* addr high */
+ maddr 0x16c0 /* QUERY_ADDRESS_HIGH */
+ send $r3 /* addr high */
+ send $r4 /* addr low */
+ exit send $r2 /* sum high */
+ send (extrinsrt 0x0 $r5 0x0 0x10 0x10) /* GET_SHORT */
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
index 49c08911142..f068367c84e 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
@@ -394,3 +394,57 @@ uint32_t mme9097_conservative_raster_state[] = {
0x051480a1,
0x00004041,
};
+
+uint32_t mme9097_compute_counter[] = {
+/* Encoded words for the #mme9097_compute_counter section of com9097.mme,
+ * presumably one word per instruction (there are 24 of each). The label
+ * comments give word offsets into this array; the words found at those
+ * offsets are tagged individually below.
+ * 0x0003: iic_loop_start */
0x00000f11,
+/* 0x0006: iic_mul_start */
+/* 0x0007: iic_mul_body */
0x00004111,
0x00000211,
+/* 0x000c: iic_mul_cont */
+/* 0x000f: iic_mul_done */
0x00000301, /* word 0x0003: iic_loop_start */
0x00000411,
0x00000511,
0x00025827, /* word 0x0006: iic_mul_start */
0x0040c612, /* word 0x0007: iic_mul_body */
0x00013007,
0x07c2c312,
0x00006410,
0x0002ad10,
0x00004910, /* word 0x000c: iic_mul_cont */
0xfffe4007,
0x00029210,
0xffffff11, /* word 0x000f: iic_mul_done */
0x00002111,
0xfffcb817,
0x00002a11,
0x03410315,
0x03414415,
0x07410021,
0x000058c0,
0x0002a040,
+};
+
+uint32_t mme9097_compute_counter_to_query[] = { /* encoded #mme9097_compute_counter_to_query section of com9097.mme; presumably one word per instruction (19 of each) */
0x00000201,
0x03410315, /* same word as the SCRATCH[4] read in mme9097_compute_counter */
0x03414415, /* same word as the SCRATCH[5] read in mme9097_compute_counter */
0x0000c910,
0x00031210,
0x05b00351, /* presumably "parm $r3 maddr 0x16c0" (0x5b0 == 0x16c0 >> 2) */
0x00001c31,
0x00002041,
0x00000841,
0x04000511,
0x84014042, /* first GET_SHORT send */
0x00010111,
0x00006410,
0x00021b10,
0x05b00021, /* presumably "maddr 0x16c0" for the second write */
0x00001841,
0x00002041,
0x000010c1,
0x84014042, /* second GET_SHORT send, same encoding as the first */
+};
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
index 28e16367326..3ab2f5e3d7f 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -500,4 +500,36 @@ nvc0_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_SUF);
nvc0->dirty_cp |= NVC0_NEW_CP_SURFACES;
nvc0->images_dirty[5] |= nvc0->images_valid[5];
+
+ nvc0_update_compute_invocations_counter(nvc0, info);
+}
+
+/*
+ * Account for an indirect dispatch by invoking the COMPUTE_COUNTER macro.
+ *
+ * The macro call carries 7 words: the factor count (6), the three
+ * info->block[] values pushed inline, and 3 * 4 bytes that are referenced
+ * directly from the indirect buffer (presumably the three grid dimensions
+ * the dispatch itself consumes).
+ */
+static void
+nvc0_compute_update_indirect_invocations(struct nvc0_context *nvc0,
+                                         const struct pipe_grid_info *info) {
+   struct nv04_resource *indirect_buf = nv04_resource(info->indirect);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   uint32_t grid_offset = indirect_buf->offset + info->indirect_offset;
+   unsigned dim;
+
+   nouveau_pushbuf_space(push, 16, 0, 8);
+   PUSH_REFN(push, indirect_buf->bo, NOUVEAU_BO_RD | indirect_buf->domain);
+
+   BEGIN_1IC0(push, NVC0_3D(MACRO_COMPUTE_COUNTER), 7);
+   PUSH_DATA(push, 6);
+   for (dim = 0; dim < 3; dim++)
+      PUSH_DATA(push, info->block[dim]);
+
+   /* the remaining three factors come straight out of the indirect buffer */
+   nouveau_pushbuf_data(push, indirect_buf->bo, grid_offset,
+                        NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
+}
+
+/**
+ * Account for the shader invocations of one compute dispatch.
+ *
+ * Indirect dispatches (info->indirect set) are accumulated on the GPU via
+ * nvc0_compute_update_indirect_invocations(). Direct dispatches add
+ * block[0..2] * grid[0..2] to the CPU-side nvc0->compute_invocations.
+ *
+ * \param nvc0  context whose compute_invocations counter is updated
+ * \param info  description of the dispatch that was just launched
+ */
+void
+nvc0_update_compute_invocations_counter(struct nvc0_context *nvc0,
+                                        const struct pipe_grid_info *info) {
+   if (unlikely(info->indirect)) {
+      nvc0_compute_update_indirect_invocations(nvc0, info);
+   } else {
+      /* Widen the first operand so each product is evaluated in 64 bits.
+       * Otherwise the multiplication happens at the width of the array
+       * elements (presumably 32-bit unsigned - confirm against
+       * pipe_grid_info) and a large grid wraps before it is stored.
+       */
+      uint64_t invocations =
+         (uint64_t)info->block[0] * info->block[1] * info->block[2];
+      invocations *= (uint64_t)info->grid[0] * info->grid[1] * info->grid[2];
+      nvc0->compute_invocations += invocations;
+   }
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index c1351062676..4cfd207d4c0 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -282,6 +282,8 @@ struct nvc0_context {
uint16_t images_valid[6];
struct util_dynarray global_residents;
+
+ uint64_t compute_invocations;
};
static inline struct nvc0_context *
@@ -442,5 +444,7 @@ void nve4_launch_grid(struct pipe_context *, const struct pipe_grid_info *);
/* nvc0_compute.c */
void nvc0_launch_grid(struct pipe_context *, const struct pipe_grid_info *);
void nvc0_compute_validate_globals(struct nvc0_context *);
+void nvc0_update_compute_invocations_counter(struct nvc0_context *nvc0,
+ const struct pipe_grid_info *info);
#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
index 7aa06337950..f4842fd6d68 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
@@ -39,4 +39,8 @@
#define NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE 0x00003868
+#define NVC0_3D_MACRO_COMPUTE_COUNTER 0x00003870 /* bound to mme9097_compute_counter via MK_MACRO in nvc0_screen.c */
+
+#define NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY 0x00003878 /* bound to mme9097_compute_counter_to_query via MK_MACRO in nvc0_screen.c */
+
#endif /* __NVC0_MACROS_H__ */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
index f6d5d0f5602..b6a214ccd49 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -122,6 +122,22 @@ nvc0_hw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q)
FREE(hq);
}
+/*
+ * Emit the compute invocation count into a query buffer slot.
+ *
+ * Invokes the COMPUTE_COUNTER_TO_QUERY macro with 4 words: the low and high
+ * halves of the CPU-side nvc0->compute_invocations counter, followed by the
+ * high and low halves of the destination address
+ * (hq->bo->offset + hq->offset + offset).
+ */
+static void
+nvc0_hw_query_write_compute_invocations(struct nvc0_context *nvc0,
+                                        struct nvc0_hw_query *hq,
+                                        uint32_t offset)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   uint64_t direct_count;
+   uint64_t dst_addr;
+
+   nouveau_pushbuf_space(push, 16, 0, 8);
+   PUSH_REFN(push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+
+   /* read these only after space/ref handling, as the original did */
+   direct_count = nvc0->compute_invocations;
+   dst_addr = hq->bo->offset + hq->offset + offset;
+
+   BEGIN_1IC0(push, NVC0_3D(MACRO_COMPUTE_COUNTER_TO_QUERY), 4);
+   PUSH_DATA (push, direct_count);
+   PUSH_DATAh(push, direct_count);
+   PUSH_DATAh(push, dst_addr);
+   PUSH_DATA (push, dst_addr);
+}
+
static boolean
nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
{
@@ -198,7 +214,7 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
nvc0_hw_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */
nvc0_hw_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */
nvc0_hw_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
- ((uint64_t *)hq->data)[(12 + 10) * 2] = 0;
+ nvc0_hw_query_write_compute_invocations(nvc0, hq, 0xc0 + 0xa0);
break;
default:
break;
@@ -271,7 +287,7 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q)
nvc0_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
nvc0_hw_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */
nvc0_hw_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */
- ((uint64_t *)hq->data)[10 * 2] = 0;
+ nvc0_hw_query_write_compute_invocations(nvc0, hq, 0xa0);
break;
case PIPE_QUERY_TIMESTAMP_DISJOINT:
/* This query is not issued on GPU because disjoint is forced to false */
@@ -354,9 +370,8 @@ nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
res64[0] = data64[1] - data64[3];
break;
case PIPE_QUERY_PIPELINE_STATISTICS:
- for (i = 0; i < 10; ++i)
+ for (i = 0; i < 11; ++i)
res64[i] = data64[i * 2] - data64[24 + i * 2];
- result->pipeline_statistics.cs_invocations = 0;
break;
case NVC0_HW_QUERY_TFB_BUFFER_OFFSET:
res32[0] = hq->data[1];
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 216fba49d9e..6a79fd9a903 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -1308,6 +1308,8 @@ nvc0_screen_create(struct nouveau_device *dev)
MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count);
MK_MACRO(NVC0_3D_MACRO_QUERY_BUFFER_WRITE, mme9097_query_buffer_write);
MK_MACRO(NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE, mme9097_conservative_raster_state);
+ MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER, mme9097_compute_counter);
+ MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY, mme9097_compute_counter_to_query);
MK_MACRO(NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT, mme90c0_launch_grid_indirect);
BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index fcd7d9537f9..c5e4dec20bd 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -779,6 +779,8 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
BEGIN_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
PUSH_DATA (push, 0);
+ nvc0_update_compute_invocations_counter(nvc0, info);
+
out:
if (ret)
NOUVEAU_ERR("Failed to launch grid !\n");