From c52e92ec3a37c9ab3fb35132e62e1ddf6a770c27 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 25 May 2016 23:36:48 +0200 Subject: nvc0: allow to monitor MP perf counters with compute shaders To read out MP perf counters we use a compute shader and need to upload input data: a 64-bit address used to store the values and a sequence ID for synchronization. Currently, this input data is uploaded as user uniforms, which means that it is stuck in c0[]; if a compute shader from a real application is used, monitoring those performance counters will just overwrite some of its data and crash miserably. Instead, putting the 64-bit address and the sequence into the driver constant buffer seems much better and will allow monitoring counters with GL 4.3 apps. Tested on GF119 and GK110, but should not hurt anything on GK104. Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/nvc0/nvc0_context.h | 3 + .../drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 71 ++++++++++++++++------ 2 files changed, 55 insertions(+), 19 deletions(-) (limited to 'src/gallium/drivers/nouveau/nvc0') diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index 5be78aabebc..c0997583235 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -133,6 +133,9 @@ /* 8 surfaces, at 16 32-bits integers each */ #define NVC0_CB_AUX_SU_INFO(i) 0x400 + (i) * 16 * 4 #define NVC0_CB_AUX_SU_SIZE (NVC0_MAX_IMAGES * 16 * 4) +/* 1 64-bits address and 1 32-bits sequence */ +#define NVC0_CB_AUX_MP_INFO 0x600 +#define NVC0_CB_AUX_MP_SIZE 3 * 4 /* 4 32-bits floats for the vertex runout, put at the end */ #define NVC0_CB_AUX_RUNOUT_INFO NVC0_CB_USR_SIZE + NVC0_CB_AUX_SIZE diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index 54e0140ba62..27cbbc40c3c 100644 --- 
a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -372,9 +372,9 @@ static const uint64_t nve4_read_hw_sm_counters_code[] = * mov b32 $r6 $pm6 * mov b32 $r7 $pm7 * set $p0 0x1 eq u32 $r8 0x0 - * mov b32 $r10 c0[0x0] + * mov b32 $r10 c7[0x600] * ext u32 $r8 $r12 0x414 - * mov b32 $r11 c0[0x4] + * mov b32 $r11 c7[0x604] * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04 * ext u32 $r9 $r12 0x208 * (not $p0) exit @@ -392,7 +392,7 @@ static const uint64_t nve4_read_hw_sm_counters_code[] = * add b32 $r12 $c $r12 $r9 * st b128 wt g[$r10d] $r0q * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00 - * mov b32 $r0 c0[0x8] + * mov b32 $r0 c7[0x608] * add b32 $r13 $r13 0x0 $c * $p1 st b128 wt g[$r12d+0x40] $r4q * st b32 wt g[$r12d+0x50] $r0 @@ -410,9 +410,9 @@ static const uint64_t nve4_read_hw_sm_counters_code[] = 0x2c00000028019c04ULL, 0x2c0000002c01dc04ULL, 0x190e0000fc81dc03ULL, - 0x2800400000029de4ULL, + 0x28005c1800029de4ULL, 0x7000c01050c21c03ULL, - 0x280040001002dde4ULL, + 0x28005c181002dde4ULL, 0x204282020042e047ULL, 0x7000c00820c25c03ULL, 0x80000000000021e7ULL, @@ -430,7 +430,7 @@ static const uint64_t nve4_read_hw_sm_counters_code[] = 0x4801000024c31c03ULL, 0x9400000000a01fc5ULL, 0x200002e04202c047ULL, - 0x2800400020001de4ULL, + 0x28005c1820001de4ULL, 0x0800000000d35c42ULL, 0x9400000100c107c5ULL, 0x9400000140c01f85ULL, @@ -453,9 +453,9 @@ static const uint64_t nvf0_read_hw_sm_counters_code[] = 0x86400000051c001aULL, 0x86400000059c001eULL, 0xdb201c007f9c201eULL, - 0x64c03c00001c002aULL, + 0x64c03ce0c01c002aULL, 0xc00000020a1c3021ULL, - 0x64c03c00009c002eULL, + 0x64c03ce0c09c002eULL, 0x0810a0808010b810ULL, 0xc0000001041c3025ULL, 0x180000000020003cULL, @@ -473,7 +473,7 @@ static const uint64_t nvf0_read_hw_sm_counters_code[] = 0xe0840000049c3032ULL, 0xfe800000001c2800ULL, 0x080000b81080b010ULL, - 0x64c03c00011c0002ULL, + 0x64c03ce0c11c0002ULL, 0xe08040007f9c3436ULL, 0xfe80000020043010ULL, 0xfc800000281c3000ULL, @@ -1105,14 
+1105,14 @@ static const uint64_t nvc0_read_hw_sm_counters_code[] = * mov b32 $r6 $pm6 * mov b32 $r7 $pm7 * set $p0 0x1 eq u32 $r8 0x0 - * mov b32 $r10 c0[0x0] - * mov b32 $r11 c0[0x4] + * mov b32 $r10 c15[0x600] + * mov b32 $r11 c15[0x604] * ext u32 $r8 $r9 0x414 * (not $p0) exit * mul $r8 u32 $r8 u32 48 * add b32 $r10 $c $r10 $r8 * add b32 $r11 $r11 0x0 $c - * mov b32 $r8 c0[0x8] + * mov b32 $r8 c15[0x608] * st b128 wt g[$r10d+0x00] $r0q * st b128 wt g[$r10d+0x10] $r4q * st b32 wt g[$r10d+0x20] $r8 @@ -1128,14 +1128,14 @@ static const uint64_t nvc0_read_hw_sm_counters_code[] = 0x2c00000028019c04ULL, 0x2c0000002c01dc04ULL, 0x190e0000fc81dc03ULL, - 0x2800400000029de4ULL, - 0x280040001002dde4ULL, + 0x28007c1800029de4ULL, + 0x28007c181002dde4ULL, 0x7000c01050921c03ULL, 0x80000000000021e7ULL, 0x10000000c0821c02ULL, 0x4801000020a29c03ULL, 0x0800000000b2dc42ULL, - 0x2800400020021de4ULL, + 0x28007c1820021de4ULL, 0x9400000000a01fc5ULL, 0x9400000040a11fc5ULL, 0x9400000080a21f85ULL, @@ -1813,6 +1813,40 @@ nvc0_hw_sm_get_program(struct nvc0_screen *screen) return prog; } +static inline void +nvc0_hw_sm_upload_input(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; + uint64_t address; + const int s = 5; + + address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); + + PUSH_SPACE(push, 11); + + if (screen->base.class_3d >= NVE4_3D_CLASS) { + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, address + NVC0_CB_AUX_MP_INFO); + PUSH_DATA (push, address + NVC0_CB_AUX_MP_INFO); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); + PUSH_DATA (push, 3 * 4); + PUSH_DATA (push, 0x1); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 3); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); + } else { + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); + PUSH_DATA (push, 2048); + PUSH_DATAh(push, address); + PUSH_DATA (push, address); + BEGIN_1IC0(push, 
NVC0_CP(CB_POS), 1 + 3); + PUSH_DATA (push, NVC0_CB_AUX_MP_INFO); + } + PUSH_DATA (push, (hq->bo->offset + hq->base_offset)); + PUSH_DATAh(push, (hq->bo->offset + hq->base_offset)); + PUSH_DATA (push, hq->sequence); +} + static void nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) { @@ -1857,11 +1891,10 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) PUSH_SPACE(push, 1); IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0); - pipe->bind_compute_state(pipe, screen->pm.prog); - input[0] = (hq->bo->offset + hq->base_offset); - input[1] = (hq->bo->offset + hq->base_offset) >> 32; - input[2] = hq->sequence; + /* upload input data for the compute shader which reads MP counters */ + nvc0_hw_sm_upload_input(nvc0, hq); + pipe->bind_compute_state(pipe, screen->pm.prog); for (i = 0; i < 3; i++) { info.block[i] = block[i]; info.grid[i] = grid[i]; -- cgit v1.2.3