From 30dd0599250c4743ded25663d32c263ab226510c Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Sun, 8 Mar 2020 16:42:23 -0700
Subject: freedreno/computerator: add performance counter support

Signed-off-by: Rob Clark
Part-of:
---
 src/freedreno/computerator/a6xx.c      | 127 ++++++++++++++++++++++++++++++++
 src/freedreno/computerator/main.c      | 129 ++++++++++++++++++++++++++++++++-
 src/freedreno/computerator/main.h      |  25 ++++++
 src/freedreno/computerator/meson.build |   1 +
 4 files changed, 281 insertions(+), 1 deletion(-)

(limited to 'src/freedreno')

diff --git a/src/freedreno/computerator/a6xx.c b/src/freedreno/computerator/a6xx.c
index c9960d66a8d..df09116657c 100644
--- a/src/freedreno/computerator/a6xx.c
+++ b/src/freedreno/computerator/a6xx.c
@@ -40,9 +40,20 @@ struct a6xx_backend {
 
 	unsigned seqno;
 	struct fd_bo *control_mem;
+
+	/* performance counter state (see a6xx_set_perfcntrs()), with
+	 * query_mem holding one fd6_query_sample per counter:
+	 */
+	struct fd_bo *query_mem;
+	const struct perfcntr *perfcntrs;
+	unsigned num_perfcntrs;
 };
 define_cast(backend, a6xx_backend);
 
+/*
+ * Data structures shared with GPU:
+ */
+
 /* This struct defines the layout of the fd6_context::control buffer: */
 struct fd6_control {
 	uint32_t seqno;          /* seqno for async CP_EVENT_WRITE, etc */
@@ -65,6 +76,32 @@ struct fd6_control {
 #define control_ptr(a6xx_backend, member)  \
 	(a6xx_backend)->control_mem, offsetof(struct fd6_control, member), 0, 0
 
+
+/* Per-counter sample: 'start' and 'stop' are snapshots of the counter
+ * register taken before and after the grid is executed, and 'result'
+ * accumulates (stop - start):
+ */
+struct PACKED fd6_query_sample {
+	uint64_t start;
+	uint64_t result;
+	uint64_t stop;
+};
+
+
+/* offset of a single field of an array of fd6_query_sample, expanding to
+ * the trailing arguments of OUT_RELOC()/OUT_RELOCW():
+ */
+#define query_sample_idx(a6xx_backend, idx, field)        \
+	(a6xx_backend)->query_mem,                            \
+	((idx) * sizeof(struct fd6_query_sample)) +           \
+	offsetof(struct fd6_query_sample, field),             \
+	0, 0
+
+
+/*
+ * Backend implementation:
+ */
+
 static struct kernel *
 a6xx_assemble(struct backend *b, FILE *in)
 {
@@ -307,6 +344,8 @@ cache_flush(struct fd_ringbuffer *ring, struct kernel *kernel)
 static void
 a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3], struct fd_submit *submit)
 {
+	struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
+	struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
 	struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(submit, 0,
 			FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);
 
@@ -344,6 +383,37 @@ a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3], struct fd_submit *submit
 	OUT_RING(ring, 1);            /* HLSQ_CS_KERNEL_GROUP_Y */
 	OUT_RING(ring, 1);            /* HLSQ_CS_KERNEL_GROUP_Z */
 
+	if (a6xx_backend->num_perfcntrs > 0) {
+		/* NOTE(review): 'result' is only ever accumulated into, so this
+		 * presumably relies on a fresh bo being zero-filled -- confirm:
+		 */
+		a6xx_backend->query_mem = fd_bo_new(a6xx_backend->dev,
+				a6xx_backend->num_perfcntrs * sizeof(struct fd6_query_sample),
+				DRM_FREEDRENO_GEM_TYPE_KMEM, "query");
+
+		/* configure the performance counters to count the requested
+		 * countables:
+		 */
+		for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
+			const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
+
+			OUT_PKT4(ring, counter->select_reg, 1);
+			OUT_RING(ring, counter->selector);
+		}
+
+		OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
+
+		/* and snapshot the start values: */
+		for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
+			const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
+
+			OUT_PKT7(ring, CP_REG_TO_MEM, 3);
+			OUT_RING(ring, CP_REG_TO_MEM_0_64B |
+					CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
+			OUT_RELOCW(ring, query_sample_idx(a6xx_backend, i, start));
+		}
+	}
+
 	OUT_PKT7(ring, CP_EXEC_CS, 4);
 	OUT_RING(ring, 0x00000000);
 	OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(grid[0]));
@@ -352,9 +422,64 @@ a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3], struct fd_submit *submit
 
 	OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
 
+	if (a6xx_backend->num_perfcntrs > 0) {
+		/* snapshot the end values: */
+		for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
+			const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
+
+			OUT_PKT7(ring, CP_REG_TO_MEM, 3);
+			OUT_RING(ring, CP_REG_TO_MEM_0_64B |
+					CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
+			OUT_RELOCW(ring, query_sample_idx(a6xx_backend, i, stop));
+		}
+
+		/* and compute the result: */
+		for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
+			/* result += stop - start: */
+			OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
+			OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
+					CP_MEM_TO_MEM_0_NEG_C);
+			OUT_RELOCW(ring, query_sample_idx(a6xx_backend, i, result)); /* dst */
+			OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, result));  /* srcA */
+			OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, stop));    /* srcB */
+			OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, start));   /* srcC */
+		}
+	}
+
 	cache_flush(ring, kernel);
 }
 
+/* Record the counters to sample on the next a6xx_emit_grid().  The array
+ * is not copied, so it must stay valid until then:
+ */
+static void
+a6xx_set_perfcntrs(struct backend *b, const struct perfcntr *perfcntrs,
+		unsigned num_perfcntrs)
+{
+	struct a6xx_backend *a6xx_backend = to_a6xx_backend(b);
+
+	a6xx_backend->perfcntrs = perfcntrs;
+	a6xx_backend->num_perfcntrs = num_perfcntrs;
+}
+
+/* Prepare query_mem for CPU access and copy out each counter's accumulated
+ * result, in the same order as the array passed to a6xx_set_perfcntrs().
+ * 'results' must have room for num_perfcntrs entries, and query_mem must
+ * already have been allocated by a6xx_emit_grid():
+ */
+static void
+a6xx_read_perfcntrs(struct backend *b, uint64_t *results)
+{
+	struct a6xx_backend *a6xx_backend = to_a6xx_backend(b);
+
+	fd_bo_cpu_prep(a6xx_backend->query_mem, NULL, DRM_FREEDRENO_PREP_READ);
+	struct fd6_query_sample *samples = fd_bo_map(a6xx_backend->query_mem);
+
+	for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
+		results[i] = samples[i].result;
+	}
+}
+
 struct backend *
 a6xx_init(struct fd_device *dev, uint32_t gpu_id)
 {
@@ -364,6 +489,8 @@ a6xx_init(struct fd_device *dev, uint32_t gpu_id)
 		.assemble = a6xx_assemble,
 		.disassemble = a6xx_disassemble,
 		.emit_grid = a6xx_emit_grid,
+		.set_perfcntrs = a6xx_set_perfcntrs,
+		.read_perfcntrs = a6xx_read_perfcntrs,
 	};
 
 	a6xx_backend->compiler = ir3_compiler_create(dev, gpu_id);
diff --git a/src/freedreno/computerator/main.c b/src/freedreno/computerator/main.c
index 691fd5a8a4a..618812841e4 100644
--- a/src/freedreno/computerator/main.c
+++ b/src/freedreno/computerator/main.c
@@ -22,10 +22,14 @@
  */
 
 #include <getopt.h>
+#include <inttypes.h>
+#include <locale.h>
 #include <xf86drm.h>
 
 #include "util/u_math.h"
 
+#include "perfcntrs/freedreno_perfcntr.h"
+
 #include "main.h"
 
 
@@ -91,13 +95,14 @@ dump_hex(void *buf, int sz)
 	}
 }
 
-static const char *shortopts = "df:g:h";
+static const char *shortopts = "df:g:hp:";
 
 static const struct option longopts[] = {
 	{"disasm",   no_argument,       0, 'd'},
 	{"file",     required_argument, 0, 'f'},
 	{"groups",   required_argument, 0, 'g'},
 	{"help",     no_argument,       0, 'h'},
+	{"perfcntr", required_argument, 0, 'p'},
 	{0, 0, 0, 0}
 };
 
@@ -111,18 +116,119 @@ usage(const char *name)
 		"    -f, --file=FILE      read shader from file (instead of stdin)\n"
 		"    -g, --groups=X,Y,Z   use specified group size\n"
 		"    -h, --help           show this message\n"
+		"    -p, --perfcntr=LIST  sample specified performance counters (comma\n"
+		"                         separated list)\n"
 		,
 		name);
 }
 
+/* performance counter description: */
+static unsigned num_groups;
+static const struct fd_perfcntr_group *groups;
+
+/* Number of counters already handed out in each group, indexed like 'groups': */
+static unsigned *enabled_counters;
+
+/* Look up the countable called 'name' across all groups, allocate the next
+ * free counter of its group to it, and describe the pairing in 'c'.  Exits
+ * with an error if there is no such countable or if the group has run out
+ * of counters.  'name' is not copied, so it must outlive 'c':
+ */
+static void
+setup_counter(const char *name, struct perfcntr *c)
+{
+	for (unsigned i = 0; i < num_groups; i++) {
+		const struct fd_perfcntr_group *group = &groups[i];
+
+		for (int j = 0; j < group->num_countables; j++) {
+			const struct fd_perfcntr_countable *countable = &group->countables[j];
+
+			if (strcmp(name, countable->name) != 0)
+				continue;
+
+			/*
+			 * Allocate a counter to use to monitor the requested countable:
+			 */
+			if (enabled_counters[i] >= group->num_counters) {
+				errx(-1, "Too many counters selected in group: %s", group->name);
+			}
+
+			unsigned idx = enabled_counters[i]++;
+			const struct fd_perfcntr_counter *counter = &group->counters[idx];
+
+			/*
+			 * And initialize the perfcntr struct, pulling together the info
+			 * about selected counter and countable, to simplify life for the
+			 * backend:
+			 */
+			c->name = name;
+			c->select_reg = counter->select_reg;
+			c->counter_reg_lo = counter->counter_reg_lo;
+			c->counter_reg_hi = counter->counter_reg_hi;
+			c->selector = countable->selector;
+
+			return;
+		}
+	}
+
+	errx(-1, "could not find countable: %s", name);
+}
+
+/* Parse the comma separated list of countable names in 'perfcntrstr' and
+ * set up a counter for each.  Returns a malloc'd array with one entry per
+ * name, in list order, and stores its length in 'num_perfcntrs':
+ */
+static struct perfcntr *
+parse_perfcntrs(uint32_t gpu_id, const char *perfcntrstr, unsigned *num_perfcntrs)
+{
+	struct perfcntr *counters = NULL;
+	char *cnames, *s;
+	unsigned cnt = 0;
+
+	groups = fd_perfcntrs(gpu_id, &num_groups);
+	enabled_counters = calloc(num_groups, sizeof(enabled_counters[0]));
+	if (num_groups && !enabled_counters)
+		errx(1, "out of memory");
+
+	/* intentionally never freed, perfcntr::name points into this copy: */
+	cnames = strdup(perfcntrstr);
+	if (!cnames)
+		errx(1, "out of memory");
+
+	do {
+		char *name = cnames;
+
+		/* terminate this name and step past the separator, if any: */
+		s = strchr(cnames, ',');
+		if (s) {
+			s[0] = '\0';
+			cnames = &s[1];
+		}
+
+		counters = realloc(counters, ++cnt * sizeof(counters[0]));
+		if (!counters)
+			errx(1, "out of memory");
+		setup_counter(name, &counters[cnt-1]);
+	} while (s);
+
+	*num_perfcntrs = cnt;
+
+	return counters;
+}
+
 int
 main(int argc, char **argv)
 {
 	FILE *in = stdin;
+	const char *perfcntrstr = NULL;
+	struct perfcntr *perfcntrs = NULL;
+	unsigned num_perfcntrs = 0;
 	bool disasm = false;
 	uint32_t grid[3] = {0};
 	int opt, ret;
 
+	setlocale(LC_NUMERIC, "en_US.UTF-8");
+
 	while ((opt = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
 		switch (opt) {
 		case 'd':
@@ -140,6 +246,9 @@ main(int argc, char **argv)
 			break;
 		case 'h':
 			goto usage;
+		case 'p':
+			perfcntrstr = optarg;
+			break;
 		default:
 			printf("unrecognized arg: %c\n", opt);
 			goto usage;
@@ -185,6 +294,14 @@ main(int argc, char **argv)
 
 	struct fd_submit *submit = fd_submit_new(pipe);
 
+	if (perfcntrstr) {
+		if (!backend->set_perfcntrs) {
+			errx(1, "performance counters not supported");
+		}
+		perfcntrs = parse_perfcntrs(gpu_id, perfcntrstr, &num_perfcntrs);
+		backend->set_perfcntrs(backend, perfcntrs, num_perfcntrs);
+	}
+
 	backend->emit_grid(kernel, grid, submit);
 
 	fd_submit_flush(submit, -1, NULL, NULL);
@@ -198,6 +315,16 @@ main(int argc, char **argv)
 		dump_float(map, kernel->buf_sizes[i] * 4);
 	}
 
+	if (perfcntrstr) {
+		uint64_t results[num_perfcntrs];
+		backend->read_perfcntrs(backend, results);
+
+		for (unsigned i = 0; i < num_perfcntrs; i++) {
+			/* the ' flag groups digits according to LC_NUMERIC: */
+			printf("%s:\t%'"PRIu64"\n", perfcntrs[i].name, results[i]);
+		}
+	}
+
 	return 0;
 
 usage:
diff --git a/src/freedreno/computerator/main.h b/src/freedreno/computerator/main.h
index 9e9325aa7f6..57b1ac07cb6 100644
--- a/src/freedreno/computerator/main.h
+++ b/src/freedreno/computerator/main.h
@@ -46,12 +46,37 @@ struct kernel {
 	struct fd_bo *bufs[MAX_BUFS];
 };
 
+/* Describes one requested performance counter, ie. the pairing of a hw
+ * counter with the countable it has been configured to count:
+ */
+struct perfcntr {
+	const char *name;
+
+	/* for backend to configure/read the counter, describes
+	 * the selected counter:
+	 */
+	unsigned select_reg;
+	unsigned counter_reg_lo;
+	unsigned counter_reg_hi;
+	/* and selected countable:
+	 */
+	unsigned selector;
+};
+
 /* per-generation entry-points: */
 struct backend {
 	struct kernel *(*assemble)(struct backend *b, FILE *in);
 	void (*disassemble)(struct kernel *kernel, FILE *out);
 	void (*emit_grid)(struct kernel *kernel, uint32_t grid[3],
 			struct fd_submit *submit);
+
+	/* performance-counter API, set_perfcntrs may be NULL if unsupported.
+	 * Counters are set before emit_grid(), and one result per counter is
+	 * read back after the submit is flushed:
+	 */
+	void (*set_perfcntrs)(struct backend *b, const struct perfcntr *perfcntrs,
+			unsigned num_perfcntrs);
+	void (*read_perfcntrs)(struct backend *b, uint64_t *results);
 };
 
 #define define_cast(_from, _to)	\
diff --git a/src/freedreno/computerator/meson.build b/src/freedreno/computerator/meson.build
index 68a58efeaa6..80e6f66a772 100644
--- a/src/freedreno/computerator/meson.build
+++ b/src/freedreno/computerator/meson.build
@@ -56,6 +56,7 @@ computerator = executable(
   link_with : [
     libfreedreno_drm,
     libfreedreno_ir3,
+    libfreedreno_perfcntrs,
   ],
   dependencies : [
     dep_libdrm,
-- 
cgit v1.2.3