diff options
author | Nicolai Hähnle <[email protected]> | 2015-11-25 15:30:03 +0100 |
---|---|---|
committer | Nicolai Hähnle <[email protected]> | 2015-11-25 15:52:09 +0100 |
commit | ad22006892c5511dac7d0d680633a1b857da49fb (patch) | |
tree | 46c64acd6d362d3ca3060787f1549e850561cb5d /src/gallium/drivers/radeon/r600_perfcounter.c | |
parent | b9fc01aee75dcc2d56750ea430e32d74127faf69 (diff) |
radeonsi: implement AMD_performance_monitor for CIK+
Expose most of the performance counter groups that are exposed by Catalyst.
Ideally, the driver will work with GPUPerfStudio at some point, but we are not
quite there yet. In any case, this is the reason for grouping multiple
instances of hardware blocks in the way it is implemented.
The counters can also be shown using the Gallium HUD. If one is interested to
see how work is distributed across multiple shader engines, one can set the
environment variable RADEON_PC_SEPARATE_SE=1 to obtain finer-grained performance
counter groups.
Part of the implementation is in radeon because an implementation for
older hardware would largely follow along the same lines, but exposing
a different set of blocks which are programmed slightly differently.
Reviewed-by: Marek Olšák <[email protected]>
Diffstat (limited to 'src/gallium/drivers/radeon/r600_perfcounter.c')
-rw-r--r-- | src/gallium/drivers/radeon/r600_perfcounter.c | 636 |
1 files changed, 636 insertions, 0 deletions
diff --git a/src/gallium/drivers/radeon/r600_perfcounter.c b/src/gallium/drivers/radeon/r600_perfcounter.c new file mode 100644 index 00000000000..a710c042b27 --- /dev/null +++ b/src/gallium/drivers/radeon/r600_perfcounter.c @@ -0,0 +1,636 @@ +/* + * Copyright 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Nicolai Hähnle <[email protected]> + * + */ + +#include "util/u_memory.h" +#include "r600_query.h" +#include "r600_pipe_common.h" +#include "r600d_common.h" + +/* Max counters per HW block */ +#define R600_QUERY_MAX_COUNTERS 16 + +static const char * const r600_pc_shader_suffix[] = { + "", "_PS", "_VS", "_GS", "_ES", "_HS", "_LS", "_CS" +}; + +static struct r600_perfcounter_block * +lookup_counter(struct r600_perfcounters *pc, unsigned index, + unsigned *base_gid, unsigned *sub_index) +{ + struct r600_perfcounter_block *block = pc->blocks; + unsigned bid; + + *base_gid = 0; + for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { + unsigned total = block->num_groups * block->num_selectors; + + if (index < total) { + *sub_index = index; + return block; + } + + index -= total; + *base_gid += block->num_groups; + } + + return NULL; +} + +static struct r600_perfcounter_block * +lookup_group(struct r600_perfcounters *pc, unsigned *index) +{ + unsigned bid; + struct r600_perfcounter_block *block = pc->blocks; + + for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { + if (*index < block->num_groups) + return block; + *index -= block->num_groups; + } + + return NULL; +} + +struct r600_pc_group { + struct r600_pc_group *next; + struct r600_perfcounter_block *block; + unsigned sub_gid; /* only used during init */ + unsigned result_base; /* only used during init */ + int se; + int instance; + unsigned num_counters; + unsigned selectors[R600_QUERY_MAX_COUNTERS]; +}; + +struct r600_pc_counter { + unsigned base; + unsigned dwords; + unsigned stride; +}; + +struct r600_query_pc { + struct r600_query_hw b; + + unsigned shaders; + unsigned num_counters; + struct r600_pc_counter *counters; + struct r600_pc_group *groups; +}; + +static void r600_pc_query_destroy(struct r600_common_context *ctx, + struct r600_query *rquery) +{ + struct r600_query_pc *query = (struct r600_query_pc *)rquery; + + while (query->groups) { + struct r600_pc_group *group = query->groups; + query->groups = group->next; + FREE(group); + } + + FREE(query->counters); + + r600_query_hw_destroy(ctx, rquery); +} + +static void r600_pc_query_emit_start(struct r600_common_context *ctx, + struct r600_query_hw *hwquery, + struct r600_resource *buffer, uint64_t va) +{ + struct r600_perfcounters *pc = ctx->screen->perfcounters; + struct r600_query_pc *query = (struct r600_query_pc *)hwquery; + struct r600_pc_group *group; + int current_se = -1; + int current_instance = -1; + + if (query->shaders) + pc->emit_shaders(ctx, query->shaders); + + for (group = query->groups; group; group = group->next) { + struct r600_perfcounter_block *block = group->block; + + if (group->se != current_se || group->instance != current_instance) { + current_se = group->se; + current_instance = group->instance; + pc->emit_instance(ctx, group->se, group->instance); + } + + pc->emit_select(ctx, block, group->num_counters, group->selectors); + } + + if (current_se != -1 || current_instance != -1) + pc->emit_instance(ctx, -1, -1); + + pc->emit_start(ctx, buffer, va); +} + +static void r600_pc_query_emit_stop(struct r600_common_context *ctx, + struct r600_query_hw *hwquery, + struct r600_resource *buffer, uint64_t va) +{ + struct r600_perfcounters *pc = ctx->screen->perfcounters; + struct r600_query_pc *query = (struct r600_query_pc *)hwquery; + struct r600_pc_group *group; + + pc->emit_stop(ctx, buffer, va); + + for (group = query->groups; group; group = group->next) { + struct r600_perfcounter_block *block = group->block; + unsigned se = group->se >= 0 ? group->se : 0; + unsigned se_end = se + 1; + + if ((block->flags & R600_PC_BLOCK_SE) && (group->se < 0)) + se_end = ctx->screen->info.max_se; + + do { + unsigned instance = group->instance >= 0 ? group->instance : 0; + + do { + pc->emit_instance(ctx, se, instance); + pc->emit_read(ctx, block, + group->num_counters, group->selectors, + buffer, va); + va += 4 * group->num_counters; + } while (group->instance < 0 && ++instance < block->num_instances); + } while (++se < se_end); + } + + pc->emit_instance(ctx, -1, -1); +} + +static void r600_pc_query_clear_result(struct r600_query_hw *hwquery, + union pipe_query_result *result) +{ + struct r600_query_pc *query = (struct r600_query_pc *)hwquery; + + memset(result, 0, sizeof(result->batch[0]) * query->num_counters); +} + +static void r600_pc_query_add_result(struct r600_common_context *ctx, + struct r600_query_hw *hwquery, + void *buffer, + union pipe_query_result *result) +{ + struct r600_query_pc *query = (struct r600_query_pc *)hwquery; + uint32_t *results = buffer; + unsigned i, j; + + for (i = 0; i < query->num_counters; ++i) { + struct r600_pc_counter *counter = &query->counters[i]; + + if (counter->base == ~0) + continue; + + for (j = 0; j < counter->dwords; ++j) { + uint32_t value = results[counter->base + j * counter->stride]; + result->batch[i].u32 += value; + } + } +} + +static struct r600_query_ops batch_query_ops = { + .destroy = r600_pc_query_destroy, + .begin = r600_query_hw_begin, + .end = r600_query_hw_end, + .get_result = r600_query_hw_get_result +}; + +static struct r600_query_hw_ops batch_query_hw_ops = { + .emit_start = r600_pc_query_emit_start, + .emit_stop = r600_pc_query_emit_stop, + .clear_result = r600_pc_query_clear_result, + .add_result = r600_pc_query_add_result, +}; + +static struct r600_pc_group *get_group_state(struct r600_common_screen *screen, + struct r600_query_pc *query, + struct r600_perfcounter_block *block, + unsigned sub_gid) +{ + struct r600_pc_group *group = query->groups; + + while (group) { + if (group->block == block && group->sub_gid == sub_gid) + return group; + group = group->next; + } + + group = CALLOC_STRUCT(r600_pc_group); + if (!group) + return NULL; + + group->block = block; + group->sub_gid = sub_gid; + + if (block->flags & R600_PC_BLOCK_SHADER) { + unsigned sub_gids = block->num_instances; + unsigned shader_id; + unsigned shader_mask; + unsigned query_shader_mask; + + if (block->flags & R600_PC_BLOCK_SE_GROUPS) + sub_gids = sub_gids * screen->info.max_se; + shader_id = sub_gid / sub_gids; + sub_gid = sub_gid % sub_gids; + + if (shader_id == 0) + shader_mask = R600_PC_SHADER_ALL; + else + shader_mask = 1 << (shader_id - 1); + + query_shader_mask = query->shaders & R600_PC_SHADER_ALL; + if (query_shader_mask && query_shader_mask != shader_mask) { + fprintf(stderr, "r600_perfcounter: incompatible shader groups\n"); + FREE(group); + return NULL; + } + query->shaders |= shader_mask; + } + + if (block->flags & R600_PC_BLOCK_SHADER_WINDOWED) { + // A non-zero value in query->shaders ensures that the shader + // masking is reset unless the user explicitly requests one. + query->shaders |= R600_PC_SHADER_WINDOWING; + } + + if (block->flags & R600_PC_BLOCK_SE_GROUPS) { + group->se = sub_gid / block->num_instances; + sub_gid = sub_gid % block->num_instances; + } else { + group->se = -1; + } + + if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) { + group->instance = sub_gid; + } else { + group->instance = -1; + } + + group->next = query->groups; + query->groups = group; + + return group; +} + +struct pipe_query *r600_create_batch_query(struct pipe_context *ctx, + unsigned num_queries, + unsigned *query_types) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_common_screen *screen = rctx->screen; + struct r600_perfcounters *pc = screen->perfcounters; + struct r600_perfcounter_block *block; + struct r600_pc_group *group; + struct r600_query_pc *query; + unsigned base_gid, sub_gid, sub_index; + unsigned i, j; + + if (!pc) + return NULL; + + query = CALLOC_STRUCT(r600_query_pc); + if (!query) + return NULL; + + query->b.b.ops = &batch_query_ops; + query->b.ops = &batch_query_hw_ops; + query->b.flags = R600_QUERY_HW_FLAG_TIMER; + + query->num_counters = num_queries; + + /* Collect selectors per group */ + for (i = 0; i < num_queries; ++i) { + unsigned sub_gid; + + if (query_types[i] < R600_QUERY_FIRST_PERFCOUNTER) + goto error; + + block = lookup_counter(pc, query_types[i] - R600_QUERY_FIRST_PERFCOUNTER, + &base_gid, &sub_index); + if (!block) + goto error; + + sub_gid = sub_index / block->num_selectors; + sub_index = sub_index % block->num_selectors; + + group = get_group_state(screen, query, block, sub_gid); + if (!group) + goto error; + + if (group->num_counters >= block->num_counters) { + fprintf(stderr, + "perfcounter group %s: too many selected\n", + block->basename); + goto error; + } + group->selectors[group->num_counters] = sub_index; + ++group->num_counters; + } + + /* Compute result bases and CS size per group */ + query->b.num_cs_dw_begin = pc->num_start_cs_dwords; + query->b.num_cs_dw_end = pc->num_stop_cs_dwords; + + query->b.num_cs_dw_begin += pc->num_instance_cs_dwords; /* conservative */ + query->b.num_cs_dw_end += pc->num_instance_cs_dwords; + + i = 0; + for (group = query->groups; group; group = group->next) { + struct r600_perfcounter_block *block = group->block; + unsigned select_dw, read_dw; + unsigned instances = 1; + + if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0) + instances = rctx->screen->info.max_se; + if (group->instance < 0) + instances *= block->num_instances; + + group->result_base = i; + query->b.result_size += 4 * instances * group->num_counters; + i += instances * group->num_counters; + + pc->get_size(block, group->num_counters, group->selectors, + &select_dw, &read_dw); + query->b.num_cs_dw_begin += select_dw; + query->b.num_cs_dw_end += instances * read_dw; + query->b.num_cs_dw_begin += pc->num_instance_cs_dwords; /* conservative */ + query->b.num_cs_dw_end += instances * pc->num_instance_cs_dwords; + } + + if (query->shaders) { + if ((query->shaders & R600_PC_SHADER_ALL) == 0) + query->shaders |= R600_PC_SHADER_ALL; + query->b.num_cs_dw_begin += pc->num_shaders_cs_dwords; + } + + /* Map user-supplied query array to result indices */ + query->counters = CALLOC(num_queries, sizeof(*query->counters)); + for (i = 0; i < num_queries; ++i) { + struct r600_pc_counter *counter = &query->counters[i]; + struct r600_perfcounter_block *block; + + block = lookup_counter(pc, query_types[i] - R600_QUERY_FIRST_PERFCOUNTER, + &base_gid, &sub_index); + + sub_gid = sub_index / block->num_selectors; + sub_index = sub_index % block->num_selectors; + + group = get_group_state(screen, query, block, sub_gid); + assert(group != NULL); + + for (j = 0; j < group->num_counters; ++j) { + if (group->selectors[j] == sub_index) + break; + } + + counter->base = group->result_base + j; + counter->stride = group->num_counters; + + counter->dwords = 1; + if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0) + counter->dwords = screen->info.max_se; + if (group->instance < 0) + counter->dwords *= block->num_instances; + } + + if (!r600_query_hw_init(rctx, &query->b)) + goto error; + + return (struct pipe_query *)query; + +error: + r600_pc_query_destroy(rctx, &query->b.b); + return NULL; +} + +int r600_get_perfcounter_info(struct r600_common_screen *screen, + unsigned index, + struct pipe_driver_query_info *info) +{ + struct r600_perfcounters *pc = screen->perfcounters; + struct r600_perfcounter_block *block; + unsigned base_gid, sub; + + if (!pc) + return 0; + + if (!info) { + unsigned bid, num_queries = 0; + + for (bid = 0; bid < pc->num_blocks; ++bid) { + num_queries += pc->blocks[bid].num_selectors * + pc->blocks[bid].num_groups; + } + + return num_queries; + } + + block = lookup_counter(pc, index, &base_gid, &sub); + if (!block) + return 0; + + info->name = block->selector_names + sub * block->selector_name_stride; + info->query_type = R600_QUERY_FIRST_PERFCOUNTER + index; + info->max_value.u64 = 0; + info->type = PIPE_DRIVER_QUERY_TYPE_UINT; + info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE; + info->group_id = base_gid + sub / block->num_selectors; + info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; + return 1; +} + +int r600_get_perfcounter_group_info(struct r600_common_screen *screen, + unsigned index, + struct pipe_driver_query_group_info *info) +{ + struct r600_perfcounters *pc = screen->perfcounters; + struct r600_perfcounter_block *block; + + if (!pc) + return 0; + + if (!info) + return pc->num_groups; + + block = lookup_group(pc, &index); + if (!block) + return 0; + info->name = block->group_names + index * block->group_name_stride; + info->num_queries = block->num_selectors; + info->max_active_queries = block->num_counters; + return 1; +} + +void r600_perfcounters_destroy(struct r600_common_screen *rscreen) +{ + if (rscreen->perfcounters) + rscreen->perfcounters->cleanup(rscreen); +} + +boolean r600_perfcounters_init(struct r600_perfcounters *pc, + unsigned num_blocks) +{ + pc->blocks = CALLOC(num_blocks, sizeof(struct r600_perfcounter_block)); + if (!pc->blocks) + return FALSE; + + pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", FALSE); + pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", FALSE); + + return TRUE; +} + +boolean r600_perfcounters_add_block(struct r600_common_screen *rscreen, + struct r600_perfcounters *pc, + const char *name, unsigned flags, + unsigned counters, unsigned selectors, + unsigned instances, void *data) +{ + struct r600_perfcounter_block *block = &pc->blocks[pc->num_blocks]; + unsigned i, j, k; + unsigned groups_shader = 1, groups_se = 1, groups_instance = 1; + unsigned namelen; + char *groupname; + char *p; + + assert(counters <= R600_QUERY_MAX_COUNTERS); + + block->basename = name; + block->flags = flags; + block->num_counters = counters; + block->num_selectors = selectors; + block->num_instances = MAX2(instances, 1); + block->data = data; + + if (pc->separate_se && (block->flags & R600_PC_BLOCK_SE)) + block->flags |= R600_PC_BLOCK_SE_GROUPS; + if (pc->separate_instance && block->num_instances > 1) + block->flags |= R600_PC_BLOCK_INSTANCE_GROUPS; + + if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) { + groups_instance = block->num_instances; + block->num_groups = groups_instance; + } else { + block->num_groups = 1; + } + + if (block->flags & R600_PC_BLOCK_SE_GROUPS) { + groups_se = rscreen->info.max_se; + block->num_groups *= groups_se; + } + + if (block->flags & R600_PC_BLOCK_SHADER) { + groups_shader = ARRAY_SIZE(r600_pc_shader_suffix); + block->num_groups *= groups_shader; + } + + namelen = strlen(name); + block->group_name_stride = namelen + 1; + if (block->flags & R600_PC_BLOCK_SHADER) + block->group_name_stride += 3; + if (block->flags & R600_PC_BLOCK_SE_GROUPS) { + assert(groups_se <= 10); + block->group_name_stride += 1; + + if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) + block->group_name_stride += 1; + } + if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) { + assert(groups_instance <= 100); + block->group_name_stride += 2; + } + + block->group_names = MALLOC(block->num_groups * block->group_name_stride); + if (!block->group_names) + goto error; + + groupname = block->group_names; + for (i = 0; i < groups_shader; ++i) { + unsigned shaderlen = strlen(r600_pc_shader_suffix[i]); + for (j = 0; j < groups_se; ++j) { + for (k = 0; k < groups_instance; ++k) { + strcpy(groupname, name); + p = groupname + namelen; + + if (block->flags & R600_PC_BLOCK_SHADER) { + strcpy(p, r600_pc_shader_suffix[i]); + p += shaderlen; + } + + if (block->flags & R600_PC_BLOCK_SE_GROUPS) { + p += sprintf(p, "%d", j); + if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) + *p++ = '_'; + } + + if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) + p += sprintf(p, "%d", k); + + groupname += block->group_name_stride; + } + } + } + + assert(selectors <= 1000); + block->selector_name_stride = block->group_name_stride + 4; + block->selector_names = MALLOC(block->num_groups * selectors * + block->selector_name_stride); + if (!block->selector_names) + goto error_groupnames; + + groupname = block->group_names; + p = block->selector_names; + for (i = 0; i < block->num_groups; ++i) { + for (j = 0; j < selectors; ++j) { + sprintf(p, "%s_%03d", groupname, j); + p += block->selector_name_stride; + } + groupname += block->group_name_stride; + } + + ++pc->num_blocks; + pc->num_groups += block->num_groups; + + return TRUE; + +error_groupnames: + FREE(block->group_names); +error: + return FALSE; +} + +void r600_perfcounters_do_destroy(struct r600_perfcounters *pc) +{ + unsigned i; + + for (i = 0; i < pc->num_blocks; ++i) { + FREE(pc->blocks[i].group_names); + FREE(pc->blocks[i].selector_names); + } + FREE(pc->blocks); + FREE(pc); +} |