-rw-r--r--   src/mesa/drivers/dri/i965/Makefile.sources            |    1
-rw-r--r--   src/mesa/drivers/dri/i965/brw_context.c               |    4
-rw-r--r--   src/mesa/drivers/dri/i965/brw_context.h               |   43
-rw-r--r--   src/mesa/drivers/dri/i965/brw_performance_monitor.c   | 1471
-rw-r--r--   src/mesa/drivers/dri/i965/intel_batchbuffer.c         |    9
-rw-r--r--   src/mesa/drivers/dri/i965/intel_extensions.c          |   70
6 files changed, 1 insertion(+), 1597 deletions(-)
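This commit removes the i965 driver's GL_AMD_performance_monitor implementation: brw_performance_monitor.c itself, its hooks in context creation and batchbuffer handling, and the OACONTROL writability check that gated the extension. For orientation, the sketch below shows the client-side lifecycle that the deleted driver hooks (brw_new_perf_monitor, brw_begin_perf_monitor, brw_end_perf_monitor) served. It is a minimal sketch, not code from this tree: it assumes a context that still exposes the extension, prototypes available via GL_GLEXT_PROTOTYPES, and it simply enables every counter in the first group with no error checking.

#define GL_GLEXT_PROTOTYPES
#include <GL/gl.h>
#include <GL/glext.h>

/* Enumerate the first counter group, enable all of its counters, and
 * bracket a workload with Begin/End.  glBeginPerfMonitorAMD() is the
 * entry point that reached the brw_begin_perf_monitor() hook removed
 * in the patch below; counts are clamped to 64 purely to keep the
 * sketch short.
 */
static GLuint start_and_stop_monitor(void)
{
   GLint num_groups = 0;
   GLuint group = 0;
   glGetPerfMonitorGroupsAMD(&num_groups, 1, &group);

   GLint num_counters = 0, max_active = 0;
   glGetPerfMonitorCountersAMD(group, &num_counters, &max_active, 0, NULL);
   if (num_counters > 64)
      num_counters = 64;

   GLuint counters[64];
   glGetPerfMonitorCountersAMD(group, &num_counters, &max_active,
                               num_counters, counters);

   GLuint monitor;
   glGenPerfMonitorsAMD(1, &monitor);
   glSelectPerfMonitorCountersAMD(monitor, GL_TRUE, group,
                                  num_counters, counters);

   glBeginPerfMonitorAMD(monitor);
   /* ... issue the draw calls to be measured ... */
   glEndPerfMonitorAMD(monitor);
   return monitor;
}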
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index 0a7ba1bb34d..dd546826d19 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -134,7 +134,6 @@ i965_FILES = \ brw_multisample_state.h \ brw_nir_uniforms.cpp \ brw_object_purgeable.c \ - brw_performance_monitor.c \ brw_pipe_control.c \ brw_program.c \ brw_program.h \ diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 4ca77c789b4..45490a0f5cf 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -1136,10 +1136,6 @@ brwCreateContext(gl_api api, _mesa_initialize_dispatch_tables(ctx); _mesa_initialize_vbo_vtxfmt(ctx); - if (ctx->Extensions.AMD_performance_monitor) { - brw_init_performance_monitors(brw); - } - vbo_use_buffer_objects(ctx); vbo_always_unmap_buffers(ctx); diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index a2817131a50..2dd2686e033 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1170,43 +1170,6 @@ struct brw_context bool supported; } predicate; - struct { - /** A map from pipeline statistics counter IDs to MMIO addresses. */ - const int *statistics_registers; - - /** The number of active monitors using OA counters. */ - unsigned oa_users; - - /** - * A buffer object storing OA counter snapshots taken at the start and - * end of each batch (creating "bookends" around the batch). - */ - drm_intel_bo *bookend_bo; - - /** The number of snapshots written to bookend_bo. */ - int bookend_snapshots; - - /** - * An array of monitors whose results haven't yet been assembled based on - * the data in buffer objects. - * - * These may be active, or have already ended. However, the results - * have not been requested. - */ - struct brw_perf_monitor_object **unresolved; - int unresolved_elements; - int unresolved_array_size; - - /** - * Mapping from a uint32_t offset within an OA snapshot to the ID of - * the counter which MI_REPORT_PERF_COUNT stores there. - */ - const int *oa_snapshot_layout; - - /** Number of 32-bit entries in a hardware counter snapshot. 
*/ - int entries_per_oa_snapshot; - } perfmon; - int num_atoms[BRW_NUM_PIPELINES]; const struct brw_tracked_state render_atoms[76]; const struct brw_tracked_state compute_atoms[11]; @@ -1522,12 +1485,6 @@ bool brw_render_target_supported(struct brw_context *brw, struct gl_renderbuffer *rb); uint32_t brw_depth_format(struct brw_context *brw, mesa_format format); -/* brw_performance_monitor.c */ -void brw_init_performance_monitors(struct brw_context *brw); -void brw_dump_perf_monitors(struct brw_context *brw); -void brw_perf_monitor_new_batch(struct brw_context *brw); -void brw_perf_monitor_finish_batch(struct brw_context *brw); - /* intel_buffer_objects.c */ int brw_bo_map(struct brw_context *brw, drm_intel_bo *bo, int write_enable, const char *bo_name); diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c deleted file mode 100644 index f8e50e10fa3..00000000000 --- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c +++ /dev/null @@ -1,1471 +0,0 @@ -/* - * Copyright © 2013 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -/** - * \file brw_performance_monitor.c - * - * Implementation of the GL_AMD_performance_monitor extension. - * - * On Gen5+ hardware, we have two sources of performance counter data: - * the Observability Architecture counters (MI_REPORT_PERF_COUNT), and - * the Pipeline Statistics Registers. We expose both sets of raw data, - * as well as some useful processed values. - * - * The Observability Architecture (OA) counters for Gen6+ are documented - * in a separate document from the rest of the PRMs. It is available at: - * https://01.org/linuxgraphics/documentation/driver-documentation-prms - * => 2013 Intel Core Processor Family => Observability Performance Counters - * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell.) - * - * On Ironlake, the OA counters were called "CHAPS" counters. Sadly, no public - * documentation exists; our implementation is based on the source code for the - * intel_perf_counters utility (which is available as part of intel-gpu-tools). 
- */ - -#include <limits.h> - -#include "util/bitset.h" -#include "main/hash.h" -#include "main/macros.h" -#include "main/mtypes.h" -#include "main/performance_monitor.h" - -#include "util/ralloc.h" - -#include "brw_context.h" -#include "brw_defines.h" -#include "intel_batchbuffer.h" - -#define FILE_DEBUG_FLAG DEBUG_PERFMON - -/** - * i965 representation of a performance monitor object. - */ -struct brw_perf_monitor_object -{ - /** The base class. */ - struct gl_perf_monitor_object base; - - /** - * BO containing OA counter snapshots at monitor Begin/End time. - */ - drm_intel_bo *oa_bo; - - /** Indexes into bookend_bo (snapshot numbers) for various segments. */ - int oa_head_end; - int oa_middle_start; - int oa_tail_start; - - /** - * Storage for OA results accumulated so far. - * - * An array indexed by the counter ID in the OA_COUNTERS group. - * - * When we run out of space in bookend_bo, we compute the results so far - * and add them to the value stored here. Then, we can discard bookend_bo. - */ - uint32_t *oa_results; - - /** - * BO containing starting and ending snapshots for any active pipeline - * statistics counters. - */ - drm_intel_bo *pipeline_stats_bo; - - /** - * Storage for final pipeline statistics counter results. - */ - uint64_t *pipeline_stats_results; -}; - -/** Downcasting convenience macro. */ -static inline struct brw_perf_monitor_object * -brw_perf_monitor(struct gl_perf_monitor_object *m) -{ - return (struct brw_perf_monitor_object *) m; -} - -#define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048 - -/* A random value used to ensure we're getting valid snapshots. */ -#define REPORT_ID 0xd2e9c607 - -/******************************************************************************/ - -#define COUNTER(name) \ - { \ - .Name = name, \ - .Type = GL_UNSIGNED_INT, \ - .Minimum = { .u32 = 0 }, \ - .Maximum = { .u32 = ~0 }, \ - } - -#define COUNTER64(name) \ - { \ - .Name = name, \ - .Type = GL_UNSIGNED_INT64_AMD, \ - .Minimum = { .u64 = 0 }, \ - .Maximum = { .u64 = ~0 }, \ - } - -#define GROUP(name, max_active, counter_list) \ - { \ - .Name = name, \ - .MaxActiveCounters = max_active, \ - .Counters = counter_list, \ - .NumCounters = ARRAY_SIZE(counter_list), \ - } - -/** Performance Monitor Group IDs */ -enum brw_counter_groups { - OA_COUNTERS, /* Observability Architecture (MI_REPORT_PERF_COUNT) Counters */ - PIPELINE_STATS_COUNTERS, /* Pipeline Statistics Register Counters */ -}; - -/** - * Ironlake: - * @{ - * - * The list of CHAPS counters unfortunately does not appear in any public - * documentation, but is available by reading the source code for the - * intel_perf_counters utility (shipped as part of intel-gpu-tools). 
- */ -static const struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = { - COUNTER("cycles the CS unit is starved"), - COUNTER("cycles the CS unit is stalled"), - COUNTER("cycles the VF unit is starved"), - COUNTER("cycles the VF unit is stalled"), - COUNTER("cycles the VS unit is starved"), - COUNTER("cycles the VS unit is stalled"), - COUNTER("cycles the GS unit is starved"), - COUNTER("cycles the GS unit is stalled"), - COUNTER("cycles the CL unit is starved"), - COUNTER("cycles the CL unit is stalled"), - COUNTER("cycles the SF unit is starved"), - COUNTER("cycles the SF unit is stalled"), - COUNTER("cycles the WZ unit is starved"), - COUNTER("cycles the WZ unit is stalled"), - COUNTER("Z buffer read/write"), - COUNTER("cycles each EU was active"), - COUNTER("cycles each EU was suspended"), - COUNTER("cycles threads loaded all EUs"), - COUNTER("cycles filtering active"), - COUNTER("cycles PS threads executed"), - COUNTER("subspans written to RC"), - COUNTER("bytes read for texture reads"), - COUNTER("texels returned from sampler"), - COUNTER("polygons not culled"), - COUNTER("clocks MASF has valid message"), - COUNTER("64b writes/reads from RC"), - COUNTER("reads on dataport"), - COUNTER("clocks MASF has valid msg not consumed by sampler"), - COUNTER("cycles any EU is stalled for math"), -}; - -static const int gen5_oa_snapshot_layout[] = -{ - -1, /* Report ID */ - -1, /* TIMESTAMP (64-bit) */ - -1, /* ...second half... */ - 0, /* cycles the CS unit is starved */ - 1, /* cycles the CS unit is stalled */ - 2, /* cycles the VF unit is starved */ - 3, /* cycles the VF unit is stalled */ - 4, /* cycles the VS unit is starved */ - 5, /* cycles the VS unit is stalled */ - 6, /* cycles the GS unit is starved */ - 7, /* cycles the GS unit is stalled */ - 8, /* cycles the CL unit is starved */ - 9, /* cycles the CL unit is stalled */ - 10, /* cycles the SF unit is starved */ - 11, /* cycles the SF unit is stalled */ - 12, /* cycles the WZ unit is starved */ - 13, /* cycles the WZ unit is stalled */ - 14, /* Z buffer read/write */ - 15, /* cycles each EU was active */ - 16, /* cycles each EU was suspended */ - 17, /* cycles threads loaded all EUs */ - 18, /* cycles filtering active */ - 19, /* cycles PS threads executed */ - 20, /* subspans written to RC */ - 21, /* bytes read for texture reads */ - 22, /* texels returned from sampler */ - 23, /* polygons not culled */ - 24, /* clocks MASF has valid message */ - 25, /* 64b writes/reads from RC */ - 26, /* reads on dataport */ - 27, /* clocks MASF has valid msg not consumed by sampler */ - 28, /* cycles any EU is stalled for math */ -}; - -static const struct gl_perf_monitor_group gen5_groups[] = { - [OA_COUNTERS] = GROUP("CHAPS Counters", INT_MAX, gen5_raw_chaps_counters), - /* Our pipeline statistics counter handling requires hardware contexts. */ -}; -/** @} */ - -/** - * Sandybridge: - * @{ - * - * A few of the counters here (A17-A20) are not included in the latest - * documentation, but are described in the Ironlake PRM (which strangely - * documents Sandybridge's performance counter system, not Ironlake's). - * It's unclear whether they work or not; empirically, they appear to. - */ - -/** - * Aggregating counters A0-A28: - */ -static const struct gl_perf_monitor_counter gen6_raw_oa_counters[] = { - /* A0: 0 */ COUNTER("Aggregated Core Array Active"), - /* A1: 1 */ COUNTER("Aggregated Core Array Stalled"), - /* A2: 2 */ COUNTER("Vertex Shader Active Time"), - /* A3: Not actually hooked up on Sandybridge. 
*/ - /* A4: 3 */ COUNTER("Vertex Shader Stall Time - Core Stall"), - /* A5: 4 */ COUNTER("# VS threads loaded"), - /* A6: 5 */ COUNTER("Vertex Shader Ready but not running Time"), - /* A7: 6 */ COUNTER("Geometry Shader Active Time"), - /* A8: Not actually hooked up on Sandybridge. */ - /* A9: 7 */ COUNTER("Geometry Shader Stall Time - Core Stall"), - /* A10: 8 */ COUNTER("# GS threads loaded"), - /* A11: 9 */ COUNTER("Geometry Shader Ready but not running Time"), - /* A12: 10 */ COUNTER("Pixel Shader Active Time"), - /* A13: Not actually hooked up on Sandybridge. */ - /* A14: 11 */ COUNTER("Pixel Shader Stall Time - Core Stall"), - /* A15: 12 */ COUNTER("# PS threads loaded"), - /* A16: 13 */ COUNTER("Pixel Shader Ready but not running Time"), - /* A17: 14 */ COUNTER("Early Z Test Pixels Passing"), - /* A18: 15 */ COUNTER("Early Z Test Pixels Failing"), - /* A19: 16 */ COUNTER("Early Stencil Test Pixels Passing"), - /* A20: 17 */ COUNTER("Early Stencil Test Pixels Failing"), - /* A21: 18 */ COUNTER("Pixel Kill Count"), - /* A22: 19 */ COUNTER("Alpha Test Pixels Failed"), - /* A23: 20 */ COUNTER("Post PS Stencil Pixels Failed"), - /* A24: 21 */ COUNTER("Post PS Z buffer Pixels Failed"), - /* A25: 22 */ COUNTER("Pixels/samples Written in the frame buffer"), - /* A26: 23 */ COUNTER("GPU Busy"), - /* A27: 24 */ COUNTER("CL active and not stalled"), - /* A28: 25 */ COUNTER("SF active and stalled"), -}; - -/** - * Sandybridge: Counter Select = 001 - * A0 A1 A2 A3 A4 TIMESTAMP RPT_ID - * A5 A6 A7 A8 A9 A10 A11 A12 - * A13 A14 A15 A16 A17 A18 A19 A20 - * A21 A22 A23 A24 A25 A26 A27 A28 - * - * (Yes, this is a strange order.) We also have to remap for missing counters. - */ -static const int gen6_oa_snapshot_layout[] = -{ - -1, /* Report ID */ - -1, /* TIMESTAMP (64-bit) */ - -1, /* ...second half... 
*/ - 3, /* A4: Vertex Shader Stall Time - Core Stall */ - -1, /* A3: (not available) */ - 2, /* A2: Vertex Shader Active Time */ - 1, /* A1: Aggregated Core Array Stalled */ - 0, /* A0: Aggregated Core Array Active */ - 10, /* A12: Pixel Shader Active Time */ - 9, /* A11: Geometry Shader ready but not running Time */ - 8, /* A10: # GS threads loaded */ - 7, /* A9: Geometry Shader Stall Time - Core Stall */ - -1, /* A8: (not available) */ - 6, /* A7: Geometry Shader Active Time */ - 5, /* A6: Vertex Shader ready but not running Time */ - 4, /* A5: # VS Threads Loaded */ - 17, /* A20: Early Stencil Test Pixels Failing */ - 16, /* A19: Early Stencil Test Pixels Passing */ - 15, /* A18: Early Z Test Pixels Failing */ - 14, /* A17: Early Z Test Pixels Passing */ - 13, /* A16: Pixel Shader ready but not running Time */ - 12, /* A15: # PS threads loaded */ - 11, /* A14: Pixel Shader Stall Time - Core Stall */ - -1, /* A13: (not available) */ - 25, /* A28: SF active and stalled */ - 24, /* A27: CL active and not stalled */ - 23, /* A26: GPU Busy */ - 22, /* A25: Pixels/samples Written in the frame buffer */ - 21, /* A24: Post PS Z buffer Pixels Failed */ - 20, /* A23: Post PS Stencil Pixels Failed */ - 19, /* A22: Alpha Test Pixels Failed */ - 18, /* A21: Pixel Kill Count */ -}; - -static const struct gl_perf_monitor_counter gen6_statistics_counters[] = { - COUNTER64("IA_VERTICES_COUNT"), - COUNTER64("IA_PRIMITIVES_COUNT"), - COUNTER64("VS_INVOCATION_COUNT"), - COUNTER64("GS_INVOCATION_COUNT"), - COUNTER64("GS_PRIMITIVES_COUNT"), - COUNTER64("CL_INVOCATION_COUNT"), - COUNTER64("CL_PRIMITIVES_COUNT"), - COUNTER64("PS_INVOCATION_COUNT"), - COUNTER64("PS_DEPTH_COUNT"), - COUNTER64("SO_NUM_PRIMS_WRITTEN"), - COUNTER64("SO_PRIM_STORAGE_NEEDED"), -}; - -/** MMIO register addresses for each pipeline statistics counter. 
*/ -static const int gen6_statistics_register_addresses[] = { - IA_VERTICES_COUNT, - IA_PRIMITIVES_COUNT, - VS_INVOCATION_COUNT, - GS_INVOCATION_COUNT, - GS_PRIMITIVES_COUNT, - CL_INVOCATION_COUNT, - CL_PRIMITIVES_COUNT, - PS_INVOCATION_COUNT, - PS_DEPTH_COUNT, - GEN6_SO_NUM_PRIMS_WRITTEN, - GEN6_SO_PRIM_STORAGE_NEEDED, -}; - -static const struct gl_perf_monitor_group gen6_groups[] = { - GROUP("Observability Architecture Counters", INT_MAX, gen6_raw_oa_counters), - GROUP("Pipeline Statistics Registers", INT_MAX, gen6_statistics_counters), -}; -/** @} */ - -/** - * Ivybridge/Baytrail/Haswell: - * @{ - */ -static const struct gl_perf_monitor_counter gen7_raw_oa_counters[] = { - COUNTER("Aggregated Core Array Active"), - COUNTER("Aggregated Core Array Stalled"), - COUNTER("Vertex Shader Active Time"), - COUNTER("Vertex Shader Stall Time - Core Stall"), - COUNTER("# VS threads loaded"), - COUNTER("Hull Shader Active Time"), - COUNTER("Hull Shader Stall Time - Core Stall"), - COUNTER("# HS threads loaded"), - COUNTER("Domain Shader Active Time"), - COUNTER("Domain Shader Stall Time - Core Stall"), - COUNTER("# DS threads loaded"), - COUNTER("Compute Shader Active Time"), - COUNTER("Compute Shader Stall Time - Core Stall"), - COUNTER("# CS threads loaded"), - COUNTER("Geometry Shader Active Time"), - COUNTER("Geometry Shader Stall Time - Core Stall"), - COUNTER("# GS threads loaded"), - COUNTER("Pixel Shader Active Time"), - COUNTER("Pixel Shader Stall Time - Core Stall"), - COUNTER("# PS threads loaded"), - COUNTER("HiZ Fast Z Test Pixels Passing"), - COUNTER("HiZ Fast Z Test Pixels Failing"), - COUNTER("Slow Z Test Pixels Passing"), - COUNTER("Slow Z Test Pixels Failing"), - COUNTER("Pixel Kill Count"), - COUNTER("Alpha Test Pixels Failed"), - COUNTER("Post PS Stencil Pixels Failed"), - COUNTER("Post PS Z buffer Pixels Failed"), - COUNTER("3D/GPGPU Render Target Writes"), - COUNTER("Render Engine Busy"), - COUNTER("VS bottleneck"), - COUNTER("GS bottleneck"), -}; - -/** - * Ivybridge/Baytrail/Haswell: Counter Select = 101 - * A4 A3 A2 A1 A0 TIMESTAMP ReportID - * A12 A11 A10 A9 A8 A7 A6 A5 - * A20 A19 A18 A17 A16 A15 A14 A13 - * A28 A27 A26 A25 A24 A23 A22 A21 - * A36 A35 A34 A33 A32 A31 A30 A29 - * A44 A43 A42 A41 A40 A39 A38 A37 - * B7 B6 B5 B4 B3 B2 B1 B0 - * Rsv Rsv Rsv Rsv Rsv Rsv Rsv Rsv - */ -static const int gen7_oa_snapshot_layout[] = -{ - -1, /* Report ID */ - -1, /* TIMESTAMP (64-bit) */ - -1, /* ...second half... 
*/ - 0, /* A0: Aggregated Core Array Active */ - 1, /* A1: Aggregated Core Array Stalled */ - 2, /* A2: Vertex Shader Active Time */ - -1, /* A3: Reserved */ - 3, /* A4: Vertex Shader Stall Time - Core Stall */ - 4, /* A5: # VS threads loaded */ - -1, /* A6: Reserved */ - 5, /* A7: Hull Shader Active Time */ - -1, /* A8: Reserved */ - 6, /* A9: Hull Shader Stall Time - Core Stall */ - 7, /* A10: # HS threads loaded */ - -1, /* A11: Reserved */ - 8, /* A12: Domain Shader Active Time */ - -1, /* A13: Reserved */ - 9, /* A14: Domain Shader Stall Time - Core Stall */ - 10, /* A15: # DS threads loaded */ - -1, /* A16: Reserved */ - 11, /* A17: Compute Shader Active Time */ - -1, /* A18: Reserved */ - 12, /* A19: Compute Shader Stall Time - Core Stall */ - 13, /* A20: # CS threads loaded */ - -1, /* A21: Reserved */ - 14, /* A22: Geometry Shader Active Time */ - -1, /* A23: Reserved */ - 15, /* A24: Geometry Shader Stall Time - Core Stall */ - 16, /* A25: # GS threads loaded */ - -1, /* A26: Reserved */ - 17, /* A27: Pixel Shader Active Time */ - -1, /* A28: Reserved */ - 18, /* A29: Pixel Shader Stall Time - Core Stall */ - 19, /* A30: # PS threads loaded */ - -1, /* A31: Reserved */ - 20, /* A32: HiZ Fast Z Test Pixels Passing */ - 21, /* A33: HiZ Fast Z Test Pixels Failing */ - 22, /* A34: Slow Z Test Pixels Passing */ - 23, /* A35: Slow Z Test Pixels Failing */ - 24, /* A36: Pixel Kill Count */ - 25, /* A37: Alpha Test Pixels Failed */ - 26, /* A38: Post PS Stencil Pixels Failed */ - 27, /* A39: Post PS Z buffer Pixels Failed */ - 28, /* A40: 3D/GPGPU Render Target Writes */ - 29, /* A41: Render Engine Busy */ - 30, /* A42: VS bottleneck */ - 31, /* A43: GS bottleneck */ - -1, /* A44: Reserved */ - -1, /* B0 */ - -1, /* B1 */ - -1, /* B2 */ - -1, /* B3 */ - -1, /* B4 */ - -1, /* B5 */ - -1, /* B6 */ - -1, /* B7 */ - -1, /* Reserved */ - -1, /* Reserved */ - -1, /* Reserved */ - -1, /* Reserved */ - -1, /* Reserved */ - -1, /* Reserved */ - -1, /* Reserved */ - -1, /* Reserved */ -}; - -static const struct gl_perf_monitor_counter gen7_statistics_counters[] = { - COUNTER64("IA_VERTICES_COUNT"), - COUNTER64("IA_PRIMITIVES_COUNT"), - COUNTER64("VS_INVOCATION_COUNT"), - COUNTER64("HS_INVOCATION_COUNT"), - COUNTER64("DS_INVOCATION_COUNT"), - COUNTER64("GS_INVOCATION_COUNT"), - COUNTER64("GS_PRIMITIVES_COUNT"), - COUNTER64("CL_INVOCATION_COUNT"), - COUNTER64("CL_PRIMITIVES_COUNT"), - COUNTER64("PS_INVOCATION_COUNT"), - COUNTER64("PS_DEPTH_COUNT"), - COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 0)"), - COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 1)"), - COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 2)"), - COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 3)"), - COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 0)"), - COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 1)"), - COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 2)"), - COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 3)"), -}; - -/** MMIO register addresses for each pipeline statistics counter. 
*/ -static const int gen7_statistics_register_addresses[] = { - IA_VERTICES_COUNT, - IA_PRIMITIVES_COUNT, - VS_INVOCATION_COUNT, - HS_INVOCATION_COUNT, - DS_INVOCATION_COUNT, - GS_INVOCATION_COUNT, - GS_PRIMITIVES_COUNT, - CL_INVOCATION_COUNT, - CL_PRIMITIVES_COUNT, - PS_INVOCATION_COUNT, - PS_DEPTH_COUNT, - GEN7_SO_NUM_PRIMS_WRITTEN(0), - GEN7_SO_NUM_PRIMS_WRITTEN(1), - GEN7_SO_NUM_PRIMS_WRITTEN(2), - GEN7_SO_NUM_PRIMS_WRITTEN(3), - GEN7_SO_PRIM_STORAGE_NEEDED(0), - GEN7_SO_PRIM_STORAGE_NEEDED(1), - GEN7_SO_PRIM_STORAGE_NEEDED(2), - GEN7_SO_PRIM_STORAGE_NEEDED(3), -}; - -static const struct gl_perf_monitor_group gen7_groups[] = { - GROUP("Observability Architecture Counters", INT_MAX, gen7_raw_oa_counters), - GROUP("Pipeline Statistics Registers", INT_MAX, gen7_statistics_counters), -}; -/** @} */ - -/******************************************************************************/ - -static GLboolean brw_is_perf_monitor_result_available(struct gl_context *, struct gl_perf_monitor_object *); - -static void -dump_perf_monitor_callback(GLuint name, void *monitor_void, void *brw_void) -{ - struct brw_context *brw = brw_void; - struct gl_context *ctx = brw_void; - struct gl_perf_monitor_object *m = monitor_void; - struct brw_perf_monitor_object *monitor = monitor_void; - - const char *resolved = ""; - for (int i = 0; i < brw->perfmon.unresolved_elements; i++) { - if (brw->perfmon.unresolved[i] == monitor) { - resolved = "Unresolved"; - break; - } - } - - DBG("%4d %-7s %-6s %-10s %-11s <%3d, %3d, %3d> %-6s %-9s\n", - name, - m->Active ? "Active" : "", - m->Ended ? "Ended" : "", - resolved, - brw_is_perf_monitor_result_available(ctx, m) ? "Available" : "", - monitor->oa_head_end, - monitor->oa_middle_start, - monitor->oa_tail_start, - monitor->oa_bo ? "OA BO" : "", - monitor->pipeline_stats_bo ? "Stats BO" : ""); -} - -void -brw_dump_perf_monitors(struct brw_context *brw) -{ - struct gl_context *ctx = &brw->ctx; - DBG("Monitors: (OA users = %d)\n", brw->perfmon.oa_users); - _mesa_HashWalk(ctx->PerfMonitor.Monitors, dump_perf_monitor_callback, brw); -} - -/******************************************************************************/ - -static bool -monitor_needs_statistics_registers(struct brw_context *brw, - struct gl_perf_monitor_object *m) -{ - return brw->gen >= 6 && m->ActiveGroups[PIPELINE_STATS_COUNTERS]; -} - -/** - * Take a snapshot of any monitored pipeline statistics counters. - */ -static void -snapshot_statistics_registers(struct brw_context *brw, - struct brw_perf_monitor_object *monitor, - uint32_t offset) -{ - struct gl_context *ctx = &brw->ctx; - const int group = PIPELINE_STATS_COUNTERS; - const int num_counters = ctx->PerfMonitor.Groups[group].NumCounters; - - brw_emit_mi_flush(brw); - - for (int i = 0; i < num_counters; i++) { - if (BITSET_TEST(monitor->base.ActiveCounters[group], i)) { - assert(ctx->PerfMonitor.Groups[group].Counters[i].Type == - GL_UNSIGNED_INT64_AMD); - - brw_store_register_mem64(brw, monitor->pipeline_stats_bo, - brw->perfmon.statistics_registers[i], - offset + i * sizeof(uint64_t)); - } - } -} - -/** - * Gather results from pipeline_stats_bo, storing the final values. - * - * This allows us to free pipeline_stats_bo (which is 4K) in favor of a much - * smaller array of final results. 
- */ -static void -gather_statistics_results(struct brw_context *brw, - struct brw_perf_monitor_object *monitor) -{ - struct gl_context *ctx = &brw->ctx; - const int num_counters = - ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters; - - monitor->pipeline_stats_results = calloc(num_counters, sizeof(uint64_t)); - if (monitor->pipeline_stats_results == NULL) { - _mesa_error_no_memory(__func__); - return; - } - - drm_intel_bo_map(monitor->pipeline_stats_bo, false); - uint64_t *start = monitor->pipeline_stats_bo->virtual; - uint64_t *end = start + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint64_t)); - - for (int i = 0; i < num_counters; i++) { - monitor->pipeline_stats_results[i] = end[i] - start[i]; - } - drm_intel_bo_unmap(monitor->pipeline_stats_bo); - drm_intel_bo_unreference(monitor->pipeline_stats_bo); - monitor->pipeline_stats_bo = NULL; -} - -/******************************************************************************/ - -static bool -monitor_needs_oa(struct brw_context *brw, - struct gl_perf_monitor_object *m) -{ - return m->ActiveGroups[OA_COUNTERS]; -} - -/** - * Enable the Observability Architecture counters by whacking OACONTROL. - */ -static void -start_oa_counters(struct brw_context *brw) -{ - unsigned counter_format; - - /* Pick the counter format which gives us all the counters. */ - switch (brw->gen) { - case 5: - return; /* Ironlake counters are always running. */ - case 6: - counter_format = 0b001; - break; - case 7: - counter_format = 0b101; - break; - default: - unreachable("Tried to enable OA counters on an unsupported generation."); - } - - BEGIN_BATCH(3); - OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); - OUT_BATCH(OACONTROL); - OUT_BATCH(counter_format << OACONTROL_COUNTER_SELECT_SHIFT | - OACONTROL_ENABLE_COUNTERS); - ADVANCE_BATCH(); -} - -/** - * Disable OA counters. - */ -static void -stop_oa_counters(struct brw_context *brw) -{ - /* Ironlake counters never stop. */ - if (brw->gen == 5) - return; - - BEGIN_BATCH(3); - OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); - OUT_BATCH(OACONTROL); - OUT_BATCH(0); - ADVANCE_BATCH(); -} - -/** - * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot, - * including the required PIPE_CONTROL flushes. - * - * Sandybridge is the worst case scenario: brw_emit_mi_flush expands to four - * PIPE_CONTROLs which are 5 DWords each. We have to flush before and after - * MI_REPORT_PERF_COUNT, so multiply by two. Finally, add the 3 DWords for - * MI_REPORT_PERF_COUNT itself. - */ -#define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (4 * 5) + 3) - -/** - * Emit an MI_REPORT_PERF_COUNT command packet. - * - * This writes the current OA counter values to buffer. - */ -static void -emit_mi_report_perf_count(struct brw_context *brw, - drm_intel_bo *bo, - uint32_t offset_in_bytes, - uint32_t report_id) -{ - assert(offset_in_bytes % 64 == 0); - - /* Make sure the commands to take a snapshot fits in a single batch. */ - intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4, - RENDER_RING); - int batch_used = USED_BATCH(brw->batch); - - /* Reports apparently don't always get written unless we flush first. */ - brw_emit_mi_flush(brw); - - if (brw->gen == 5) { - /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all - * the counters. The report ID is ignored in the second set. 
- */ - BEGIN_BATCH(6); - OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_0); - OUT_RELOC(bo, - I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - offset_in_bytes); - OUT_BATCH(report_id); - - OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_1); - OUT_RELOC(bo, - I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - offset_in_bytes + 64); - OUT_BATCH(report_id); - ADVANCE_BATCH(); - } else if (brw->gen == 6) { - BEGIN_BATCH(3); - OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT); - OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - offset_in_bytes | MI_COUNTER_ADDRESS_GTT); - OUT_BATCH(report_id); - ADVANCE_BATCH(); - } else if (brw->gen == 7) { - BEGIN_BATCH(3); - OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT); - OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - offset_in_bytes); - OUT_BATCH(report_id); - ADVANCE_BATCH(); - } else { - unreachable("Unsupported generation for performance counters."); - } - - /* Reports apparently don't always get written unless we flush after. */ - brw_emit_mi_flush(brw); - - (void) batch_used; - assert(USED_BATCH(brw->batch) - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4); -} - -/** - * Add a monitor to the global list of "unresolved monitors." - * - * Monitors are "unresolved" if they refer to OA counter snapshots in - * bookend_bo. Results (even partial ones) must be gathered for all - * unresolved monitors before it's safe to discard bookend_bo. - */ -static void -add_to_unresolved_monitor_list(struct brw_context *brw, - struct brw_perf_monitor_object *monitor) -{ - if (brw->perfmon.unresolved_elements >= - brw->perfmon.unresolved_array_size) { - brw->perfmon.unresolved_array_size *= 2; - brw->perfmon.unresolved = reralloc(brw, brw->perfmon.unresolved, - struct brw_perf_monitor_object *, - brw->perfmon.unresolved_array_size); - } - - brw->perfmon.unresolved[brw->perfmon.unresolved_elements++] = monitor; -} - -/** - * If possible, throw away the contents of bookend BO. - * - * When all monitoring stops, and no monitors need data from bookend_bo to - * compute results, we can discard it and start writing snapshots at the - * beginning again. This helps reduce the amount of buffer wraparound. - */ -static void -clean_bookend_bo(struct brw_context *brw) -{ - if (brw->perfmon.unresolved_elements == 0) { - DBG("***Resetting bookend snapshots to 0\n"); - brw->perfmon.bookend_snapshots = 0; - } -} - -/** - * Remove a monitor from the global list of "unresolved monitors." - * - * This can happen when: - * - We finish computing a completed monitor's results. - * - We discard unwanted monitor results. - * - A monitor's results can be computed without relying on bookend_bo. - */ -static void -drop_from_unresolved_monitor_list(struct brw_context *brw, - struct brw_perf_monitor_object *monitor) -{ - for (int i = 0; i < brw->perfmon.unresolved_elements; i++) { - if (brw->perfmon.unresolved[i] == monitor) { - int last_elt = --brw->perfmon.unresolved_elements; - - if (i == last_elt) { - brw->perfmon.unresolved[i] = NULL; - } else { - brw->perfmon.unresolved[i] = brw->perfmon.unresolved[last_elt]; - } - - clean_bookend_bo(brw); - return; - } - } -} - -/** - * Given pointers to starting and ending OA snapshots, add the deltas for each - * counter to the results. - */ -static void -add_deltas(struct brw_context *brw, - struct brw_perf_monitor_object *monitor, - uint32_t *start, uint32_t *end) -{ - /* Look for expected report ID values to ensure data is present. 
*/ - assert(start[0] == REPORT_ID); - assert(end[0] == REPORT_ID); - - /* Subtract each counter's ending and starting values, then add the - * difference to the counter's value so far. - */ - for (int i = 3; i < brw->perfmon.entries_per_oa_snapshot; i++) { - /* When debugging, it's useful to note when the ending value is less than - * the starting value; aggregating counters should always increase in - * value (or remain unchanged). This happens periodically due to - * wraparound, but can also indicate serious problems. - */ -#ifdef DEBUG - if (end[i] < start[i]) { - int counter = brw->perfmon.oa_snapshot_layout[i]; - if (counter >= 0) { - DBG("WARNING: \"%s\" ending value was less than the starting " - "value: %u < %u (end - start = %u)\n", - brw->ctx.PerfMonitor.Groups[0].Counters[counter].Name, - end[i], start[i], end[i] - start[i]); - } - } -#endif - monitor->oa_results[i] += end[i] - start[i]; - } -} - -/** - * Gather OA counter results (partial or full) from a series of snapshots. - * - * Monitoring can start or stop at any time, likely at some point mid-batch. - * We write snapshots for both events, storing them in monitor->oa_bo. - * - * Ideally, we would simply subtract those two snapshots to obtain the final - * counter results. Unfortunately, our hardware doesn't preserve their values - * across context switches or GPU sleep states. In order to support multiple - * concurrent OA clients, as well as reliable data across power management, - * we have to take snapshots at the start and end of batches as well. - * - * This results in a three-part sequence of (start, end) intervals: - * - The "head" is from the BeginPerfMonitor snapshot to the end of the first - * batchbuffer. - * - The "middle" is a series of (batch start, batch end) snapshots which - * bookend any batchbuffers between the ones which start/end monitoring. - * - The "tail" is from the start of the last batch where monitoring was - * active to the EndPerfMonitor snapshot. - * - * Due to wrapping in the bookend BO, we may have to accumulate partial results. - * If so, we handle the "head" and any "middle" results so far. When monitoring - * eventually ends, we handle additional "middle" batches and the "tail." - */ -static void -gather_oa_results(struct brw_context *brw, - struct brw_perf_monitor_object *monitor, - uint32_t *bookend_buffer) -{ - struct gl_perf_monitor_object *m = &monitor->base; - assert(monitor->oa_bo != NULL); - - drm_intel_bo_map(monitor->oa_bo, false); - uint32_t *monitor_buffer = monitor->oa_bo->virtual; - - /* If monitoring was entirely contained within a single batch, then the - * bookend BO is irrelevant. Just subtract monitor->bo's two snapshots. - */ - if (monitor->oa_middle_start == -1) { - add_deltas(brw, monitor, - monitor_buffer, - monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / - sizeof(uint32_t))); - drm_intel_bo_unmap(monitor->oa_bo); - return; - } - - const ptrdiff_t snapshot_size = brw->perfmon.entries_per_oa_snapshot; - - /* First, add the contributions from the "head" interval: - * (snapshot taken at BeginPerfMonitor time, - * snapshot taken at the end of the first batch after monitoring began) - */ - if (monitor->oa_head_end != -1) { - assert(monitor->oa_head_end < brw->perfmon.bookend_snapshots); - add_deltas(brw, monitor, - monitor_buffer, - bookend_buffer + snapshot_size * monitor->oa_head_end); - - /* Make sure we don't count the "head" again in the future. */ - monitor->oa_head_end = -1; - } - - /* Next, count the contributions from the "middle" batches. 
These are - * (batch begin, batch end) deltas while monitoring was active. - */ - int last_snapshot; - if (m->Ended) - last_snapshot = monitor->oa_tail_start; - else - last_snapshot = brw->perfmon.bookend_snapshots; - - for (int s = monitor->oa_middle_start; s < last_snapshot; s += 2) { - add_deltas(brw, monitor, - bookend_buffer + snapshot_size * s, - bookend_buffer + snapshot_size * (s + 1)); - } - - /* Finally, if the monitor has ended, we need to count the contributions of - * the "tail" interval: - * (start of the batch where monitoring ended, EndPerfMonitor snapshot) - */ - if (m->Ended) { - assert(monitor->oa_tail_start != -1); - add_deltas(brw, monitor, - bookend_buffer + snapshot_size * monitor->oa_tail_start, - monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / - sizeof(uint32_t))); - } - - drm_intel_bo_unmap(monitor->oa_bo); - - /* If the monitor has ended, then we've gathered all the results, and - * can free the monitor's OA BO. - */ - if (m->Ended) { - drm_intel_bo_unreference(monitor->oa_bo); - monitor->oa_bo = NULL; - - /* The monitor's OA result is now resolved. */ - DBG("Marking %d resolved - results gathered\n", m->Name); - drop_from_unresolved_monitor_list(brw, monitor); - } -} - -/** - * Handle running out of space in the bookend BO. - * - * When we run out of space in the bookend BO, we need to gather up partial - * results for every unresolved monitor. This allows us to free the snapshot - * data in bookend_bo, freeing up the space for reuse. We call this "wrapping." - * - * This will completely compute the result for any unresolved monitors that - * have ended. - */ -static void -wrap_bookend_bo(struct brw_context *brw) -{ - DBG("****Wrap bookend BO****\n"); - /* Note that wrapping will only occur at the start of a batch, since that's - * where we reserve space. So the current batch won't reference bookend_bo - * or any monitor BOs. This means we don't need to worry about - * synchronization. - * - * Also, EndPerfMonitor guarantees that only monitors which span multiple - * batches exist in the unresolved monitor list. - */ - assert(brw->perfmon.oa_users > 0); - - drm_intel_bo_map(brw->perfmon.bookend_bo, false); - uint32_t *bookend_buffer = brw->perfmon.bookend_bo->virtual; - for (int i = 0; i < brw->perfmon.unresolved_elements; i++) { - struct brw_perf_monitor_object *monitor = brw->perfmon.unresolved[i]; - struct gl_perf_monitor_object *m = &monitor->base; - - gather_oa_results(brw, monitor, bookend_buffer); - - if (m->Ended) { - /* gather_oa_results() dropped the monitor from the unresolved list, - * throwing our indices off by one. - */ - --i; - } else { - /* When we create the new bookend_bo, snapshot #0 will be the - * beginning of another "middle" BO. - */ - monitor->oa_middle_start = 0; - assert(monitor->oa_head_end == -1); - assert(monitor->oa_tail_start == -1); - } - } - drm_intel_bo_unmap(brw->perfmon.bookend_bo); - - brw->perfmon.bookend_snapshots = 0; -} - -/* This is fairly arbitrary; the trade off is memory usage vs. extra overhead - * from wrapping. On Gen7, 32768 should be enough for 128 snapshots before - * wrapping (since each is 256 bytes). - */ -#define BOOKEND_BO_SIZE_BYTES 32768 - -/** - * Check whether bookend_bo has space for a given number of snapshots. - */ -static bool -has_space_for_bookend_snapshots(struct brw_context *brw, int snapshots) -{ - int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t); - - /* There are brw->perfmon.bookend_snapshots - 1 existing snapshots. 
*/ - int total_snapshots = (brw->perfmon.bookend_snapshots - 1) + snapshots; - - return total_snapshots * snapshot_bytes < BOOKEND_BO_SIZE_BYTES; -} - -/** - * Write an OA counter snapshot to bookend_bo. - */ -static void -emit_bookend_snapshot(struct brw_context *brw) -{ - int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t); - int offset_in_bytes = brw->perfmon.bookend_snapshots * snapshot_bytes; - - emit_mi_report_perf_count(brw, brw->perfmon.bookend_bo, offset_in_bytes, - REPORT_ID); - ++brw->perfmon.bookend_snapshots; -} - -/******************************************************************************/ - -/** - * Initialize a monitor to sane starting state; throw away old buffers. - */ -static void -reinitialize_perf_monitor(struct brw_context *brw, - struct brw_perf_monitor_object *monitor) -{ - if (monitor->oa_bo) { - drm_intel_bo_unreference(monitor->oa_bo); - monitor->oa_bo = NULL; - } - - /* Since the results are now invalid, we don't need to hold on to any - * snapshots in bookend_bo. The monitor is effectively "resolved." - */ - drop_from_unresolved_monitor_list(brw, monitor); - - monitor->oa_head_end = -1; - monitor->oa_middle_start = -1; - monitor->oa_tail_start = -1; - - free(monitor->oa_results); - monitor->oa_results = NULL; - - if (monitor->pipeline_stats_bo) { - drm_intel_bo_unreference(monitor->pipeline_stats_bo); - monitor->pipeline_stats_bo = NULL; - } - - free(monitor->pipeline_stats_results); - monitor->pipeline_stats_results = NULL; -} - -/** - * Driver hook for glBeginPerformanceMonitorAMD(). - */ -static GLboolean -brw_begin_perf_monitor(struct gl_context *ctx, - struct gl_perf_monitor_object *m) -{ - struct brw_context *brw = brw_context(ctx); - struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); - - DBG("Begin(%d)\n", m->Name); - - reinitialize_perf_monitor(brw, monitor); - - if (monitor_needs_oa(brw, m)) { - /* If the global OA bookend BO doesn't exist, allocate it. This should - * only happen once, but we delay until BeginPerfMonitor time to avoid - * wasting memory for contexts that don't use performance monitors. - */ - if (!brw->perfmon.bookend_bo) { - brw->perfmon.bookend_bo = drm_intel_bo_alloc(brw->bufmgr, - "OA bookend BO", - BOOKEND_BO_SIZE_BYTES, 64); - } - - monitor->oa_bo = - drm_intel_bo_alloc(brw->bufmgr, "perf. monitor OA bo", 4096, 64); -#ifdef DEBUG - /* Pre-filling the BO helps debug whether writes landed. */ - drm_intel_bo_map(monitor->oa_bo, true); - memset((char *) monitor->oa_bo->virtual, 0xff, 4096); - drm_intel_bo_unmap(monitor->oa_bo); -#endif - - /* Allocate storage for accumulated OA counter values. */ - monitor->oa_results = - calloc(brw->perfmon.entries_per_oa_snapshot, sizeof(uint32_t)); - - /* If the OA counters aren't already on, enable them. */ - if (brw->perfmon.oa_users == 0) { - /* Ensure the OACONTROL enable and snapshot land in the same batch. */ - int space = (MI_REPORT_PERF_COUNT_BATCH_DWORDS + 3) * 4; - intel_batchbuffer_require_space(brw, space, RENDER_RING); - start_oa_counters(brw); - } - - /* Take a starting OA counter snapshot. */ - emit_mi_report_perf_count(brw, monitor->oa_bo, 0, REPORT_ID); - - monitor->oa_head_end = brw->perfmon.bookend_snapshots; - monitor->oa_middle_start = brw->perfmon.bookend_snapshots + 1; - monitor->oa_tail_start = -1; - - /* Add the monitor to the unresolved list. 
*/ - add_to_unresolved_monitor_list(brw, monitor); - - ++brw->perfmon.oa_users; - } - - if (monitor_needs_statistics_registers(brw, m)) { - monitor->pipeline_stats_bo = - drm_intel_bo_alloc(brw->bufmgr, "perf. monitor stats bo", 4096, 64); - - /* Take starting snapshots. */ - snapshot_statistics_registers(brw, monitor, 0); - } - - return true; -} - -/** - * Driver hook for glEndPerformanceMonitorAMD(). - */ -static void -brw_end_perf_monitor(struct gl_context *ctx, - struct gl_perf_monitor_object *m) -{ - struct brw_context *brw = brw_context(ctx); - struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); - - DBG("End(%d)\n", m->Name); - - if (monitor_needs_oa(brw, m)) { - /* Take an ending OA counter snapshot. */ - emit_mi_report_perf_count(brw, monitor->oa_bo, - SECOND_SNAPSHOT_OFFSET_IN_BYTES, REPORT_ID); - - --brw->perfmon.oa_users; - - if (brw->perfmon.oa_users == 0) - stop_oa_counters(brw); - - if (monitor->oa_head_end == brw->perfmon.bookend_snapshots) { - assert(monitor->oa_head_end != -1); - /* We never actually wrote the snapshot for the end of the first batch - * after BeginPerfMonitor. This means that monitoring was contained - * entirely within a single batch, so we can ignore bookend_bo and - * just compare the monitor's begin/end snapshots directly. - */ - monitor->oa_head_end = -1; - monitor->oa_middle_start = -1; - monitor->oa_tail_start = -1; - - /* We can also mark it resolved since it won't depend on bookend_bo. */ - DBG("Marking %d resolved - entirely in one batch\n", m->Name); - drop_from_unresolved_monitor_list(brw, monitor); - } else { - /* We've written at least one batch end snapshot, so the monitoring - * spanned multiple batches. Mark which snapshot corresponds to the - * start of the current batch. - */ - monitor->oa_tail_start = brw->perfmon.bookend_snapshots - 1; - } - } - - if (monitor_needs_statistics_registers(brw, m)) { - /* Take ending snapshots. */ - snapshot_statistics_registers(brw, monitor, - SECOND_SNAPSHOT_OFFSET_IN_BYTES); - } -} - -/** - * Reset a performance monitor, throwing away any results. - */ -static void -brw_reset_perf_monitor(struct gl_context *ctx, - struct gl_perf_monitor_object *m) -{ - struct brw_context *brw = brw_context(ctx); - struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); - - reinitialize_perf_monitor(brw, monitor); - - if (m->Active) { - brw_begin_perf_monitor(ctx, m); - } -} - -/** - * Is a performance monitor result available? - */ -static GLboolean -brw_is_perf_monitor_result_available(struct gl_context *ctx, - struct gl_perf_monitor_object *m) -{ - struct brw_context *brw = brw_context(ctx); - struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); - - bool oa_available = true; - bool stats_available = true; - - if (monitor_needs_oa(brw, m)) { - oa_available = !monitor->oa_bo || - (!drm_intel_bo_references(brw->batch.bo, monitor->oa_bo) && - !drm_intel_bo_busy(monitor->oa_bo)); - } - - if (monitor_needs_statistics_registers(brw, m)) { - stats_available = !monitor->pipeline_stats_bo || - (!drm_intel_bo_references(brw->batch.bo, monitor->pipeline_stats_bo) && - !drm_intel_bo_busy(monitor->pipeline_stats_bo)); - } - - return oa_available && stats_available; -} - -/** - * Get the performance monitor result. 
- */ -static void -brw_get_perf_monitor_result(struct gl_context *ctx, - struct gl_perf_monitor_object *m, - GLsizei data_size, - GLuint *data, - GLint *bytes_written) -{ - struct brw_context *brw = brw_context(ctx); - struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); - const GLuint *const data_end = (GLuint *)((uint8_t *) data + data_size); - - DBG("GetResult(%d)\n", m->Name); - brw_dump_perf_monitors(brw); - - /* This hook should only be called when results are available. */ - assert(m->Ended); - - /* Copy data to the supplied array (data). - * - * The output data format is: <group ID, counter ID, value> for each - * active counter. The API allows counters to appear in any order. - */ - GLsizei offset = 0; - - if (monitor_needs_oa(brw, m)) { - /* Gather up the results from the BO, unless we already did due to the - * bookend BO wrapping. - */ - if (monitor->oa_bo) { - /* Since the result is available, all the necessary snapshots will - * have been written to the bookend BO. If other monitors are - * active, the bookend BO may be busy or referenced by the current - * batch, but only for writing snapshots beyond oa_tail_start, - * which we don't care about. - * - * Using an unsynchronized mapping avoids stalling for an - * indeterminate amount of time. - */ - drm_intel_gem_bo_map_unsynchronized(brw->perfmon.bookend_bo); - - gather_oa_results(brw, monitor, brw->perfmon.bookend_bo->virtual); - - drm_intel_bo_unmap(brw->perfmon.bookend_bo); - } - - for (int i = 0; i < brw->perfmon.entries_per_oa_snapshot; i++) { - int group = OA_COUNTERS; - int counter = brw->perfmon.oa_snapshot_layout[i]; - - /* We always capture all the OA counters, but the application may - * have only asked for a subset. Skip unwanted counters. - */ - if (counter < 0 || !BITSET_TEST(m->ActiveCounters[group], counter)) - continue; - - if (data + offset + 3 <= data_end) { - data[offset++] = group; - data[offset++] = counter; - data[offset++] = monitor->oa_results[i]; - } - } - - clean_bookend_bo(brw); - } - - if (monitor_needs_statistics_registers(brw, m)) { - const int num_counters = - ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters; - - if (!monitor->pipeline_stats_results) { - gather_statistics_results(brw, monitor); - - /* Check if we did really get the results */ - if (!monitor->pipeline_stats_results) { - if (bytes_written) { - *bytes_written = 0; - } - return; - } - } - - for (int i = 0; i < num_counters; i++) { - if (BITSET_TEST(m->ActiveCounters[PIPELINE_STATS_COUNTERS], i)) { - if (data + offset + 4 <= data_end) { - data[offset++] = PIPELINE_STATS_COUNTERS; - data[offset++] = i; - *((uint64_t *) (&data[offset])) = monitor->pipeline_stats_results[i]; - offset += 2; - } - } - } - } - - if (bytes_written) - *bytes_written = offset * sizeof(uint32_t); -} - -/** - * Create a new performance monitor object. - */ -static struct gl_perf_monitor_object * -brw_new_perf_monitor(struct gl_context *ctx) -{ - (void) ctx; - return calloc(1, sizeof(struct brw_perf_monitor_object)); -} - -/** - * Delete a performance monitor object. - */ -static void -brw_delete_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m) -{ - struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); - DBG("Delete(%d)\n", m->Name); - reinitialize_perf_monitor(brw_context(ctx), monitor); - free(monitor); -} - -/******************************************************************************/ - -/** - * Called at the start of every render ring batch. 
- * - * Enable OA counters and emit the "start of batchbuffer" bookend OA snapshot. - * Since it's a new batch, there will be plenty of space for the commands. - */ -void -brw_perf_monitor_new_batch(struct brw_context *brw) -{ - assert(brw->batch.ring == RENDER_RING); - assert(brw->gen < 6 || USED_BATCH(brw->batch) == 0); - - if (brw->perfmon.oa_users == 0) - return; - - start_oa_counters(brw); - - /* Make sure bookend_bo has enough space for a pair of snapshots. - * If not, "wrap" the BO: gather up any results so far, and start from - * the beginning of the buffer. Reserving a pair guarantees that wrapping - * will only happen at the beginning of a batch, where it's safe to map BOs - * (as the batch is empty and can't refer to any of them yet). - */ - if (!has_space_for_bookend_snapshots(brw, 2)) - wrap_bookend_bo(brw); - - DBG("Bookend Begin Snapshot (%d)\n", brw->perfmon.bookend_snapshots); - emit_bookend_snapshot(brw); -} - -/** - * Called at the end of every render ring batch. - * - * Emit the "end of batchbuffer" bookend OA snapshot and disable the counters. - * - * This relies on there being enough space in BATCH_RESERVED. - */ -void -brw_perf_monitor_finish_batch(struct brw_context *brw) -{ - assert(brw->batch.ring == RENDER_RING); - - if (brw->perfmon.oa_users == 0) - return; - - DBG("Bookend End Snapshot (%d)\n", brw->perfmon.bookend_snapshots); - - /* Not safe to wrap; should've reserved space already. */ - assert(has_space_for_bookend_snapshots(brw, 1)); - - emit_bookend_snapshot(brw); - - stop_oa_counters(brw); -} - -/******************************************************************************/ - -void -brw_init_performance_monitors(struct brw_context *brw) -{ - struct gl_context *ctx = &brw->ctx; - - ctx->Driver.NewPerfMonitor = brw_new_perf_monitor; - ctx->Driver.DeletePerfMonitor = brw_delete_perf_monitor; - ctx->Driver.BeginPerfMonitor = brw_begin_perf_monitor; - ctx->Driver.EndPerfMonitor = brw_end_perf_monitor; - ctx->Driver.ResetPerfMonitor = brw_reset_perf_monitor; - ctx->Driver.IsPerfMonitorResultAvailable = brw_is_perf_monitor_result_available; - ctx->Driver.GetPerfMonitorResult = brw_get_perf_monitor_result; - - if (brw->gen == 5) { - ctx->PerfMonitor.Groups = gen5_groups; - ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen5_groups); - brw->perfmon.oa_snapshot_layout = gen5_oa_snapshot_layout; - brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen5_oa_snapshot_layout); - } else if (brw->gen == 6) { - ctx->PerfMonitor.Groups = gen6_groups; - ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen6_groups); - brw->perfmon.oa_snapshot_layout = gen6_oa_snapshot_layout; - brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen6_oa_snapshot_layout); - brw->perfmon.statistics_registers = gen6_statistics_register_addresses; - } else if (brw->gen == 7) { - ctx->PerfMonitor.Groups = gen7_groups; - ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen7_groups); - brw->perfmon.oa_snapshot_layout = gen7_oa_snapshot_layout; - brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen7_oa_snapshot_layout); - brw->perfmon.statistics_registers = gen7_statistics_register_addresses; - } - - brw->perfmon.unresolved = - ralloc_array(brw, struct brw_perf_monitor_object *, 1); - brw->perfmon.unresolved_elements = 0; - brw->perfmon.unresolved_array_size = 1; -} diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c index f136fae6cc3..65c27731cd2 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c @@ -175,8 
+175,7 @@ do_batch_dump(struct brw_context *brw) void intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw) { - /* We may need to enable and snapshot OA counters. */ - brw_perf_monitor_new_batch(brw); + /* Un-used currently */ } /** @@ -211,9 +210,6 @@ brw_new_batch(struct brw_context *brw) */ if (INTEL_DEBUG & DEBUG_SHADER_TIME) brw_collect_and_report_shader_time(brw); - - if (INTEL_DEBUG & DEBUG_PERFMON) - brw_dump_perf_monitors(brw); } /** @@ -241,9 +237,6 @@ brw_finish_batch(struct brw_context *brw) if (brw->gen >= 7) gen7_restore_default_l3_config(brw); - /* We may also need to snapshot and disable OA counters. */ - brw_perf_monitor_finish_batch(brw); - if (brw->is_haswell) { /* From the Haswell PRM, Volume 2b, Command Reference: Instructions, * 3DSTATE_CC_STATE_POINTERS > "Note": diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index bbb794240c0..1ecefc19449 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -100,71 +100,6 @@ can_do_pipelined_register_writes(struct brw_context *brw) return success; } -static bool -can_write_oacontrol(struct brw_context *brw) -{ - if (brw->gen < 6 || brw->gen >= 8) - return false; - - static int result = -1; - if (result != -1) - return result; - - /* Set "Select Context ID" to a particular address (which is likely not a - * context), but leave all counting disabled. This should be harmless. - */ - const int expected_value = 0x31337000; - const int offset = 110; - - uint32_t *data; - /* Set a value in a BO to a known quantity. The workaround BO already - * exists and doesn't contain anything important, so we may as well use it. - */ - drm_intel_bo_map(brw->workaround_bo, true); - data = brw->workaround_bo->virtual; - data[offset] = 0xffffffff; - drm_intel_bo_unmap(brw->workaround_bo); - - /* Write OACONTROL. */ - BEGIN_BATCH(3); - OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); - OUT_BATCH(OACONTROL); - OUT_BATCH(expected_value); - ADVANCE_BATCH(); - - brw_emit_mi_flush(brw); - - /* Save the register's value back to the buffer. */ - BEGIN_BATCH(3); - OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2)); - OUT_BATCH(OACONTROL); - OUT_RELOC(brw->workaround_bo, - I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - offset * sizeof(uint32_t)); - ADVANCE_BATCH(); - - brw_emit_mi_flush(brw); - - /* Set OACONTROL back to zero (everything off). */ - BEGIN_BATCH(3); - OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); - OUT_BATCH(OACONTROL); - OUT_BATCH(0); - ADVANCE_BATCH(); - - intel_batchbuffer_flush(brw); - - /* Check whether the value got written. */ - drm_intel_bo_map(brw->workaround_bo, false); - data = brw->workaround_bo->virtual; - bool success = data[offset] == expected_value; - drm_intel_bo_unmap(brw->workaround_bo); - - result = success; - - return success; -} - /** * Initializes potential list of extensions if ctx == NULL, or actually enables * extensions for a context. @@ -290,11 +225,6 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_texture_query_levels = ctx->Const.GLSLVersion >= 130; ctx->Extensions.ARB_texture_query_lod = true; ctx->Extensions.EXT_timer_query = true; - - if (brw->gen == 5 || can_write_oacontrol(brw)) { - ctx->Extensions.AMD_performance_monitor = true; - ctx->Extensions.INTEL_performance_query = true; - } } if (brw->gen >= 6) { |
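The deleted brw_get_perf_monitor_result packed results as <group ID, counter ID, value> records: one 32-bit word of value for GL_UNSIGNED_INT counters (the raw OA group) and two words for GL_UNSIGNED_INT64_AMD (the pipeline statistics group). A matching client-side unpacking might have looked like the hedged sketch below; it assumes the monitor from the earlier sketch has ended, and handles only the two value types this driver produced.

#define GL_GLEXT_PROTOTYPES
#include <GL/gl.h>
#include <GL/glext.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Walk the packed result stream once the monitor's result is available.
 * The value width follows the counter type, matching the packing in the
 * deleted brw_get_perf_monitor_result(): one dword for GL_UNSIGNED_INT
 * (raw OA counters), two dwords for GL_UNSIGNED_INT64_AMD (pipeline
 * statistics).
 */
static void read_monitor(GLuint monitor)
{
   GLuint avail = 0;
   glGetPerfMonitorCounterDataAMD(monitor, GL_PERFMON_RESULT_AVAILABLE_AMD,
                                  sizeof(avail), &avail, NULL);
   if (!avail)
      return;

   GLuint size = 0;
   glGetPerfMonitorCounterDataAMD(monitor, GL_PERFMON_RESULT_SIZE_AMD,
                                  sizeof(size), &size, NULL);

   GLuint *data = malloc(size);
   GLint written = 0;
   glGetPerfMonitorCounterDataAMD(monitor, GL_PERFMON_RESULT_AMD,
                                  size, data, &written);

   for (GLint i = 0; i < written / (GLint) sizeof(GLuint);) {
      GLuint group = data[i++];
      GLuint counter = data[i++];
      GLenum type = 0;

      glGetPerfMonitorCounterInfoAMD(group, counter, GL_COUNTER_TYPE_AMD,
                                     &type);
      if (type == GL_UNSIGNED_INT64_AMD) {
         uint64_t value;
         memcpy(&value, &data[i], sizeof(value));
         i += 2;
         printf("group %u, counter %u: %" PRIu64 "\n", group, counter, value);
      } else {                            /* GL_UNSIGNED_INT */
         printf("group %u, counter %u: %u\n", group, counter, data[i++]);
      }
   }
   free(data);
}

The GL_PERFMON_RESULT_AVAILABLE_AMD query above maps onto the deleted brw_is_perf_monitor_result_available(), which reported true only once neither the monitor's OA BO nor its pipeline-stats BO was still busy or referenced by the current batch.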