diff options
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_context.h | 9 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_performance_monitor.c | 297 |
2 files changed, 306 insertions, 0 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index a2720d22c43..43d0bbf7a5e 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1402,6 +1402,15 @@ struct brw_context struct { /** A map from pipeline statistics counter IDs to MMIO addresses. */ const int *statistics_registers; + + /** + * Mapping from a uint32_t offset within an OA snapshot to the ID of + * the counter which MI_REPORT_PERF_COUNT stores there. + */ + const int *oa_snapshot_layout; + + /** Number of 32-bit entries in a hardware counter snapshot. */ + int entries_per_oa_snapshot; } perfmon; int num_atoms; diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c index 725e8bb4e1d..850dba78abc 100644 --- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c +++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c @@ -25,6 +25,21 @@ * \file brw_performance_monitor.c * * Implementation of the GL_AMD_performance_monitor extension. + * + * On Gen5+ hardware, we have two sources of performance counter data: + * the Observability Architecture counters (MI_REPORT_PERF_COUNT), and + * the Pipeline Statistics Registers. We expose both sets of raw data, + * as well as some useful processed values. + * + * The Observability Architecture (OA) counters for Gen6+ are documented + * in a separate document from the rest of the PRMs. It is available at: + * https://01.org/linuxgraphics/documentation/driver-documentation-prms + * => 2013 Intel Core Processor Family => Observability Performance Counters + * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell.) + * + * On Ironlake, the OA counters were called "CHAPS" counters. Sadly, no public + * documentation exists; our implementation is based on the source code for the + * intel_perf_counters utility (which is available as part of intel-gpu-tools). */ #include <limits.h> @@ -100,14 +115,88 @@ brw_perf_monitor(struct gl_perf_monitor_object *m) /** Performance Monitor Group IDs */ enum brw_counter_groups { + OA_COUNTERS, /* Observability Architecture (MI_REPORT_PERF_COUNT) Counters */ PIPELINE_STATS_COUNTERS, /* Pipeline Statistics Register Counters */ }; /** * Ironlake: * @{ + * + * The list of CHAPS counters unfortunately does not appear in any public + * documentation, but is available by reading the source code for the + * intel_perf_counters utility (shipped as part of intel-gpu-tools). */ +const static struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = { + COUNTER("cycles the CS unit is starved"), + COUNTER("cycles the CS unit is stalled"), + COUNTER("cycles the VF unit is starved"), + COUNTER("cycles the VF unit is stalled"), + COUNTER("cycles the VS unit is starved"), + COUNTER("cycles the VS unit is stalled"), + COUNTER("cycles the GS unit is starved"), + COUNTER("cycles the GS unit is stalled"), + COUNTER("cycles the CL unit is starved"), + COUNTER("cycles the CL unit is stalled"), + COUNTER("cycles the SF unit is starved"), + COUNTER("cycles the SF unit is stalled"), + COUNTER("cycles the WZ unit is starved"), + COUNTER("cycles the WZ unit is stalled"), + COUNTER("Z buffer read/write"), + COUNTER("cycles each EU was active"), + COUNTER("cycles each EU was suspended"), + COUNTER("cycles threads loaded all EUs"), + COUNTER("cycles filtering active"), + COUNTER("cycles PS threads executed"), + COUNTER("subspans written to RC"), + COUNTER("bytes read for texture reads"), + COUNTER("texels returned from sampler"), + COUNTER("polygons not culled"), + COUNTER("clocks MASF has valid message"), + COUNTER("64b writes/reads from RC"), + COUNTER("reads on dataport"), + COUNTER("clocks MASF has valid msg not consumed by sampler"), + COUNTER("cycles any EU is stalled for math"), +}; + +const static int gen5_oa_snapshot_layout[] = +{ + -1, /* Report ID */ + -1, /* TIMESTAMP (64-bit) */ + -1, /* ...second half... */ + 0, /* cycles the CS unit is starved */ + 1, /* cycles the CS unit is stalled */ + 2, /* cycles the VF unit is starved */ + 3, /* cycles the VF unit is stalled */ + 4, /* cycles the VS unit is starved */ + 5, /* cycles the VS unit is stalled */ + 6, /* cycles the GS unit is starved */ + 7, /* cycles the GS unit is stalled */ + 8, /* cycles the CL unit is starved */ + 9, /* cycles the CL unit is stalled */ + 10, /* cycles the SF unit is starved */ + 11, /* cycles the SF unit is stalled */ + 12, /* cycles the WZ unit is starved */ + 13, /* cycles the WZ unit is stalled */ + 14, /* Z buffer read/write */ + 15, /* cycles each EU was active */ + 16, /* cycles each EU was suspended */ + 17, /* cycles threads loaded all EUs */ + 18, /* cycles filtering active */ + 19, /* cycles PS threads executed */ + 20, /* subspans written to RC */ + 21, /* bytes read for texture reads */ + 22, /* texels returned from sampler */ + 23, /* polygons not culled */ + 24, /* clocks MASF has valid message */ + 25, /* 64b writes/reads from RC */ + 26, /* reads on dataport */ + 27, /* clocks MASF has valid msg not consumed by sampler */ + 28, /* cycles any EU is stalled for math */ +}; + const static struct gl_perf_monitor_group gen5_groups[] = { + [OA_COUNTERS] = GROUP("CHAPS Counters", INT_MAX, gen5_raw_chaps_counters), /* Our pipeline statistics counter handling requires hardware contexts. */ }; /** @} */ @@ -115,7 +204,93 @@ const static struct gl_perf_monitor_group gen5_groups[] = { /** * Sandybridge: * @{ + * + * A few of the counters here (A17-A20) are not included in the latest + * documentation, but are described in the Ironlake PRM (which strangely + * documents Sandybridge's performance counter system, not Ironlake's). + * It's unclear whether they work or not; empirically, they appear to. + */ + +/** + * Aggregating counters A0-A28: + */ +const static struct gl_perf_monitor_counter gen6_raw_oa_counters[] = { + /* A0: 0 */ COUNTER("Aggregated Core Array Active"), + /* A1: 1 */ COUNTER("Aggregated Core Array Stalled"), + /* A2: 2 */ COUNTER("Vertex Shader Active Time"), + /* A3: Not actually hooked up on Sandybridge. */ + /* A4: 3 */ COUNTER("Vertex Shader Stall Time - Core Stall"), + /* A5: 4 */ COUNTER("# VS threads loaded"), + /* A6: 5 */ COUNTER("Vertex Shader Ready but not running Time"), + /* A7: 6 */ COUNTER("Geometry Shader Active Time"), + /* A8: Not actually hooked up on Sandybridge. */ + /* A9: 7 */ COUNTER("Geometry Shader Stall Time - Core Stall"), + /* A10: 8 */ COUNTER("# GS threads loaded"), + /* A11: 9 */ COUNTER("Geometry Shader Ready but not running Time"), + /* A12: 10 */ COUNTER("Pixel Shader Active Time"), + /* A13: Not actually hooked up on Sandybridge. */ + /* A14: 11 */ COUNTER("Pixel Shader Stall Time - Core Stall"), + /* A15: 12 */ COUNTER("# PS threads loaded"), + /* A16: 13 */ COUNTER("Pixel Shader Ready but not running Time"), + /* A17: 14 */ COUNTER("Early Z Test Pixels Passing"), + /* A18: 15 */ COUNTER("Early Z Test Pixels Failing"), + /* A19: 16 */ COUNTER("Early Stencil Test Pixels Passing"), + /* A20: 17 */ COUNTER("Early Stencil Test Pixels Failing"), + /* A21: 18 */ COUNTER("Pixel Kill Count"), + /* A22: 19 */ COUNTER("Alpha Test Pixels Failed"), + /* A23: 20 */ COUNTER("Post PS Stencil Pixels Failed"), + /* A24: 21 */ COUNTER("Post PS Z buffer Pixels Failed"), + /* A25: 22 */ COUNTER("Pixels/samples Written in the frame buffer"), + /* A26: 23 */ COUNTER("GPU Busy"), + /* A27: 24 */ COUNTER("CL active and not stalled"), + /* A28: 25 */ COUNTER("SF active and stalled"), +}; + +/** + * Sandybridge: Counter Select = 001 + * A0 A1 A2 A3 A4 TIMESTAMP RPT_ID + * A5 A6 A7 A8 A9 A10 A11 A12 + * A13 A14 A15 A16 A17 A18 A19 A20 + * A21 A22 A23 A24 A25 A26 A27 A28 + * + * (Yes, this is a strange order.) We also have to remap for missing counters. */ +const static int gen6_oa_snapshot_layout[] = +{ + -1, /* Report ID */ + -1, /* TIMESTAMP (64-bit) */ + -1, /* ...second half... */ + 3, /* A4: Vertex Shader Stall Time - Core Stall */ + -1, /* A3: (not available) */ + 2, /* A2: Vertex Shader Active Time */ + 1, /* A1: Aggregated Core Array Stalled */ + 0, /* A0: Aggregated Core Array Active */ + 10, /* A12: Pixel Shader Active Time */ + 9, /* A11: Geometry Shader ready but not running Time */ + 8, /* A10: # GS threads loaded */ + 7, /* A9: Geometry Shader Stall Time - Core Stall */ + -1, /* A8: (not available) */ + 6, /* A7: Geometry Shader Active Time */ + 5, /* A6: Vertex Shader ready but not running Time */ + 4, /* A5: # VS Threads Loaded */ + 17, /* A20: Early Stencil Test Pixels Failing */ + 16, /* A19: Early Stencil Test Pixels Passing */ + 15, /* A18: Early Z Test Pixels Failing */ + 14, /* A17: Early Z Test Pixels Passing */ + 13, /* A16: Pixel Shader ready but not running Time */ + 12, /* A15: # PS threads loaded */ + 11, /* A14: Pixel Shader Stall Time - Core Stall */ + -1, /* A13: (not available) */ + 25, /* A28: SF active and stalled */ + 24, /* A27: CL active and not stalled */ + 23, /* A26: GPU Busy */ + 22, /* A25: Pixels/samples Written in the frame buffer */ + 21, /* A24: Post PS Z buffer Pixels Failed */ + 20, /* A23: Post PS Stencil Pixels Failed */ + 19, /* A22: Alpha Test Pixels Failed */ + 18, /* A21: Pixel Kill Count */ +}; + const static struct gl_perf_monitor_counter gen6_statistics_counters[] = { COUNTER64("IA_VERTICES_COUNT"), COUNTER64("IA_PRIMITIVES_COUNT"), @@ -146,6 +321,7 @@ const static int gen6_statistics_register_addresses[] = { }; const static struct gl_perf_monitor_group gen6_groups[] = { + GROUP("Observability Architecture Counters", INT_MAX, gen6_raw_oa_counters), GROUP("Pipeline Statistics Registers", INT_MAX, gen6_statistics_counters), }; /** @} */ @@ -154,6 +330,120 @@ const static struct gl_perf_monitor_group gen6_groups[] = { * Ivybridge/Baytrail/Haswell: * @{ */ +const static struct gl_perf_monitor_counter gen7_raw_oa_counters[] = { + COUNTER("Aggregated Core Array Active"), + COUNTER("Aggregated Core Array Stalled"), + COUNTER("Vertex Shader Active Time"), + COUNTER("Vertex Shader Stall Time - Core Stall"), + COUNTER("# VS threads loaded"), + COUNTER("Hull Shader Active Time"), + COUNTER("Hull Shader Stall Time - Core Stall"), + COUNTER("# HS threads loaded"), + COUNTER("Domain Shader Active Time"), + COUNTER("Domain Shader Stall Time - Core Stall"), + COUNTER("# DS threads loaded"), + COUNTER("Compute Shader Active Time"), + COUNTER("Compute Shader Stall Time - Core Stall"), + COUNTER("# CS threads loaded"), + COUNTER("Geometry Shader Active Time"), + COUNTER("Geometry Shader Stall Time - Core Stall"), + COUNTER("# GS threads loaded"), + COUNTER("Pixel Shader Active Time"), + COUNTER("Pixel Shader Stall Time - Core Stall"), + COUNTER("# PS threads loaded"), + COUNTER("HiZ Fast Z Test Pixels Passing"), + COUNTER("HiZ Fast Z Test Pixels Failing"), + COUNTER("Slow Z Test Pixels Passing"), + COUNTER("Slow Z Test Pixels Failing"), + COUNTER("Pixel Kill Count"), + COUNTER("Alpha Test Pixels Failed"), + COUNTER("Post PS Stencil Pixels Failed"), + COUNTER("Post PS Z buffer Pixels Failed"), + COUNTER("3D/GPGPU Render Target Writes"), + COUNTER("Render Engine Busy"), + COUNTER("VS bottleneck"), + COUNTER("GS bottleneck"), +}; + +/** + * Ivybridge/Baytrail/Haswell: Counter Select = 101 + * A4 A3 A2 A1 A0 TIMESTAMP ReportID + * A12 A11 A10 A9 A8 A7 A6 A5 + * A20 A19 A18 A17 A16 A15 A14 A13 + * A28 A27 A26 A25 A24 A23 A22 A21 + * A36 A35 A34 A33 A32 A31 A30 A29 + * A44 A43 A42 A41 A40 A39 A38 A37 + * B7 B6 B5 B4 B3 B2 B1 B0 + * Rsv Rsv Rsv Rsv Rsv Rsv Rsv Rsv + */ +const static int gen7_oa_snapshot_layout[] = +{ + -1, /* Report ID */ + -1, /* TIMESTAMP (64-bit) */ + -1, /* ...second half... */ + 0, /* A0: Aggregated Core Array Active */ + 1, /* A1: Aggregated Core Array Stalled */ + 2, /* A2: Vertex Shader Active Time */ + -1, /* A3: Reserved */ + 3, /* A4: Vertex Shader Stall Time - Core Stall */ + 4, /* A5: # VS threads loaded */ + -1, /* A6: Reserved */ + 5, /* A7: Hull Shader Active Time */ + -1, /* A8: Reserved */ + 6, /* A9: Hull Shader Stall Time - Core Stall */ + 7, /* A10: # HS threads loaded */ + -1, /* A11: Reserved */ + 8, /* A12: Domain Shader Active Time */ + -1, /* A13: Reserved */ + 9, /* A14: Domain Shader Stall Time - Core Stall */ + 10, /* A15: # DS threads loaded */ + -1, /* A16: Reserved */ + 11, /* A17: Compute Shader Active Time */ + -1, /* A18: Reserved */ + 12, /* A19: Compute Shader Stall Time - Core Stall */ + 13, /* A20: # CS threads loaded */ + -1, /* A21: Reserved */ + 14, /* A22: Geometry Shader Active Time */ + -1, /* A23: Reserved */ + 15, /* A24: Geometry Shader Stall Time - Core Stall */ + 16, /* A25: # GS threads loaded */ + -1, /* A26: Reserved */ + 17, /* A27: Pixel Shader Active Time */ + -1, /* A28: Reserved */ + 18, /* A29: Pixel Shader Stall Time - Core Stall */ + 19, /* A30: # PS threads loaded */ + -1, /* A31: Reserved */ + 20, /* A32: HiZ Fast Z Test Pixels Passing */ + 21, /* A33: HiZ Fast Z Test Pixels Failing */ + 22, /* A34: Slow Z Test Pixels Passing */ + 23, /* A35: Slow Z Test Pixels Failing */ + 24, /* A36: Pixel Kill Count */ + 25, /* A37: Alpha Test Pixels Failed */ + 26, /* A38: Post PS Stencil Pixels Failed */ + 27, /* A39: Post PS Z buffer Pixels Failed */ + 28, /* A40: 3D/GPGPU Render Target Writes */ + 29, /* A41: Render Engine Busy */ + 30, /* A42: VS bottleneck */ + 31, /* A43: GS bottleneck */ + -1, /* A44: Reserved */ + -1, /* B0 */ + -1, /* B1 */ + -1, /* B2 */ + -1, /* B3 */ + -1, /* B4 */ + -1, /* B5 */ + -1, /* B6 */ + -1, /* B7 */ + -1, /* Reserved */ + -1, /* Reserved */ + -1, /* Reserved */ + -1, /* Reserved */ + -1, /* Reserved */ + -1, /* Reserved */ + -1, /* Reserved */ + -1, /* Reserved */ +}; + const static struct gl_perf_monitor_counter gen7_statistics_counters[] = { COUNTER64("IA_VERTICES_COUNT"), COUNTER64("IA_PRIMITIVES_COUNT"), @@ -200,6 +490,7 @@ const static int gen7_statistics_register_addresses[] = { }; const static struct gl_perf_monitor_group gen7_groups[] = { + GROUP("Observability Architecture Counters", INT_MAX, gen7_raw_oa_counters), GROUP("Pipeline Statistics Registers", INT_MAX, gen7_statistics_counters), }; /** @} */ @@ -481,13 +772,19 @@ brw_init_performance_monitors(struct brw_context *brw) if (brw->gen == 5) { ctx->PerfMonitor.Groups = gen5_groups; ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen5_groups); + brw->perfmon.oa_snapshot_layout = gen5_oa_snapshot_layout; + brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen5_oa_snapshot_layout); } else if (brw->gen == 6) { ctx->PerfMonitor.Groups = gen6_groups; ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen6_groups); + brw->perfmon.oa_snapshot_layout = gen6_oa_snapshot_layout; + brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen6_oa_snapshot_layout); brw->perfmon.statistics_registers = gen6_statistics_register_addresses; } else if (brw->gen == 7) { ctx->PerfMonitor.Groups = gen7_groups; ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen7_groups); + brw->perfmon.oa_snapshot_layout = gen7_oa_snapshot_layout; + brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen7_oa_snapshot_layout); brw->perfmon.statistics_registers = gen7_statistics_register_addresses; } } |