summary | refs | log | tree | commit | diff | stats
path: root/src/mesa/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'src/mesa/drivers')
-rw-r--r-- src/mesa/drivers/dri/i965/Makefile.sources 1
-rw-r--r-- src/mesa/drivers/dri/i965/brw_context.c 4
-rw-r--r-- src/mesa/drivers/dri/i965/brw_context.h 43
-rw-r--r-- src/mesa/drivers/dri/i965/brw_performance_monitor.c 1471
-rw-r--r-- src/mesa/drivers/dri/i965/intel_batchbuffer.c 9
-rw-r--r-- src/mesa/drivers/dri/i965/intel_extensions.c 70
6 files changed, 1 insertion, 1597 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 0a7ba1bb34d..dd546826d19 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -134,7 +134,6 @@ i965_FILES = \
brw_multisample_state.h \
brw_nir_uniforms.cpp \
brw_object_purgeable.c \
- brw_performance_monitor.c \
brw_pipe_control.c \
brw_program.c \
brw_program.h \
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 4ca77c789b4..45490a0f5cf 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -1136,10 +1136,6 @@ brwCreateContext(gl_api api,
_mesa_initialize_dispatch_tables(ctx);
_mesa_initialize_vbo_vtxfmt(ctx);
- if (ctx->Extensions.AMD_performance_monitor) {
- brw_init_performance_monitors(brw);
- }
-
vbo_use_buffer_objects(ctx);
vbo_always_unmap_buffers(ctx);
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index a2817131a50..2dd2686e033 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1170,43 +1170,6 @@ struct brw_context
bool supported;
} predicate;
- struct {
- /** A map from pipeline statistics counter IDs to MMIO addresses. */
- const int *statistics_registers;
-
- /** The number of active monitors using OA counters. */
- unsigned oa_users;
-
- /**
- * A buffer object storing OA counter snapshots taken at the start and
- * end of each batch (creating "bookends" around the batch).
- */
- drm_intel_bo *bookend_bo;
-
- /** The number of snapshots written to bookend_bo. */
- int bookend_snapshots;
-
- /**
- * An array of monitors whose results haven't yet been assembled based on
- * the data in buffer objects.
- *
- * These may be active, or have already ended. However, the results
- * have not been requested.
- */
- struct brw_perf_monitor_object **unresolved;
- int unresolved_elements;
- int unresolved_array_size;
-
- /**
- * Mapping from a uint32_t offset within an OA snapshot to the ID of
- * the counter which MI_REPORT_PERF_COUNT stores there.
- */
- const int *oa_snapshot_layout;
-
- /** Number of 32-bit entries in a hardware counter snapshot. */
- int entries_per_oa_snapshot;
- } perfmon;
-
int num_atoms[BRW_NUM_PIPELINES];
const struct brw_tracked_state render_atoms[76];
const struct brw_tracked_state compute_atoms[11];
@@ -1522,12 +1485,6 @@ bool brw_render_target_supported(struct brw_context *brw,
struct gl_renderbuffer *rb);
uint32_t brw_depth_format(struct brw_context *brw, mesa_format format);
-/* brw_performance_monitor.c */
-void brw_init_performance_monitors(struct brw_context *brw);
-void brw_dump_perf_monitors(struct brw_context *brw);
-void brw_perf_monitor_new_batch(struct brw_context *brw);
-void brw_perf_monitor_finish_batch(struct brw_context *brw);
-
/* intel_buffer_objects.c */
int brw_bo_map(struct brw_context *brw, drm_intel_bo *bo, int write_enable,
const char *bo_name);
diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
deleted file mode 100644
index f8e50e10fa3..00000000000
--- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c
+++ /dev/null
@@ -1,1471 +0,0 @@
-/*
- * Copyright © 2013 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file brw_performance_monitor.c
- *
- * Implementation of the GL_AMD_performance_monitor extension.
- *
- * On Gen5+ hardware, we have two sources of performance counter data:
- * the Observability Architecture counters (MI_REPORT_PERF_COUNT), and
- * the Pipeline Statistics Registers. We expose both sets of raw data,
- * as well as some useful processed values.
- *
- * The Observability Architecture (OA) counters for Gen6+ are documented
- * in a separate document from the rest of the PRMs. It is available at:
- * https://01.org/linuxgraphics/documentation/driver-documentation-prms
- * => 2013 Intel Core Processor Family => Observability Performance Counters
- * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell.)
- *
- * On Ironlake, the OA counters were called "CHAPS" counters. Sadly, no public
- * documentation exists; our implementation is based on the source code for the
- * intel_perf_counters utility (which is available as part of intel-gpu-tools).
- */
-
-#include <limits.h>
-
-#include "util/bitset.h"
-#include "main/hash.h"
-#include "main/macros.h"
-#include "main/mtypes.h"
-#include "main/performance_monitor.h"
-
-#include "util/ralloc.h"
-
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-
-#define FILE_DEBUG_FLAG DEBUG_PERFMON
-
-/**
- * i965 representation of a performance monitor object.
- */
-struct brw_perf_monitor_object
-{
- /** The base class. */
- struct gl_perf_monitor_object base;
-
- /**
- * BO containing OA counter snapshots at monitor Begin/End time.
- */
- drm_intel_bo *oa_bo;
-
- /** Indexes into bookend_bo (snapshot numbers) for various segments. */
- int oa_head_end;
- int oa_middle_start;
- int oa_tail_start;
-
- /**
- * Storage for OA results accumulated so far.
- *
- * An array indexed by the counter ID in the OA_COUNTERS group.
- *
- * When we run out of space in bookend_bo, we compute the results so far
- * and add them to the value stored here. Then, we can discard bookend_bo.
- */
- uint32_t *oa_results;
-
- /**
- * BO containing starting and ending snapshots for any active pipeline
- * statistics counters.
- */
- drm_intel_bo *pipeline_stats_bo;
-
- /**
- * Storage for final pipeline statistics counter results.
- */
- uint64_t *pipeline_stats_results;
-};
-
-/** Downcasting convenience macro. */
-static inline struct brw_perf_monitor_object *
-brw_perf_monitor(struct gl_perf_monitor_object *m)
-{
- return (struct brw_perf_monitor_object *) m;
-}
-
-#define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048
-
-/* A random value used to ensure we're getting valid snapshots. */
-#define REPORT_ID 0xd2e9c607
-
-/******************************************************************************/
-
-#define COUNTER(name) \
- { \
- .Name = name, \
- .Type = GL_UNSIGNED_INT, \
- .Minimum = { .u32 = 0 }, \
- .Maximum = { .u32 = ~0 }, \
- }
-
-#define COUNTER64(name) \
- { \
- .Name = name, \
- .Type = GL_UNSIGNED_INT64_AMD, \
- .Minimum = { .u64 = 0 }, \
- .Maximum = { .u64 = ~0 }, \
- }
-
-#define GROUP(name, max_active, counter_list) \
- { \
- .Name = name, \
- .MaxActiveCounters = max_active, \
- .Counters = counter_list, \
- .NumCounters = ARRAY_SIZE(counter_list), \
- }
-
-/** Performance Monitor Group IDs */
-enum brw_counter_groups {
- OA_COUNTERS, /* Observability Architecture (MI_REPORT_PERF_COUNT) Counters */
- PIPELINE_STATS_COUNTERS, /* Pipeline Statistics Register Counters */
-};
-
-/**
- * Ironlake:
- * @{
- *
- * The list of CHAPS counters unfortunately does not appear in any public
- * documentation, but is available by reading the source code for the
- * intel_perf_counters utility (shipped as part of intel-gpu-tools).
- */
-static const struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = {
- COUNTER("cycles the CS unit is starved"),
- COUNTER("cycles the CS unit is stalled"),
- COUNTER("cycles the VF unit is starved"),
- COUNTER("cycles the VF unit is stalled"),
- COUNTER("cycles the VS unit is starved"),
- COUNTER("cycles the VS unit is stalled"),
- COUNTER("cycles the GS unit is starved"),
- COUNTER("cycles the GS unit is stalled"),
- COUNTER("cycles the CL unit is starved"),
- COUNTER("cycles the CL unit is stalled"),
- COUNTER("cycles the SF unit is starved"),
- COUNTER("cycles the SF unit is stalled"),
- COUNTER("cycles the WZ unit is starved"),
- COUNTER("cycles the WZ unit is stalled"),
- COUNTER("Z buffer read/write"),
- COUNTER("cycles each EU was active"),
- COUNTER("cycles each EU was suspended"),
- COUNTER("cycles threads loaded all EUs"),
- COUNTER("cycles filtering active"),
- COUNTER("cycles PS threads executed"),
- COUNTER("subspans written to RC"),
- COUNTER("bytes read for texture reads"),
- COUNTER("texels returned from sampler"),
- COUNTER("polygons not culled"),
- COUNTER("clocks MASF has valid message"),
- COUNTER("64b writes/reads from RC"),
- COUNTER("reads on dataport"),
- COUNTER("clocks MASF has valid msg not consumed by sampler"),
- COUNTER("cycles any EU is stalled for math"),
-};
-
-static const int gen5_oa_snapshot_layout[] =
-{
- -1, /* Report ID */
- -1, /* TIMESTAMP (64-bit) */
- -1, /* ...second half... */
- 0, /* cycles the CS unit is starved */
- 1, /* cycles the CS unit is stalled */
- 2, /* cycles the VF unit is starved */
- 3, /* cycles the VF unit is stalled */
- 4, /* cycles the VS unit is starved */
- 5, /* cycles the VS unit is stalled */
- 6, /* cycles the GS unit is starved */
- 7, /* cycles the GS unit is stalled */
- 8, /* cycles the CL unit is starved */
- 9, /* cycles the CL unit is stalled */
- 10, /* cycles the SF unit is starved */
- 11, /* cycles the SF unit is stalled */
- 12, /* cycles the WZ unit is starved */
- 13, /* cycles the WZ unit is stalled */
- 14, /* Z buffer read/write */
- 15, /* cycles each EU was active */
- 16, /* cycles each EU was suspended */
- 17, /* cycles threads loaded all EUs */
- 18, /* cycles filtering active */
- 19, /* cycles PS threads executed */
- 20, /* subspans written to RC */
- 21, /* bytes read for texture reads */
- 22, /* texels returned from sampler */
- 23, /* polygons not culled */
- 24, /* clocks MASF has valid message */
- 25, /* 64b writes/reads from RC */
- 26, /* reads on dataport */
- 27, /* clocks MASF has valid msg not consumed by sampler */
- 28, /* cycles any EU is stalled for math */
-};
-
-static const struct gl_perf_monitor_group gen5_groups[] = {
- [OA_COUNTERS] = GROUP("CHAPS Counters", INT_MAX, gen5_raw_chaps_counters),
- /* Our pipeline statistics counter handling requires hardware contexts. */
-};
-/** @} */
-
-/**
- * Sandybridge:
- * @{
- *
- * A few of the counters here (A17-A20) are not included in the latest
- * documentation, but are described in the Ironlake PRM (which strangely
- * documents Sandybridge's performance counter system, not Ironlake's).
- * It's unclear whether they work or not; empirically, they appear to.
- */
-
-/**
- * Aggregating counters A0-A28:
- */
-static const struct gl_perf_monitor_counter gen6_raw_oa_counters[] = {
- /* A0: 0 */ COUNTER("Aggregated Core Array Active"),
- /* A1: 1 */ COUNTER("Aggregated Core Array Stalled"),
- /* A2: 2 */ COUNTER("Vertex Shader Active Time"),
- /* A3: Not actually hooked up on Sandybridge. */
- /* A4: 3 */ COUNTER("Vertex Shader Stall Time - Core Stall"),
- /* A5: 4 */ COUNTER("# VS threads loaded"),
- /* A6: 5 */ COUNTER("Vertex Shader Ready but not running Time"),
- /* A7: 6 */ COUNTER("Geometry Shader Active Time"),
- /* A8: Not actually hooked up on Sandybridge. */
- /* A9: 7 */ COUNTER("Geometry Shader Stall Time - Core Stall"),
- /* A10: 8 */ COUNTER("# GS threads loaded"),
- /* A11: 9 */ COUNTER("Geometry Shader Ready but not running Time"),
- /* A12: 10 */ COUNTER("Pixel Shader Active Time"),
- /* A13: Not actually hooked up on Sandybridge. */
- /* A14: 11 */ COUNTER("Pixel Shader Stall Time - Core Stall"),
- /* A15: 12 */ COUNTER("# PS threads loaded"),
- /* A16: 13 */ COUNTER("Pixel Shader Ready but not running Time"),
- /* A17: 14 */ COUNTER("Early Z Test Pixels Passing"),
- /* A18: 15 */ COUNTER("Early Z Test Pixels Failing"),
- /* A19: 16 */ COUNTER("Early Stencil Test Pixels Passing"),
- /* A20: 17 */ COUNTER("Early Stencil Test Pixels Failing"),
- /* A21: 18 */ COUNTER("Pixel Kill Count"),
- /* A22: 19 */ COUNTER("Alpha Test Pixels Failed"),
- /* A23: 20 */ COUNTER("Post PS Stencil Pixels Failed"),
- /* A24: 21 */ COUNTER("Post PS Z buffer Pixels Failed"),
- /* A25: 22 */ COUNTER("Pixels/samples Written in the frame buffer"),
- /* A26: 23 */ COUNTER("GPU Busy"),
- /* A27: 24 */ COUNTER("CL active and not stalled"),
- /* A28: 25 */ COUNTER("SF active and stalled"),
-};
-
-/**
- * Sandybridge: Counter Select = 001
- * A0 A1 A2 A3 A4 TIMESTAMP RPT_ID
- * A5 A6 A7 A8 A9 A10 A11 A12
- * A13 A14 A15 A16 A17 A18 A19 A20
- * A21 A22 A23 A24 A25 A26 A27 A28
- *
- * (Yes, this is a strange order.) We also have to remap for missing counters.
- */
-static const int gen6_oa_snapshot_layout[] =
-{
- -1, /* Report ID */
- -1, /* TIMESTAMP (64-bit) */
- -1, /* ...second half... */
- 3, /* A4: Vertex Shader Stall Time - Core Stall */
- -1, /* A3: (not available) */
- 2, /* A2: Vertex Shader Active Time */
- 1, /* A1: Aggregated Core Array Stalled */
- 0, /* A0: Aggregated Core Array Active */
- 10, /* A12: Pixel Shader Active Time */
- 9, /* A11: Geometry Shader ready but not running Time */
- 8, /* A10: # GS threads loaded */
- 7, /* A9: Geometry Shader Stall Time - Core Stall */
- -1, /* A8: (not available) */
- 6, /* A7: Geometry Shader Active Time */
- 5, /* A6: Vertex Shader ready but not running Time */
- 4, /* A5: # VS Threads Loaded */
- 17, /* A20: Early Stencil Test Pixels Failing */
- 16, /* A19: Early Stencil Test Pixels Passing */
- 15, /* A18: Early Z Test Pixels Failing */
- 14, /* A17: Early Z Test Pixels Passing */
- 13, /* A16: Pixel Shader ready but not running Time */
- 12, /* A15: # PS threads loaded */
- 11, /* A14: Pixel Shader Stall Time - Core Stall */
- -1, /* A13: (not available) */
- 25, /* A28: SF active and stalled */
- 24, /* A27: CL active and not stalled */
- 23, /* A26: GPU Busy */
- 22, /* A25: Pixels/samples Written in the frame buffer */
- 21, /* A24: Post PS Z buffer Pixels Failed */
- 20, /* A23: Post PS Stencil Pixels Failed */
- 19, /* A22: Alpha Test Pixels Failed */
- 18, /* A21: Pixel Kill Count */
-};
-
-static const struct gl_perf_monitor_counter gen6_statistics_counters[] = {
- COUNTER64("IA_VERTICES_COUNT"),
- COUNTER64("IA_PRIMITIVES_COUNT"),
- COUNTER64("VS_INVOCATION_COUNT"),
- COUNTER64("GS_INVOCATION_COUNT"),
- COUNTER64("GS_PRIMITIVES_COUNT"),
- COUNTER64("CL_INVOCATION_COUNT"),
- COUNTER64("CL_PRIMITIVES_COUNT"),
- COUNTER64("PS_INVOCATION_COUNT"),
- COUNTER64("PS_DEPTH_COUNT"),
- COUNTER64("SO_NUM_PRIMS_WRITTEN"),
- COUNTER64("SO_PRIM_STORAGE_NEEDED"),
-};
-
-/** MMIO register addresses for each pipeline statistics counter. */
-static const int gen6_statistics_register_addresses[] = {
- IA_VERTICES_COUNT,
- IA_PRIMITIVES_COUNT,
- VS_INVOCATION_COUNT,
- GS_INVOCATION_COUNT,
- GS_PRIMITIVES_COUNT,
- CL_INVOCATION_COUNT,
- CL_PRIMITIVES_COUNT,
- PS_INVOCATION_COUNT,
- PS_DEPTH_COUNT,
- GEN6_SO_NUM_PRIMS_WRITTEN,
- GEN6_SO_PRIM_STORAGE_NEEDED,
-};
-
-static const struct gl_perf_monitor_group gen6_groups[] = {
- GROUP("Observability Architecture Counters", INT_MAX, gen6_raw_oa_counters),
- GROUP("Pipeline Statistics Registers", INT_MAX, gen6_statistics_counters),
-};
-/** @} */
-
-/**
- * Ivybridge/Baytrail/Haswell:
- * @{
- */
-static const struct gl_perf_monitor_counter gen7_raw_oa_counters[] = {
- COUNTER("Aggregated Core Array Active"),
- COUNTER("Aggregated Core Array Stalled"),
- COUNTER("Vertex Shader Active Time"),
- COUNTER("Vertex Shader Stall Time - Core Stall"),
- COUNTER("# VS threads loaded"),
- COUNTER("Hull Shader Active Time"),
- COUNTER("Hull Shader Stall Time - Core Stall"),
- COUNTER("# HS threads loaded"),
- COUNTER("Domain Shader Active Time"),
- COUNTER("Domain Shader Stall Time - Core Stall"),
- COUNTER("# DS threads loaded"),
- COUNTER("Compute Shader Active Time"),
- COUNTER("Compute Shader Stall Time - Core Stall"),
- COUNTER("# CS threads loaded"),
- COUNTER("Geometry Shader Active Time"),
- COUNTER("Geometry Shader Stall Time - Core Stall"),
- COUNTER("# GS threads loaded"),
- COUNTER("Pixel Shader Active Time"),
- COUNTER("Pixel Shader Stall Time - Core Stall"),
- COUNTER("# PS threads loaded"),
- COUNTER("HiZ Fast Z Test Pixels Passing"),
- COUNTER("HiZ Fast Z Test Pixels Failing"),
- COUNTER("Slow Z Test Pixels Passing"),
- COUNTER("Slow Z Test Pixels Failing"),
- COUNTER("Pixel Kill Count"),
- COUNTER("Alpha Test Pixels Failed"),
- COUNTER("Post PS Stencil Pixels Failed"),
- COUNTER("Post PS Z buffer Pixels Failed"),
- COUNTER("3D/GPGPU Render Target Writes"),
- COUNTER("Render Engine Busy"),
- COUNTER("VS bottleneck"),
- COUNTER("GS bottleneck"),
-};
-
-/**
- * Ivybridge/Baytrail/Haswell: Counter Select = 101
- * A4 A3 A2 A1 A0 TIMESTAMP ReportID
- * A12 A11 A10 A9 A8 A7 A6 A5
- * A20 A19 A18 A17 A16 A15 A14 A13
- * A28 A27 A26 A25 A24 A23 A22 A21
- * A36 A35 A34 A33 A32 A31 A30 A29
- * A44 A43 A42 A41 A40 A39 A38 A37
- * B7 B6 B5 B4 B3 B2 B1 B0
- * Rsv Rsv Rsv Rsv Rsv Rsv Rsv Rsv
- */
-static const int gen7_oa_snapshot_layout[] =
-{
- -1, /* Report ID */
- -1, /* TIMESTAMP (64-bit) */
- -1, /* ...second half... */
- 0, /* A0: Aggregated Core Array Active */
- 1, /* A1: Aggregated Core Array Stalled */
- 2, /* A2: Vertex Shader Active Time */
- -1, /* A3: Reserved */
- 3, /* A4: Vertex Shader Stall Time - Core Stall */
- 4, /* A5: # VS threads loaded */
- -1, /* A6: Reserved */
- 5, /* A7: Hull Shader Active Time */
- -1, /* A8: Reserved */
- 6, /* A9: Hull Shader Stall Time - Core Stall */
- 7, /* A10: # HS threads loaded */
- -1, /* A11: Reserved */
- 8, /* A12: Domain Shader Active Time */
- -1, /* A13: Reserved */
- 9, /* A14: Domain Shader Stall Time - Core Stall */
- 10, /* A15: # DS threads loaded */
- -1, /* A16: Reserved */
- 11, /* A17: Compute Shader Active Time */
- -1, /* A18: Reserved */
- 12, /* A19: Compute Shader Stall Time - Core Stall */
- 13, /* A20: # CS threads loaded */
- -1, /* A21: Reserved */
- 14, /* A22: Geometry Shader Active Time */
- -1, /* A23: Reserved */
- 15, /* A24: Geometry Shader Stall Time - Core Stall */
- 16, /* A25: # GS threads loaded */
- -1, /* A26: Reserved */
- 17, /* A27: Pixel Shader Active Time */
- -1, /* A28: Reserved */
- 18, /* A29: Pixel Shader Stall Time - Core Stall */
- 19, /* A30: # PS threads loaded */
- -1, /* A31: Reserved */
- 20, /* A32: HiZ Fast Z Test Pixels Passing */
- 21, /* A33: HiZ Fast Z Test Pixels Failing */
- 22, /* A34: Slow Z Test Pixels Passing */
- 23, /* A35: Slow Z Test Pixels Failing */
- 24, /* A36: Pixel Kill Count */
- 25, /* A37: Alpha Test Pixels Failed */
- 26, /* A38: Post PS Stencil Pixels Failed */
- 27, /* A39: Post PS Z buffer Pixels Failed */
- 28, /* A40: 3D/GPGPU Render Target Writes */
- 29, /* A41: Render Engine Busy */
- 30, /* A42: VS bottleneck */
- 31, /* A43: GS bottleneck */
- -1, /* A44: Reserved */
- -1, /* B0 */
- -1, /* B1 */
- -1, /* B2 */
- -1, /* B3 */
- -1, /* B4 */
- -1, /* B5 */
- -1, /* B6 */
- -1, /* B7 */
- -1, /* Reserved */
- -1, /* Reserved */
- -1, /* Reserved */
- -1, /* Reserved */
- -1, /* Reserved */
- -1, /* Reserved */
- -1, /* Reserved */
- -1, /* Reserved */
-};
-
-static const struct gl_perf_monitor_counter gen7_statistics_counters[] = {
- COUNTER64("IA_VERTICES_COUNT"),
- COUNTER64("IA_PRIMITIVES_COUNT"),
- COUNTER64("VS_INVOCATION_COUNT"),
- COUNTER64("HS_INVOCATION_COUNT"),
- COUNTER64("DS_INVOCATION_COUNT"),
- COUNTER64("GS_INVOCATION_COUNT"),
- COUNTER64("GS_PRIMITIVES_COUNT"),
- COUNTER64("CL_INVOCATION_COUNT"),
- COUNTER64("CL_PRIMITIVES_COUNT"),
- COUNTER64("PS_INVOCATION_COUNT"),
- COUNTER64("PS_DEPTH_COUNT"),
- COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 0)"),
- COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 1)"),
- COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 2)"),
- COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 3)"),
- COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 0)"),
- COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 1)"),
- COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 2)"),
- COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 3)"),
-};
-
-/** MMIO register addresses for each pipeline statistics counter. */
-static const int gen7_statistics_register_addresses[] = {
- IA_VERTICES_COUNT,
- IA_PRIMITIVES_COUNT,
- VS_INVOCATION_COUNT,
- HS_INVOCATION_COUNT,
- DS_INVOCATION_COUNT,
- GS_INVOCATION_COUNT,
- GS_PRIMITIVES_COUNT,
- CL_INVOCATION_COUNT,
- CL_PRIMITIVES_COUNT,
- PS_INVOCATION_COUNT,
- PS_DEPTH_COUNT,
- GEN7_SO_NUM_PRIMS_WRITTEN(0),
- GEN7_SO_NUM_PRIMS_WRITTEN(1),
- GEN7_SO_NUM_PRIMS_WRITTEN(2),
- GEN7_SO_NUM_PRIMS_WRITTEN(3),
- GEN7_SO_PRIM_STORAGE_NEEDED(0),
- GEN7_SO_PRIM_STORAGE_NEEDED(1),
- GEN7_SO_PRIM_STORAGE_NEEDED(2),
- GEN7_SO_PRIM_STORAGE_NEEDED(3),
-};
-
-static const struct gl_perf_monitor_group gen7_groups[] = {
- GROUP("Observability Architecture Counters", INT_MAX, gen7_raw_oa_counters),
- GROUP("Pipeline Statistics Registers", INT_MAX, gen7_statistics_counters),
-};
-/** @} */
-
-/******************************************************************************/
-
-static GLboolean brw_is_perf_monitor_result_available(struct gl_context *, struct gl_perf_monitor_object *);
-
-static void
-dump_perf_monitor_callback(GLuint name, void *monitor_void, void *brw_void)
-{
- struct brw_context *brw = brw_void;
- struct gl_context *ctx = brw_void;
- struct gl_perf_monitor_object *m = monitor_void;
- struct brw_perf_monitor_object *monitor = monitor_void;
-
- const char *resolved = "";
- for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
- if (brw->perfmon.unresolved[i] == monitor) {
- resolved = "Unresolved";
- break;
- }
- }
-
- DBG("%4d %-7s %-6s %-10s %-11s <%3d, %3d, %3d> %-6s %-9s\n",
- name,
- m->Active ? "Active" : "",
- m->Ended ? "Ended" : "",
- resolved,
- brw_is_perf_monitor_result_available(ctx, m) ? "Available" : "",
- monitor->oa_head_end,
- monitor->oa_middle_start,
- monitor->oa_tail_start,
- monitor->oa_bo ? "OA BO" : "",
- monitor->pipeline_stats_bo ? "Stats BO" : "");
-}
-
-void
-brw_dump_perf_monitors(struct brw_context *brw)
-{
- struct gl_context *ctx = &brw->ctx;
- DBG("Monitors: (OA users = %d)\n", brw->perfmon.oa_users);
- _mesa_HashWalk(ctx->PerfMonitor.Monitors, dump_perf_monitor_callback, brw);
-}
-
-/******************************************************************************/
-
-static bool
-monitor_needs_statistics_registers(struct brw_context *brw,
- struct gl_perf_monitor_object *m)
-{
- return brw->gen >= 6 && m->ActiveGroups[PIPELINE_STATS_COUNTERS];
-}
-
-/**
- * Take a snapshot of any monitored pipeline statistics counters.
- */
-static void
-snapshot_statistics_registers(struct brw_context *brw,
- struct brw_perf_monitor_object *monitor,
- uint32_t offset)
-{
- struct gl_context *ctx = &brw->ctx;
- const int group = PIPELINE_STATS_COUNTERS;
- const int num_counters = ctx->PerfMonitor.Groups[group].NumCounters;
-
- brw_emit_mi_flush(brw);
-
- for (int i = 0; i < num_counters; i++) {
- if (BITSET_TEST(monitor->base.ActiveCounters[group], i)) {
- assert(ctx->PerfMonitor.Groups[group].Counters[i].Type ==
- GL_UNSIGNED_INT64_AMD);
-
- brw_store_register_mem64(brw, monitor->pipeline_stats_bo,
- brw->perfmon.statistics_registers[i],
- offset + i * sizeof(uint64_t));
- }
- }
-}
-
-/**
- * Gather results from pipeline_stats_bo, storing the final values.
- *
- * This allows us to free pipeline_stats_bo (which is 4K) in favor of a much
- * smaller array of final results.
- */
-static void
-gather_statistics_results(struct brw_context *brw,
- struct brw_perf_monitor_object *monitor)
-{
- struct gl_context *ctx = &brw->ctx;
- const int num_counters =
- ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters;
-
- monitor->pipeline_stats_results = calloc(num_counters, sizeof(uint64_t));
- if (monitor->pipeline_stats_results == NULL) {
- _mesa_error_no_memory(__func__);
- return;
- }
-
- drm_intel_bo_map(monitor->pipeline_stats_bo, false);
- uint64_t *start = monitor->pipeline_stats_bo->virtual;
- uint64_t *end = start + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint64_t));
-
- for (int i = 0; i < num_counters; i++) {
- monitor->pipeline_stats_results[i] = end[i] - start[i];
- }
- drm_intel_bo_unmap(monitor->pipeline_stats_bo);
- drm_intel_bo_unreference(monitor->pipeline_stats_bo);
- monitor->pipeline_stats_bo = NULL;
-}
-
-/******************************************************************************/
-
-static bool
-monitor_needs_oa(struct brw_context *brw,
- struct gl_perf_monitor_object *m)
-{
- return m->ActiveGroups[OA_COUNTERS];
-}
-
-/**
- * Enable the Observability Architecture counters by whacking OACONTROL.
- */
-static void
-start_oa_counters(struct brw_context *brw)
-{
- unsigned counter_format;
-
- /* Pick the counter format which gives us all the counters. */
- switch (brw->gen) {
- case 5:
- return; /* Ironlake counters are always running. */
- case 6:
- counter_format = 0b001;
- break;
- case 7:
- counter_format = 0b101;
- break;
- default:
- unreachable("Tried to enable OA counters on an unsupported generation.");
- }
-
- BEGIN_BATCH(3);
- OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
- OUT_BATCH(OACONTROL);
- OUT_BATCH(counter_format << OACONTROL_COUNTER_SELECT_SHIFT |
- OACONTROL_ENABLE_COUNTERS);
- ADVANCE_BATCH();
-}
-
-/**
- * Disable OA counters.
- */
-static void
-stop_oa_counters(struct brw_context *brw)
-{
- /* Ironlake counters never stop. */
- if (brw->gen == 5)
- return;
-
- BEGIN_BATCH(3);
- OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
- OUT_BATCH(OACONTROL);
- OUT_BATCH(0);
- ADVANCE_BATCH();
-}
-
-/**
- * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot,
- * including the required PIPE_CONTROL flushes.
- *
- * Sandybridge is the worst case scenario: brw_emit_mi_flush expands to four
- * PIPE_CONTROLs which are 5 DWords each. We have to flush before and after
- * MI_REPORT_PERF_COUNT, so multiply by two. Finally, add the 3 DWords for
- * MI_REPORT_PERF_COUNT itself.
- */
-#define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (4 * 5) + 3)
-
-/**
- * Emit an MI_REPORT_PERF_COUNT command packet.
- *
- * This writes the current OA counter values to buffer.
- */
-static void
-emit_mi_report_perf_count(struct brw_context *brw,
- drm_intel_bo *bo,
- uint32_t offset_in_bytes,
- uint32_t report_id)
-{
- assert(offset_in_bytes % 64 == 0);
-
- /* Make sure the commands to take a snapshot fits in a single batch. */
- intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
- RENDER_RING);
- int batch_used = USED_BATCH(brw->batch);
-
- /* Reports apparently don't always get written unless we flush first. */
- brw_emit_mi_flush(brw);
-
- if (brw->gen == 5) {
- /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all
- * the counters. The report ID is ignored in the second set.
- */
- BEGIN_BATCH(6);
- OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_0);
- OUT_RELOC(bo,
- I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
- offset_in_bytes);
- OUT_BATCH(report_id);
-
- OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_1);
- OUT_RELOC(bo,
- I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
- offset_in_bytes + 64);
- OUT_BATCH(report_id);
- ADVANCE_BATCH();
- } else if (brw->gen == 6) {
- BEGIN_BATCH(3);
- OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
- OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
- offset_in_bytes | MI_COUNTER_ADDRESS_GTT);
- OUT_BATCH(report_id);
- ADVANCE_BATCH();
- } else if (brw->gen == 7) {
- BEGIN_BATCH(3);
- OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
- OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
- offset_in_bytes);
- OUT_BATCH(report_id);
- ADVANCE_BATCH();
- } else {
- unreachable("Unsupported generation for performance counters.");
- }
-
- /* Reports apparently don't always get written unless we flush after. */
- brw_emit_mi_flush(brw);
-
- (void) batch_used;
- assert(USED_BATCH(brw->batch) - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
-}
-
-/**
- * Add a monitor to the global list of "unresolved monitors."
- *
- * Monitors are "unresolved" if they refer to OA counter snapshots in
- * bookend_bo. Results (even partial ones) must be gathered for all
- * unresolved monitors before it's safe to discard bookend_bo.
- */
-static void
-add_to_unresolved_monitor_list(struct brw_context *brw,
- struct brw_perf_monitor_object *monitor)
-{
- if (brw->perfmon.unresolved_elements >=
- brw->perfmon.unresolved_array_size) {
- brw->perfmon.unresolved_array_size *= 2;
- brw->perfmon.unresolved = reralloc(brw, brw->perfmon.unresolved,
- struct brw_perf_monitor_object *,
- brw->perfmon.unresolved_array_size);
- }
-
- brw->perfmon.unresolved[brw->perfmon.unresolved_elements++] = monitor;
-}
-
-/**
- * If possible, throw away the contents of bookend BO.
- *
- * When all monitoring stops, and no monitors need data from bookend_bo to
- * compute results, we can discard it and start writing snapshots at the
- * beginning again. This helps reduce the amount of buffer wraparound.
- */
-static void
-clean_bookend_bo(struct brw_context *brw)
-{
- if (brw->perfmon.unresolved_elements == 0) {
- DBG("***Resetting bookend snapshots to 0\n");
- brw->perfmon.bookend_snapshots = 0;
- }
-}
-
-/**
- * Remove a monitor from the global list of "unresolved monitors."
- *
- * This can happen when:
- * - We finish computing a completed monitor's results.
- * - We discard unwanted monitor results.
- * - A monitor's results can be computed without relying on bookend_bo.
- */
-static void
-drop_from_unresolved_monitor_list(struct brw_context *brw,
- struct brw_perf_monitor_object *monitor)
-{
- for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
- if (brw->perfmon.unresolved[i] == monitor) {
- int last_elt = --brw->perfmon.unresolved_elements;
-
- if (i == last_elt) {
- brw->perfmon.unresolved[i] = NULL;
- } else {
- brw->perfmon.unresolved[i] = brw->perfmon.unresolved[last_elt];
- }
-
- clean_bookend_bo(brw);
- return;
- }
- }
-}
-
-/**
- * Given pointers to starting and ending OA snapshots, add the deltas for each
- * counter to the results.
- */
-static void
-add_deltas(struct brw_context *brw,
- struct brw_perf_monitor_object *monitor,
- uint32_t *start, uint32_t *end)
-{
- /* Look for expected report ID values to ensure data is present. */
- assert(start[0] == REPORT_ID);
- assert(end[0] == REPORT_ID);
-
- /* Subtract each counter's ending and starting values, then add the
- * difference to the counter's value so far.
- */
- for (int i = 3; i < brw->perfmon.entries_per_oa_snapshot; i++) {
- /* When debugging, it's useful to note when the ending value is less than
- * the starting value; aggregating counters should always increase in
- * value (or remain unchanged). This happens periodically due to
- * wraparound, but can also indicate serious problems.
- */
-#ifdef DEBUG
- if (end[i] < start[i]) {
- int counter = brw->perfmon.oa_snapshot_layout[i];
- if (counter >= 0) {
- DBG("WARNING: \"%s\" ending value was less than the starting "
- "value: %u < %u (end - start = %u)\n",
- brw->ctx.PerfMonitor.Groups[0].Counters[counter].Name,
- end[i], start[i], end[i] - start[i]);
- }
- }
-#endif
- monitor->oa_results[i] += end[i] - start[i];
- }
-}
-
-/**
- * Gather OA counter results (partial or full) from a series of snapshots.
- *
- * Monitoring can start or stop at any time, likely at some point mid-batch.
- * We write snapshots for both events, storing them in monitor->oa_bo.
- *
- * Ideally, we would simply subtract those two snapshots to obtain the final
- * counter results. Unfortunately, our hardware doesn't preserve their values
- * across context switches or GPU sleep states. In order to support multiple
- * concurrent OA clients, as well as reliable data across power management,
- * we have to take snapshots at the start and end of batches as well.
- *
- * This results in a three-part sequence of (start, end) intervals:
- * - The "head" is from the BeginPerfMonitor snapshot to the end of the first
- * batchbuffer.
- * - The "middle" is a series of (batch start, batch end) snapshots which
- * bookend any batchbuffers between the ones which start/end monitoring.
- * - The "tail" is from the start of the last batch where monitoring was
- * active to the EndPerfMonitor snapshot.
- *
- * Due to wrapping in the bookend BO, we may have to accumulate partial results.
- * If so, we handle the "head" and any "middle" results so far. When monitoring
- * eventually ends, we handle additional "middle" batches and the "tail."
- */
-static void
-gather_oa_results(struct brw_context *brw,
- struct brw_perf_monitor_object *monitor,
- uint32_t *bookend_buffer)
-{
- struct gl_perf_monitor_object *m = &monitor->base;
- assert(monitor->oa_bo != NULL);
-
- drm_intel_bo_map(monitor->oa_bo, false);
- uint32_t *monitor_buffer = monitor->oa_bo->virtual;
-
- /* If monitoring was entirely contained within a single batch, then the
- * bookend BO is irrelevant. Just subtract monitor->bo's two snapshots.
- */
- if (monitor->oa_middle_start == -1) {
- add_deltas(brw, monitor,
- monitor_buffer,
- monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES /
- sizeof(uint32_t)));
- drm_intel_bo_unmap(monitor->oa_bo);
- return;
- }
-
- const ptrdiff_t snapshot_size = brw->perfmon.entries_per_oa_snapshot;
-
- /* First, add the contributions from the "head" interval:
- * (snapshot taken at BeginPerfMonitor time,
- * snapshot taken at the end of the first batch after monitoring began)
- */
- if (monitor->oa_head_end != -1) {
- assert(monitor->oa_head_end < brw->perfmon.bookend_snapshots);
- add_deltas(brw, monitor,
- monitor_buffer,
- bookend_buffer + snapshot_size * monitor->oa_head_end);
-
- /* Make sure we don't count the "head" again in the future. */
- monitor->oa_head_end = -1;
- }
-
- /* Next, count the contributions from the "middle" batches. These are
- * (batch begin, batch end) deltas while monitoring was active.
- */
- int last_snapshot;
- if (m->Ended)
- last_snapshot = monitor->oa_tail_start;
- else
- last_snapshot = brw->perfmon.bookend_snapshots;
-
- for (int s = monitor->oa_middle_start; s < last_snapshot; s += 2) {
- add_deltas(brw, monitor,
- bookend_buffer + snapshot_size * s,
- bookend_buffer + snapshot_size * (s + 1));
- }
-
- /* Finally, if the monitor has ended, we need to count the contributions of
- * the "tail" interval:
- * (start of the batch where monitoring ended, EndPerfMonitor snapshot)
- */
- if (m->Ended) {
- assert(monitor->oa_tail_start != -1);
- add_deltas(brw, monitor,
- bookend_buffer + snapshot_size * monitor->oa_tail_start,
- monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES /
- sizeof(uint32_t)));
- }
-
- drm_intel_bo_unmap(monitor->oa_bo);
-
- /* If the monitor has ended, then we've gathered all the results, and
- * can free the monitor's OA BO.
- */
- if (m->Ended) {
- drm_intel_bo_unreference(monitor->oa_bo);
- monitor->oa_bo = NULL;
-
- /* The monitor's OA result is now resolved. */
- DBG("Marking %d resolved - results gathered\n", m->Name);
- drop_from_unresolved_monitor_list(brw, monitor);
- }
-}
-
-/**
- * Handle running out of space in the bookend BO.
- *
- * When we run out of space in the bookend BO, we need to gather up partial
- * results for every unresolved monitor. This allows us to free the snapshot
- * data in bookend_bo, freeing up the space for reuse. We call this "wrapping."
- *
- * This will completely compute the result for any unresolved monitors that
- * have ended.
- */
-static void
-wrap_bookend_bo(struct brw_context *brw)
-{
- DBG("****Wrap bookend BO****\n");
- /* Note that wrapping will only occur at the start of a batch, since that's
- * where we reserve space. So the current batch won't reference bookend_bo
- * or any monitor BOs. This means we don't need to worry about
- * synchronization.
- *
- * Also, EndPerfMonitor guarantees that only monitors which span multiple
- * batches exist in the unresolved monitor list.
- */
- assert(brw->perfmon.oa_users > 0);
-
- drm_intel_bo_map(brw->perfmon.bookend_bo, false);
- uint32_t *bookend_buffer = brw->perfmon.bookend_bo->virtual;
- for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
- struct brw_perf_monitor_object *monitor = brw->perfmon.unresolved[i];
- struct gl_perf_monitor_object *m = &monitor->base;
-
- gather_oa_results(brw, monitor, bookend_buffer);
-
- if (m->Ended) {
- /* gather_oa_results() dropped the monitor from the unresolved list,
- * throwing our indices off by one.
- */
- --i;
- } else {
- /* When we create the new bookend_bo, snapshot #0 will be the
- * beginning of another "middle" BO.
- */
- monitor->oa_middle_start = 0;
- assert(monitor->oa_head_end == -1);
- assert(monitor->oa_tail_start == -1);
- }
- }
- drm_intel_bo_unmap(brw->perfmon.bookend_bo);
-
- brw->perfmon.bookend_snapshots = 0;
-}
-
-/* This is fairly arbitrary; the trade off is memory usage vs. extra overhead
- * from wrapping. On Gen7, 32768 should be enough for 128 snapshots before
- * wrapping (since each is 256 bytes).
- */
-#define BOOKEND_BO_SIZE_BYTES 32768
-
-/**
- * Check whether bookend_bo has space for a given number of snapshots.
- */
-static bool
-has_space_for_bookend_snapshots(struct brw_context *brw, int snapshots)
-{
- int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t);
-
- /* There are brw->perfmon.bookend_snapshots - 1 existing snapshots. */
- int total_snapshots = (brw->perfmon.bookend_snapshots - 1) + snapshots;
-
- return total_snapshots * snapshot_bytes < BOOKEND_BO_SIZE_BYTES;
-}
-
-/**
- * Write an OA counter snapshot to bookend_bo.
- */
-static void
-emit_bookend_snapshot(struct brw_context *brw)
-{
- int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t);
- int offset_in_bytes = brw->perfmon.bookend_snapshots * snapshot_bytes;
-
- emit_mi_report_perf_count(brw, brw->perfmon.bookend_bo, offset_in_bytes,
- REPORT_ID);
- ++brw->perfmon.bookend_snapshots;
-}
-
-/******************************************************************************/
-
-/**
- * Initialize a monitor to sane starting state; throw away old buffers.
- */
-static void
-reinitialize_perf_monitor(struct brw_context *brw,
- struct brw_perf_monitor_object *monitor)
-{
- if (monitor->oa_bo) {
- drm_intel_bo_unreference(monitor->oa_bo);
- monitor->oa_bo = NULL;
- }
-
- /* Since the results are now invalid, we don't need to hold on to any
- * snapshots in bookend_bo. The monitor is effectively "resolved."
- */
- drop_from_unresolved_monitor_list(brw, monitor);
-
- monitor->oa_head_end = -1;
- monitor->oa_middle_start = -1;
- monitor->oa_tail_start = -1;
-
- free(monitor->oa_results);
- monitor->oa_results = NULL;
-
- if (monitor->pipeline_stats_bo) {
- drm_intel_bo_unreference(monitor->pipeline_stats_bo);
- monitor->pipeline_stats_bo = NULL;
- }
-
- free(monitor->pipeline_stats_results);
- monitor->pipeline_stats_results = NULL;
-}
-
-/**
- * Driver hook for glBeginPerformanceMonitorAMD().
- */
-static GLboolean
-brw_begin_perf_monitor(struct gl_context *ctx,
- struct gl_perf_monitor_object *m)
-{
- struct brw_context *brw = brw_context(ctx);
- struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
-
- DBG("Begin(%d)\n", m->Name);
-
- reinitialize_perf_monitor(brw, monitor);
-
- if (monitor_needs_oa(brw, m)) {
- /* If the global OA bookend BO doesn't exist, allocate it. This should
- * only happen once, but we delay until BeginPerfMonitor time to avoid
- * wasting memory for contexts that don't use performance monitors.
- */
- if (!brw->perfmon.bookend_bo) {
- brw->perfmon.bookend_bo = drm_intel_bo_alloc(brw->bufmgr,
- "OA bookend BO",
- BOOKEND_BO_SIZE_BYTES, 64);
- }
-
- monitor->oa_bo =
- drm_intel_bo_alloc(brw->bufmgr, "perf. monitor OA bo", 4096, 64);
-#ifdef DEBUG
- /* Pre-filling the BO helps debug whether writes landed. */
- drm_intel_bo_map(monitor->oa_bo, true);
- memset((char *) monitor->oa_bo->virtual, 0xff, 4096);
- drm_intel_bo_unmap(monitor->oa_bo);
-#endif
-
- /* Allocate storage for accumulated OA counter values. */
- monitor->oa_results =
- calloc(brw->perfmon.entries_per_oa_snapshot, sizeof(uint32_t));
-
- /* If the OA counters aren't already on, enable them. */
- if (brw->perfmon.oa_users == 0) {
- /* Ensure the OACONTROL enable and snapshot land in the same batch. */
- int space = (MI_REPORT_PERF_COUNT_BATCH_DWORDS + 3) * 4;
- intel_batchbuffer_require_space(brw, space, RENDER_RING);
- start_oa_counters(brw);
- }
-
- /* Take a starting OA counter snapshot. */
- emit_mi_report_perf_count(brw, monitor->oa_bo, 0, REPORT_ID);
-
- monitor->oa_head_end = brw->perfmon.bookend_snapshots;
- monitor->oa_middle_start = brw->perfmon.bookend_snapshots + 1;
- monitor->oa_tail_start = -1;
-
- /* Add the monitor to the unresolved list. */
- add_to_unresolved_monitor_list(brw, monitor);
-
- ++brw->perfmon.oa_users;
- }
-
- if (monitor_needs_statistics_registers(brw, m)) {
- monitor->pipeline_stats_bo =
- drm_intel_bo_alloc(brw->bufmgr, "perf. monitor stats bo", 4096, 64);
-
- /* Take starting snapshots. */
- snapshot_statistics_registers(brw, monitor, 0);
- }
-
- return true;
-}
-
-/**
- * Driver hook for glEndPerformanceMonitorAMD().
- */
-static void
-brw_end_perf_monitor(struct gl_context *ctx,
- struct gl_perf_monitor_object *m)
-{
- struct brw_context *brw = brw_context(ctx);
- struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
-
- DBG("End(%d)\n", m->Name);
-
- if (monitor_needs_oa(brw, m)) {
- /* Take an ending OA counter snapshot. */
- emit_mi_report_perf_count(brw, monitor->oa_bo,
- SECOND_SNAPSHOT_OFFSET_IN_BYTES, REPORT_ID);
-
- --brw->perfmon.oa_users;
-
- if (brw->perfmon.oa_users == 0)
- stop_oa_counters(brw);
-
- if (monitor->oa_head_end == brw->perfmon.bookend_snapshots) {
- assert(monitor->oa_head_end != -1);
- /* We never actually wrote the snapshot for the end of the first batch
- * after BeginPerfMonitor. This means that monitoring was contained
- * entirely within a single batch, so we can ignore bookend_bo and
- * just compare the monitor's begin/end snapshots directly.
- */
- monitor->oa_head_end = -1;
- monitor->oa_middle_start = -1;
- monitor->oa_tail_start = -1;
-
- /* We can also mark it resolved since it won't depend on bookend_bo. */
- DBG("Marking %d resolved - entirely in one batch\n", m->Name);
- drop_from_unresolved_monitor_list(brw, monitor);
- } else {
- /* We've written at least one batch end snapshot, so the monitoring
- * spanned multiple batches. Mark which snapshot corresponds to the
- * start of the current batch.
- */
- monitor->oa_tail_start = brw->perfmon.bookend_snapshots - 1;
- }
- }
-
- if (monitor_needs_statistics_registers(brw, m)) {
- /* Take ending snapshots. */
- snapshot_statistics_registers(brw, monitor,
- SECOND_SNAPSHOT_OFFSET_IN_BYTES);
- }
-}
-
-/**
- * Reset a performance monitor, throwing away any results.
- */
-static void
-brw_reset_perf_monitor(struct gl_context *ctx,
- struct gl_perf_monitor_object *m)
-{
- struct brw_context *brw = brw_context(ctx);
- struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
-
- reinitialize_perf_monitor(brw, monitor);
-
- if (m->Active) {
- brw_begin_perf_monitor(ctx, m);
- }
-}
-
-/**
- * Is a performance monitor result available?
- */
-static GLboolean
-brw_is_perf_monitor_result_available(struct gl_context *ctx,
- struct gl_perf_monitor_object *m)
-{
- struct brw_context *brw = brw_context(ctx);
- struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
-
- bool oa_available = true;
- bool stats_available = true;
-
- if (monitor_needs_oa(brw, m)) {
- oa_available = !monitor->oa_bo ||
- (!drm_intel_bo_references(brw->batch.bo, monitor->oa_bo) &&
- !drm_intel_bo_busy(monitor->oa_bo));
- }
-
- if (monitor_needs_statistics_registers(brw, m)) {
- stats_available = !monitor->pipeline_stats_bo ||
- (!drm_intel_bo_references(brw->batch.bo, monitor->pipeline_stats_bo) &&
- !drm_intel_bo_busy(monitor->pipeline_stats_bo));
- }
-
- return oa_available && stats_available;
-}
-
-/**
- * Get the performance monitor result.
- */
-static void
-brw_get_perf_monitor_result(struct gl_context *ctx,
- struct gl_perf_monitor_object *m,
- GLsizei data_size,
- GLuint *data,
- GLint *bytes_written)
-{
- struct brw_context *brw = brw_context(ctx);
- struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
- const GLuint *const data_end = (GLuint *)((uint8_t *) data + data_size);
-
- DBG("GetResult(%d)\n", m->Name);
- brw_dump_perf_monitors(brw);
-
- /* This hook should only be called when results are available. */
- assert(m->Ended);
-
- /* Copy data to the supplied array (data).
- *
- * The output data format is: <group ID, counter ID, value> for each
- * active counter. The API allows counters to appear in any order.
- */
- GLsizei offset = 0;
-
- if (monitor_needs_oa(brw, m)) {
- /* Gather up the results from the BO, unless we already did due to the
- * bookend BO wrapping.
- */
- if (monitor->oa_bo) {
- /* Since the result is available, all the necessary snapshots will
- * have been written to the bookend BO. If other monitors are
- * active, the bookend BO may be busy or referenced by the current
- * batch, but only for writing snapshots beyond oa_tail_start,
- * which we don't care about.
- *
- * Using an unsynchronized mapping avoids stalling for an
- * indeterminate amount of time.
- */
- drm_intel_gem_bo_map_unsynchronized(brw->perfmon.bookend_bo);
-
- gather_oa_results(brw, monitor, brw->perfmon.bookend_bo->virtual);
-
- drm_intel_bo_unmap(brw->perfmon.bookend_bo);
- }
-
- for (int i = 0; i < brw->perfmon.entries_per_oa_snapshot; i++) {
- int group = OA_COUNTERS;
- int counter = brw->perfmon.oa_snapshot_layout[i];
-
- /* We always capture all the OA counters, but the application may
- * have only asked for a subset. Skip unwanted counters.
- */
- if (counter < 0 || !BITSET_TEST(m->ActiveCounters[group], counter))
- continue;
-
- if (data + offset + 3 <= data_end) {
- data[offset++] = group;
- data[offset++] = counter;
- data[offset++] = monitor->oa_results[i];
- }
- }
-
- clean_bookend_bo(brw);
- }
-
- if (monitor_needs_statistics_registers(brw, m)) {
- const int num_counters =
- ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters;
-
- if (!monitor->pipeline_stats_results) {
- gather_statistics_results(brw, monitor);
-
- /* Check if we did really get the results */
- if (!monitor->pipeline_stats_results) {
- if (bytes_written) {
- *bytes_written = 0;
- }
- return;
- }
- }
-
- for (int i = 0; i < num_counters; i++) {
- if (BITSET_TEST(m->ActiveCounters[PIPELINE_STATS_COUNTERS], i)) {
- if (data + offset + 4 <= data_end) {
- data[offset++] = PIPELINE_STATS_COUNTERS;
- data[offset++] = i;
- *((uint64_t *) (&data[offset])) = monitor->pipeline_stats_results[i];
- offset += 2;
- }
- }
- }
- }
-
- if (bytes_written)
- *bytes_written = offset * sizeof(uint32_t);
-}
-
-/**
- * Create a new performance monitor object.
- */
-static struct gl_perf_monitor_object *
-brw_new_perf_monitor(struct gl_context *ctx)
-{
- (void) ctx;
- return calloc(1, sizeof(struct brw_perf_monitor_object));
-}
-
-/**
- * Delete a performance monitor object.
- */
-static void
-brw_delete_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
-{
- struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
- DBG("Delete(%d)\n", m->Name);
- reinitialize_perf_monitor(brw_context(ctx), monitor);
- free(monitor);
-}
-
-/******************************************************************************/
-
-/**
- * Called at the start of every render ring batch.
- *
- * Enable OA counters and emit the "start of batchbuffer" bookend OA snapshot.
- * Since it's a new batch, there will be plenty of space for the commands.
- */
-void
-brw_perf_monitor_new_batch(struct brw_context *brw)
-{
- assert(brw->batch.ring == RENDER_RING);
- assert(brw->gen < 6 || USED_BATCH(brw->batch) == 0);
-
- if (brw->perfmon.oa_users == 0)
- return;
-
- start_oa_counters(brw);
-
- /* Make sure bookend_bo has enough space for a pair of snapshots.
- * If not, "wrap" the BO: gather up any results so far, and start from
- * the beginning of the buffer. Reserving a pair guarantees that wrapping
- * will only happen at the beginning of a batch, where it's safe to map BOs
- * (as the batch is empty and can't refer to any of them yet).
- */
- if (!has_space_for_bookend_snapshots(brw, 2))
- wrap_bookend_bo(brw);
-
- DBG("Bookend Begin Snapshot (%d)\n", brw->perfmon.bookend_snapshots);
- emit_bookend_snapshot(brw);
-}
-
-/**
- * Called at the end of every render ring batch.
- *
- * Emit the "end of batchbuffer" bookend OA snapshot and disable the counters.
- *
- * This relies on there being enough space in BATCH_RESERVED.
- */
-void
-brw_perf_monitor_finish_batch(struct brw_context *brw)
-{
- assert(brw->batch.ring == RENDER_RING);
-
- if (brw->perfmon.oa_users == 0)
- return;
-
- DBG("Bookend End Snapshot (%d)\n", brw->perfmon.bookend_snapshots);
-
- /* Not safe to wrap; should've reserved space already. */
- assert(has_space_for_bookend_snapshots(brw, 1));
-
- emit_bookend_snapshot(brw);
-
- stop_oa_counters(brw);
-}
-
-/******************************************************************************/
-
-void
-brw_init_performance_monitors(struct brw_context *brw)
-{
- struct gl_context *ctx = &brw->ctx;
-
- ctx->Driver.NewPerfMonitor = brw_new_perf_monitor;
- ctx->Driver.DeletePerfMonitor = brw_delete_perf_monitor;
- ctx->Driver.BeginPerfMonitor = brw_begin_perf_monitor;
- ctx->Driver.EndPerfMonitor = brw_end_perf_monitor;
- ctx->Driver.ResetPerfMonitor = brw_reset_perf_monitor;
- ctx->Driver.IsPerfMonitorResultAvailable = brw_is_perf_monitor_result_available;
- ctx->Driver.GetPerfMonitorResult = brw_get_perf_monitor_result;
-
- if (brw->gen == 5) {
- ctx->PerfMonitor.Groups = gen5_groups;
- ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen5_groups);
- brw->perfmon.oa_snapshot_layout = gen5_oa_snapshot_layout;
- brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen5_oa_snapshot_layout);
- } else if (brw->gen == 6) {
- ctx->PerfMonitor.Groups = gen6_groups;
- ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen6_groups);
- brw->perfmon.oa_snapshot_layout = gen6_oa_snapshot_layout;
- brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen6_oa_snapshot_layout);
- brw->perfmon.statistics_registers = gen6_statistics_register_addresses;
- } else if (brw->gen == 7) {
- ctx->PerfMonitor.Groups = gen7_groups;
- ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen7_groups);
- brw->perfmon.oa_snapshot_layout = gen7_oa_snapshot_layout;
- brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen7_oa_snapshot_layout);
- brw->perfmon.statistics_registers = gen7_statistics_register_addresses;
- }
-
- brw->perfmon.unresolved =
- ralloc_array(brw, struct brw_perf_monitor_object *, 1);
- brw->perfmon.unresolved_elements = 0;
- brw->perfmon.unresolved_array_size = 1;
-}
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index f136fae6cc3..65c27731cd2 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -175,8 +175,7 @@ do_batch_dump(struct brw_context *brw)
void
intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw)
{
- /* We may need to enable and snapshot OA counters. */
- brw_perf_monitor_new_batch(brw);
+ /* Un-used currently */
}
/**
@@ -211,9 +210,6 @@ brw_new_batch(struct brw_context *brw)
*/
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
brw_collect_and_report_shader_time(brw);
-
- if (INTEL_DEBUG & DEBUG_PERFMON)
- brw_dump_perf_monitors(brw);
}
/**
@@ -241,9 +237,6 @@ brw_finish_batch(struct brw_context *brw)
if (brw->gen >= 7)
gen7_restore_default_l3_config(brw);
- /* We may also need to snapshot and disable OA counters. */
- brw_perf_monitor_finish_batch(brw);
-
if (brw->is_haswell) {
/* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
* 3DSTATE_CC_STATE_POINTERS > "Note":
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index bbb794240c0..1ecefc19449 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -100,71 +100,6 @@ can_do_pipelined_register_writes(struct brw_context *brw)
return success;
}
-static bool
-can_write_oacontrol(struct brw_context *brw)
-{
- if (brw->gen < 6 || brw->gen >= 8)
- return false;
-
- static int result = -1;
- if (result != -1)
- return result;
-
- /* Set "Select Context ID" to a particular address (which is likely not a
- * context), but leave all counting disabled. This should be harmless.
- */
- const int expected_value = 0x31337000;
- const int offset = 110;
-
- uint32_t *data;
- /* Set a value in a BO to a known quantity. The workaround BO already
- * exists and doesn't contain anything important, so we may as well use it.
- */
- drm_intel_bo_map(brw->workaround_bo, true);
- data = brw->workaround_bo->virtual;
- data[offset] = 0xffffffff;
- drm_intel_bo_unmap(brw->workaround_bo);
-
- /* Write OACONTROL. */
- BEGIN_BATCH(3);
- OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
- OUT_BATCH(OACONTROL);
- OUT_BATCH(expected_value);
- ADVANCE_BATCH();
-
- brw_emit_mi_flush(brw);
-
- /* Save the register's value back to the buffer. */
- BEGIN_BATCH(3);
- OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
- OUT_BATCH(OACONTROL);
- OUT_RELOC(brw->workaround_bo,
- I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
- offset * sizeof(uint32_t));
- ADVANCE_BATCH();
-
- brw_emit_mi_flush(brw);
-
- /* Set OACONTROL back to zero (everything off). */
- BEGIN_BATCH(3);
- OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
- OUT_BATCH(OACONTROL);
- OUT_BATCH(0);
- ADVANCE_BATCH();
-
- intel_batchbuffer_flush(brw);
-
- /* Check whether the value got written. */
- drm_intel_bo_map(brw->workaround_bo, false);
- data = brw->workaround_bo->virtual;
- bool success = data[offset] == expected_value;
- drm_intel_bo_unmap(brw->workaround_bo);
-
- result = success;
-
- return success;
-}
-
/**
* Initializes potential list of extensions if ctx == NULL, or actually enables
* extensions for a context.
@@ -290,11 +225,6 @@ intelInitExtensions(struct gl_context *ctx)
ctx->Extensions.ARB_texture_query_levels = ctx->Const.GLSLVersion >= 130;
ctx->Extensions.ARB_texture_query_lod = true;
ctx->Extensions.EXT_timer_query = true;
-
- if (brw->gen == 5 || can_write_oacontrol(brw)) {
- ctx->Extensions.AMD_performance_monitor = true;
- ctx->Extensions.INTEL_performance_query = true;
- }
}
if (brw->gen >= 6) {