-rw-r--r--   src/mesa/drivers/dri/i965/Makefile.sources            |    1
-rw-r--r--   src/mesa/drivers/dri/i965/brw_context.c               |    4
-rw-r--r--   src/mesa/drivers/dri/i965/brw_context.h               |   43
-rw-r--r--   src/mesa/drivers/dri/i965/brw_performance_monitor.c   | 1471
-rw-r--r--   src/mesa/drivers/dri/i965/intel_batchbuffer.c         |    9
-rw-r--r--   src/mesa/drivers/dri/i965/intel_extensions.c          |   70
6 files changed, 1 insertion(+), 1597 deletions(-)
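This commit removes the i965 driver's GL_AMD_performance_monitor implementation: brw_performance_monitor.c itself, its hooks in context creation and batchbuffer handling, and the OACONTROL writability check that gated the extension. For orientation, the sketch below shows the client-side lifecycle that the deleted driver hooks (brw_new_perf_monitor, brw_begin_perf_monitor, brw_end_perf_monitor) served. It is a minimal sketch, not code from this tree: it assumes a context that still exposes the extension, prototypes available via GL_GLEXT_PROTOTYPES, and it simply enables every counter in the first group with no error checking.

#define GL_GLEXT_PROTOTYPES
#include <GL/gl.h>
#include <GL/glext.h>

/* Enumerate the first counter group, enable all of its counters, and
 * bracket a workload with Begin/End.  glBeginPerfMonitorAMD() is the
 * entry point that reached the brw_begin_perf_monitor() hook removed
 * in the patch below; counts are clamped to 64 purely to keep the
 * sketch short.
 */
static GLuint start_and_stop_monitor(void)
{
   GLint num_groups = 0;
   GLuint group = 0;
   glGetPerfMonitorGroupsAMD(&num_groups, 1, &group);

   GLint num_counters = 0, max_active = 0;
   glGetPerfMonitorCountersAMD(group, &num_counters, &max_active, 0, NULL);
   if (num_counters > 64)
      num_counters = 64;

   GLuint counters[64];
   glGetPerfMonitorCountersAMD(group, &num_counters, &max_active,
                               num_counters, counters);

   GLuint monitor;
   glGenPerfMonitorsAMD(1, &monitor);
   glSelectPerfMonitorCountersAMD(monitor, GL_TRUE, group,
                                  num_counters, counters);

   glBeginPerfMonitorAMD(monitor);
   /* ... issue the draw calls to be measured ... */
   glEndPerfMonitorAMD(monitor);
   return monitor;
}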
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index 0a7ba1bb34d..dd546826d19 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -134,7 +134,6 @@ i965_FILES = \ brw_multisample_state.h \ brw_nir_uniforms.cpp \ brw_object_purgeable.c \ - brw_performance_monitor.c \ brw_pipe_control.c \ brw_program.c \ brw_program.h \ diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 4ca77c789b4..45490a0f5cf 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -1136,10 +1136,6 @@ brwCreateContext(gl_api api, _mesa_initialize_dispatch_tables(ctx); _mesa_initialize_vbo_vtxfmt(ctx); - if (ctx->Extensions.AMD_performance_monitor) { - brw_init_performance_monitors(brw); - } - vbo_use_buffer_objects(ctx); vbo_always_unmap_buffers(ctx); diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index a2817131a50..2dd2686e033 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1170,43 +1170,6 @@ struct brw_context bool supported; } predicate; - struct { - /** A map from pipeline statistics counter IDs to MMIO addresses. */ - const int *statistics_registers; - - /** The number of active monitors using OA counters. */ - unsigned oa_users; - - /** - * A buffer object storing OA counter snapshots taken at the start and - * end of each batch (creating "bookends" around the batch). - */ - drm_intel_bo *bookend_bo; - - /** The number of snapshots written to bookend_bo. */ - int bookend_snapshots; - - /** - * An array of monitors whose results haven't yet been assembled based on - * the data in buffer objects. - * - * These may be active, or have already ended. However, the results - * have not been requested. - */ - struct brw_perf_monitor_object **unresolved; - int unresolved_elements; - int unresolved_array_size; - - /** - * Mapping from a uint32_t offset within an OA snapshot to the ID of - * the counter which MI_REPORT_PERF_COUNT stores there. - */ - const int *oa_snapshot_layout; - - /** Number of 32-bit entries in a hardware counter snapshot. 
*/ - int entries_per_oa_snapshot; - } perfmon; - int num_atoms[BRW_NUM_PIPELINES]; const struct brw_tracked_state render_atoms[76]; const struct brw_tracked_state compute_atoms[11]; @@ -1522,12 +1485,6 @@ bool brw_render_target_supported(struct brw_context *brw, struct gl_renderbuffer *rb); uint32_t brw_depth_format(struct brw_context *brw, mesa_format format); -/* brw_performance_monitor.c */ -void brw_init_performance_monitors(struct brw_context *brw); -void brw_dump_perf_monitors(struct brw_context *brw); -void brw_perf_monitor_new_batch(struct brw_context *brw); -void brw_perf_monitor_finish_batch(struct brw_context *brw); - /* intel_buffer_objects.c */ int brw_bo_map(struct brw_context *brw, drm_intel_bo *bo, int write_enable, const char *bo_name); diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c deleted file mode 100644 index f8e50e10fa3..00000000000 --- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c +++ /dev/null @@ -1,1471 +0,0 @@ -/* - * Copyright © 2013 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -/** - * \file brw_performance_monitor.c - * - * Implementation of the GL_AMD_performance_monitor extension. - * - * On Gen5+ hardware, we have two sources of performance counter data: - * the Observability Architecture counters (MI_REPORT_PERF_COUNT), and - * the Pipeline Statistics Registers. We expose both sets of raw data, - * as well as some useful processed values. - * - * The Observability Architecture (OA) counters for Gen6+ are documented - * in a separate document from the rest of the PRMs. It is available at: - * https://01.org/linuxgraphics/documentation/driver-documentation-prms - * => 2013 Intel Core Processor Family => Observability Performance Counters - * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell.) - * - * On Ironlake, the OA counters were called "CHAPS" counters. Sadly, no public - * documentation exists; our implementation is based on the source code for the - * intel_perf_counters utility (which is available as part of intel-gpu-tools). 
- */ - -#include <limits.h> - -#include "util/bitset.h" -#include "main/hash.h" -#include "main/macros.h" -#include "main/mtypes.h" -#include "main/performance_monitor.h" - -#include "util/ralloc.h" - -#include "brw_context.h" -#include "brw_defines.h" -#include "intel_batchbuffer.h" - -#define FILE_DEBUG_FLAG DEBUG_PERFMON - -/** - * i965 representation of a performance monitor object. - */ -struct brw_perf_monitor_object -{ - /** The base class. */ - struct gl_perf_monitor_object base; - - /** - * BO containing OA counter snapshots at monitor Begin/End time. - */ - drm_intel_bo *oa_bo; - - /** Indexes into bookend_bo (snapshot numbers) for various segments. */ - int oa_head_end; - int oa_middle_start; - int oa_tail_start; - - /** - * Storage for OA results accumulated so far. - * - * An array indexed by the counter ID in the OA_COUNTERS group. - * - * When we run out of space in bookend_bo, we compute the results so far - * and add them to the value stored here. Then, we can discard bookend_bo. - */ - uint32_t *oa_results; - - /** - * BO containing starting and ending snapshots for any active pipeline - * statistics counters. - */ - drm_intel_bo *pipeline_stats_bo; - - /** - * Storage for final pipeline statistics counter results. - */ - uint64_t *pipeline_stats_results; -}; - -/** Downcasting convenience macro. */ -static inline struct brw_perf_monitor_object * -brw_perf_monitor(struct gl_perf_monitor_object *m) -{ - return (struct brw_perf_monitor_object *) m; -} - -#define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048 - -/* A random value used to ensure we're getting valid snapshots. */ -#define REPORT_ID 0xd2e9c607 - -/******************************************************************************/ - -#define COUNTER(name) \ - { \ - .Name = name, \ - .Type = GL_UNSIGNED_INT, \ - .Minimum = { .u32 = 0 }, \ - .Maximum = { .u32 = ~0 }, \ - } - -#define COUNTER64(name) \ - { \ - .Name = name, \ - .Type = GL_UNSIGNED_INT64_AMD, \ - .Minimum = { .u64 = 0 }, \ - .Maximum = { .u64 = ~0 }, \ - } - -#define GROUP(name, max_active, counter_list) \ - { \ - .Name = name, \ - .MaxActiveCounters = max_active, \ - .Counters = counter_list, \ - .NumCounters = ARRAY_SIZE(counter_list), \ - } - -/** Performance Monitor Group IDs */ -enum brw_counter_groups { - OA_COUNTERS, /* Observability Architecture (MI_REPORT_PERF_COUNT) Counters */ - PIPELINE_STATS_COUNTERS, /* Pipeline Statistics Register Counters */ -}; - -/** - * Ironlake: - * @{ - * - * The list of CHAPS counters unfortunately does not appear in any public - * documentation, but is available by reading the source code for the - * intel_perf_counters utility (shipped as part of intel-gpu-tools). 
- */ -static const struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = { - COUNTER("cycles the CS unit is starved"), - COUNTER("cycles the CS unit is stalled"), - COUNTER("cycles the VF unit is starved"), - COUNTER("cycles the VF unit is stalled"), - COUNTER("cycles the VS unit is starved"), - COUNTER("cycles the VS unit is stalled"), - COUNTER("cycles the GS unit is starved"), - COUNTER("cycles the GS unit is stalled"), - COUNTER("cycles the CL unit is starved"), - COUNTER("cycles the CL unit is stalled"), - COUNTER("cycles the SF unit is starved"), - COUNTER("cycles the SF unit is stalled"), - COUNTER("cycles the WZ unit is starved"), - COUNTER("cycles the WZ unit is stalled"), - COUNTER("Z buffer read/write"), - COUNTER("cycles each EU was active"), - COUNTER("cycles each EU was suspended"), - COUNTER("cycles threads loaded all EUs"), - COUNTER("cycles filtering active"), - COUNTER("cycles PS threads executed"), - COUNTER("subspans written to RC"), - COUNTER("bytes read for texture reads"), - COUNTER("texels returned from sampler"), - COUNTER("polygons not culled"), - COUNTER("clocks MASF has valid message"), - COUNTER("64b writes/reads from RC"), - COUNTER("reads on dataport"), - COUNTER("clocks MASF has valid msg not consumed by sampler"), - COUNTER("cycles any EU is stalled for math"), -}; - -static const int gen5_oa_snapshot_layout[] = -{ - -1, /* Report ID */ - -1, /* TIMESTAMP (64-bit) */ - -1, /* ...second half... */ - 0, /* cycles the CS unit is starved */ - 1, /* cycles the CS unit is stalled */ - 2, /* cycles the VF unit is starved */ - 3, /* cycles the VF unit is stalled */ - 4, /* cycles the VS unit is starved */ - 5, /* cycles the VS unit is stalled */ - 6, /* cycles the GS unit is starved */ - 7, /* cycles the GS unit is stalled */ - 8, /* cycles the CL unit is starved */ - 9, /* cycles the CL unit is stalled */ - 10, /* cycles the SF unit is starved */ - 11, /* cycles the SF unit is stalled */ - 12, /* cycles the WZ unit is starved */ - 13, /* cycles the WZ unit is stalled */ - 14, /* Z buffer read/write */ - 15, /* cycles each EU was active */ - 16, /* cycles each EU was suspended */ - 17, /* cycles threads loaded all EUs */ - 18, /* cycles filtering active */ - 19, /* cycles PS threads executed */ - 20, /* subspans written to RC */ - 21, /* bytes read for texture reads */ - 22, /* texels returned from sampler */ - 23, /* polygons not culled */ - 24, /* clocks MASF has valid message */ - 25, /* 64b writes/reads from RC */ - 26, /* reads on dataport */ - 27, /* clocks MASF has valid msg not consumed by sampler */ - 28, /* cycles any EU is stalled for math */ -}; - -static const struct gl_perf_monitor_group gen5_groups[] = { - [OA_COUNTERS] = GROUP("CHAPS Counters", INT_MAX, gen5_raw_chaps_counters), - /* Our pipeline statistics counter handling requires hardware contexts. */ -}; -/** @} */ - -/** - * Sandybridge: - * @{ - * - * A few of the counters here (A17-A20) are not included in the latest - * documentation, but are described in the Ironlake PRM (which strangely - * documents Sandybridge's performance counter system, not Ironlake's). - * It's unclear whether they work or not; empirically, they appear to. - */ - -/** - * Aggregating counters A0-A28: - */ -static const struct gl_perf_monitor_counter gen6_raw_oa_counters[] = { - /* A0: 0 */ COUNTER("Aggregated Core Array Active"), - /* A1: 1 */ COUNTER("Aggregated Core Array Stalled"), - /* A2: 2 */ COUNTER("Vertex Shader Active Time"), - /* A3: Not actually hooked up on Sandybridge. 
*/ - /* A4: 3 */ COUNTER("Vertex Shader Stall Time - Core Stall"), - /* A5: 4 */ COUNTER("# VS threads loaded"), - /* A6: 5 */ COUNTER("Vertex Shader Ready but not running Time"), - /* A7: 6 */ COUNTER("Geometry Shader Active Time"), - /* A8: Not actually hooked up on Sandybridge. */ - /* A9: 7 */ COUNTER("Geometry Shader Stall Time - Core Stall"), - /* A10: 8 */ COUNTER("# GS threads loaded"), - /* A11: 9 */ COUNTER("Geometry Shader Ready but not running Time"), - /* A12: 10 */ COUNTER("Pixel Shader Active Time"), - /* A13: Not actually hooked up on Sandybridge. */ - /* A14: 11 */ COUNTER("Pixel Shader Stall Time - Core Stall"), - /* A15: 12 */ COUNTER("# PS threads loaded"), - /* A16: 13 */ COUNTER("Pixel Shader Ready but not running Time"), - /* A17: 14 */ COUNTER("Early Z Test Pixels Passing"), - /* A18: 15 */ COUNTER("Early Z Test Pixels Failing"), - /* A19: 16 */ COUNTER("Early Stencil Test Pixels Passing"), - /* A20: 17 */ COUNTER("Early Stencil Test Pixels Failing"), - /* A21: 18 */ COUNTER("Pixel Kill Count"), - /* A22: 19 */ COUNTER("Alpha Test Pixels Failed"), - /* A23: 20 */ COUNTER("Post PS Stencil Pixels Failed"), - /* A24: 21 */ COUNTER("Post PS Z buffer Pixels Failed"), - /* A25: 22 */ COUNTER("Pixels/samples Written in the frame buffer"), - /* A26: 23 */ COUNTER("GPU Busy"), - /* A27: 24 */ COUNTER("CL active and not stalled"), - /* A28: 25 */ COUNTER("SF active and stalled"), -}; - -/** - * Sandybridge: Counter Select = 001 - * A0 A1 A2 A3 A4 TIMESTAMP RPT_ID - * A5 A6 A7 A8 A9 A10 A11 A12 - * A13 A14 A15 A16 A17 A18 A19 A20 - * A21 A22 A23 A24 A25 A26 A27 A28 - * - * (Yes, this is a strange order.) We also have to remap for missing counters. - */ -static const int gen6_oa_snapshot_layout[] = -{ - -1, /* Report ID */ - -1, /* TIMESTAMP (64-bit) */ - -1, /* ...second half... 
*/ - 3, /* A4: Vertex Shader Stall Time - Core Stall */ - -1, /* A3: (not available) */ - 2, /* A2: Vertex Shader Active Time */ - 1, /* A1: Aggregated Core Array Stalled */ - 0, /* A0: Aggregated Core Array Active */ - 10, /* A12: Pixel Shader Active Time */ - 9, /* A11: Geometry Shader ready but not running Time */ - 8, /* A10: # GS threads loaded */ - 7, /* A9: Geometry Shader Stall Time - Core Stall */ - -1, /* A8: (not available) */ - 6, /* A7: Geometry Shader Active Time */ - 5, /* A6: Vertex Shader ready but not running Time */ - 4, /* A5: # VS Threads Loaded */ - 17, /* A20: Early Stencil Test Pixels Failing */ - 16, /* A19: Early Stencil Test Pixels Passing */ - 15, /* A18: Early Z Test Pixels Failing */ - 14, /* A17: Early Z Test Pixels Passing */ - 13, /* A16: Pixel Shader ready but not running Time */ - 12, /* A15: # PS threads loaded */ - 11, /* A14: Pixel Shader Stall Time - Core Stall */ - -1, /* A13: (not available) */ - 25, /* A28: SF active and stalled */ - 24, /* A27: CL active and not stalled */ - 23, /* A26: GPU Busy */ - 22, /* A25: Pixels/samples Written in the frame buffer */ - 21, /* A24: Post PS Z buffer Pixels Failed */ - 20, /* A23: Post PS Stencil Pixels Failed */ - 19, /* A22: Alpha Test Pixels Failed */ - 18, /* A21: Pixel Kill Count */ -}; - -static const struct gl_perf_monitor_counter gen6_statistics_counters[] = { - COUNTER64("IA_VERTICES_COUNT"), - COUNTER64("IA_PRIMITIVES_COUNT"), - COUNTER64("VS_INVOCATION_COUNT"), - COUNTER64("GS_INVOCATION_COUNT"), - COUNTER64("GS_PRIMITIVES_COUNT"), - COUNTER64("CL_INVOCATION_COUNT"), - COUNTER64("CL_PRIMITIVES_COUNT"), - COUNTER64("PS_INVOCATION_COUNT"), - COUNTER64("PS_DEPTH_COUNT"), - COUNTER64("SO_NUM_PRIMS_WRITTEN"), - COUNTER64("SO_PRIM_STORAGE_NEEDED"), -}; - -/** MMIO register addresses for each pipeline statistics counter. 
*/ -static const int gen6_statistics_register_addresses[] = { - IA_VERTICES_COUNT, - IA_PRIMITIVES_COUNT, - VS_INVOCATION_COUNT, - GS_INVOCATION_COUNT, - GS_PRIMITIVES_COUNT, - CL_INVOCATION_COUNT, - CL_PRIMITIVES_COUNT, - PS_INVOCATION_COUNT, - PS_DEPTH_COUNT, - GEN6_SO_NUM_PRIMS_WRITTEN, - GEN6_SO_PRIM_STORAGE_NEEDED, -}; - -static const struct gl_perf_monitor_group gen6_groups[] = { - GROUP("Observability Architecture Counters", INT_MAX, gen6_raw_oa_counters), - GROUP("Pipeline Statistics Registers", INT_MAX, gen6_statistics_counters), -}; -/** @} */ - -/** - * Ivybridge/Baytrail/Haswell: - * @{ - */ -static const struct gl_perf_monitor_counter gen7_raw_oa_counters[] = { - COUNTER("Aggregated Core Array Active"), - COUNTER("Aggregated Core Array Stalled"), - COUNTER("Vertex Shader Active Time"), - COUNTER("Vertex Shader Stall Time - Core Stall"), - COUNTER("# VS threads loaded"), - COUNTER("Hull Shader Active Time"), - COUNTER("Hull Shader Stall Time - Core Stall"), - COUNTER("# HS threads loaded"), - COUNTER("Domain Shader Active Time"), - COUNTER("Domain Shader Stall Time - Core Stall"), - COUNTER("# DS threads loaded"), - COUNTER("Compute Shader Active Time"), - COUNTER("Compute Shader Stall Time - Core Stall"), - COUNTER("# CS threads loaded"), - COUNTER("Geometry Shader Active Time"), - COUNTER("Geometry Shader Stall Time - Core Stall"), - COUNTER("# GS threads loaded"), - COUNTER("Pixel Shader Active Time"), - COUNTER("Pixel Shader Stall Time - Core Stall"), - COUNTER("# PS threads loaded"), - COUNTER("HiZ Fast Z Test Pixels Passing"), - COUNTER("HiZ Fast Z Test Pixels Failing"), - COUNTER("Slow Z Test Pixels Passing"), - COUNTER("Slow Z Test Pixels Failing"), - COUNTER("Pixel Kill Count"), - COUNTER("Alpha Test Pixels Failed"), - COUNTER("Post PS Stencil Pixels Failed"), - COUNTER("Post PS Z buffer Pixels Failed"), - COUNTER("3D/GPGPU Render Target Writes"), - COUNTER("Render Engine Busy"), - COUNTER("VS bottleneck"), - COUNTER("GS bottleneck"), -}; - -/** - * Ivybridge/Baytrail/Haswell: Counter Select = 101 - * A4 A3 A2 A1 A0 TIMESTAMP ReportID - * A12 A11 A10 A9 A8 A7 A6 A5 - * A20 A19 A18 A17 A16 A15 A14 A13 - * A28 A27 A26 A25 A24 A23 A22 A21 - * A36 A35 A34 A33 A32 A31 A30 A29 - * A44 A43 A42 A41 A40 A39 A38 A37 - * B7 B6 B5 B4 B3 B2 B1 B0 - * Rsv Rsv Rsv Rsv Rsv Rsv Rsv Rsv - */ -static const int gen7_oa_snapshot_layout[] = -{ - -1, /* Report ID */ - -1, /* TIMESTAMP (64-bit) */ - -1, /* ...second half... 
*/ - 0, /* A0: Aggregated Core Array Active */ - 1, /* A1: Aggregated Core Array Stalled */ - 2, /* A2: Vertex Shader Active Time */ - -1, /* A3: Reserved */ - 3, /* A4: Vertex Shader Stall Time - Core Stall */ - 4, /* A5: # VS threads loaded */ - -1, /* A6: Reserved */ - 5, /* A7: Hull Shader Active Time */ - -1, /* A8: Reserved */ - 6, /* A9: Hull Shader Stall Time - Core Stall */ - 7, /* A10: # HS threads loaded */ - -1, /* A11: Reserved */ - 8, /* A12: Domain Shader Active Time */ - -1, /* A13: Reserved */ - 9, /* A14: Domain Shader Stall Time - Core Stall */ - 10, /* A15: # DS threads loaded */ - -1, /* A16: Reserved */ - 11, /* A17: Compute Shader Active Time */ - -1, /* A18: Reserved */ - 12, /* A19: Compute Shader Stall Time - Core Stall */ - 13, /* A20: # CS threads loaded */ - -1, /* A21: Reserved */ - 14, /* A22: Geometry Shader Active Time */ - -1, /* A23: Reserved */ - 15, /* A24: Geometry Shader Stall Time - Core Stall */ - 16, /* A25: # GS threads loaded */ - -1, /* A26: Reserved */ - 17, /* A27: Pixel Shader Active Time */ - -1, /* A28: Reserved */ - 18, /* A29: Pixel Shader Stall Time - Core Stall */ - 19, /* A30: # PS threads loaded */ - -1, /* A31: Reserved */ - 20, /* A32: HiZ Fast Z Test Pixels Passing */ - 21, /* A33: HiZ Fast Z Test Pixels Failing */ - 22, /* A34: Slow Z Test Pixels Passing */ - 23, /* A35: Slow Z Test Pixels Failing */ - 24, /* A36: Pixel Kill Count */ - 25, /* A37: Alpha Test Pixels Failed */ - 26, /* A38: Post PS Stencil Pixels Failed */ - 27, /* A39: Post PS Z buffer Pixels Failed */ - 28, /* A40: 3D/GPGPU Render Target Writes */ - 29, /* A41: Render Engine Busy */ - 30, /* A42: VS bottleneck */ - 31, /* A43: GS bottleneck */ - -1, /* A44: Reserved */ - -1, /* B0 */ - -1, /* B1 */ - -1, /* B2 */ - -1, /* B3 */ - -1, /* B4 */ - -1, /* B5 */ - -1, /* B6 */ - -1, /* B7 */ - -1, /* Reserved */ - -1, /* Reserved */ - -1, /* Reserved */ - -1, /* Reserved */ - -1, /* Reserved */ - -1, /* Reserved */ - -1, /* Reserved */ - -1, /* Reserved */ -}; - -static const struct gl_perf_monitor_counter gen7_statistics_counters[] = { - COUNTER64("IA_VERTICES_COUNT"), - COUNTER64("IA_PRIMITIVES_COUNT"), - COUNTER64("VS_INVOCATION_COUNT"), - COUNTER64("HS_INVOCATION_COUNT"), - COUNTER64("DS_INVOCATION_COUNT"), - COUNTER64("GS_INVOCATION_COUNT"), - COUNTER64("GS_PRIMITIVES_COUNT"), - COUNTER64("CL_INVOCATION_COUNT"), - COUNTER64("CL_PRIMITIVES_COUNT"), - COUNTER64("PS_INVOCATION_COUNT"), - COUNTER64("PS_DEPTH_COUNT"), - COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 0)"), - COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 1)"), - COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 2)"), - COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 3)"), - COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 0)"), - COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 1)"), - COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 2)"), - COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 3)"), -}; - -/** MMIO register addresses for each pipeline statistics counter. 
*/ -static const int gen7_statistics_register_addresses[] = { - IA_VERTICES_COUNT, - IA_PRIMITIVES_COUNT, - VS_INVOCATION_COUNT, - HS_INVOCATION_COUNT, - DS_INVOCATION_COUNT, - GS_INVOCATION_COUNT, - GS_PRIMITIVES_COUNT, - CL_INVOCATION_COUNT, - CL_PRIMITIVES_COUNT, - PS_INVOCATION_COUNT, - PS_DEPTH_COUNT, - GEN7_SO_NUM_PRIMS_WRITTEN(0), - GEN7_SO_NUM_PRIMS_WRITTEN(1), - GEN7_SO_NUM_PRIMS_WRITTEN(2), - GEN7_SO_NUM_PRIMS_WRITTEN(3), - GEN7_SO_PRIM_STORAGE_NEEDED(0), - GEN7_SO_PRIM_STORAGE_NEEDED(1), - GEN7_SO_PRIM_STORAGE_NEEDED(2), - GEN7_SO_PRIM_STORAGE_NEEDED(3), -}; - -static const struct gl_perf_monitor_group gen7_groups[] = { - GROUP("Observability Architecture Counters", INT_MAX, gen7_raw_oa_counters), - GROUP("Pipeline Statistics Registers", INT_MAX, gen7_statistics_counters), -}; -/** @} */ - -/******************************************************************************/ - -static GLboolean brw_is_perf_monitor_result_available(struct gl_context *, struct gl_perf_monitor_object *); - -static void -dump_perf_monitor_callback(GLuint name, void *monitor_void, void *brw_void) -{ - struct brw_context *brw = brw_void; - struct gl_context *ctx = brw_void; - struct gl_perf_monitor_object *m = monitor_void; - struct brw_perf_monitor_object *monitor = monitor_void; - - const char *resolved = ""; - for (int i = 0; i < brw->perfmon.unresolved_elements; i++) { - if (brw->perfmon.unresolved[i] == monitor) { - resolved = "Unresolved"; - break; - } - } - - DBG("%4d %-7s %-6s %-10s %-11s <%3d, %3d, %3d> %-6s %-9s\n", - name, - m->Active ? "Active" : "", - m->Ended ? "Ended" : "", - resolved, - brw_is_perf_monitor_result_available(ctx, m) ? "Available" : "", - monitor->oa_head_end, - monitor->oa_middle_start, - monitor->oa_tail_start, - monitor->oa_bo ? "OA BO" : "", - monitor->pipeline_stats_bo ? "Stats BO" : ""); -} - -void -brw_dump_perf_monitors(struct brw_context *brw) -{ - struct gl_context *ctx = &brw->ctx; - DBG("Monitors: (OA users = %d)\n", brw->perfmon.oa_users); - _mesa_HashWalk(ctx->PerfMonitor.Monitors, dump_perf_monitor_callback, brw); -} - -/******************************************************************************/ - -static bool -monitor_needs_statistics_registers(struct brw_context *brw, - struct gl_perf_monitor_object *m) -{ - return brw->gen >= 6 && m->ActiveGroups[PIPELINE_STATS_COUNTERS]; -} - -/** - * Take a snapshot of any monitored pipeline statistics counters. - */ -static void -snapshot_statistics_registers(struct brw_context *brw, - struct brw_perf_monitor_object *monitor, - uint32_t offset) -{ - struct gl_context *ctx = &brw->ctx; - const int group = PIPELINE_STATS_COUNTERS; - const int num_counters = ctx->PerfMonitor.Groups[group].NumCounters; - - brw_emit_mi_flush(brw); - - for (int i = 0; i < num_counters; i++) { - if (BITSET_TEST(monitor->base.ActiveCounters[group], i)) { - assert(ctx->PerfMonitor.Groups[group].Counters[i].Type == - GL_UNSIGNED_INT64_AMD); - - brw_store_register_mem64(brw, monitor->pipeline_stats_bo, - brw->perfmon.statistics_registers[i], - offset + i * sizeof(uint64_t)); - } - } -} - -/** - * Gather results from pipeline_stats_bo, storing the final values. - * - * This allows us to free pipeline_stats_bo (which is 4K) in favor of a much - * smaller array of final results. 
- */ -static void -gather_statistics_results(struct brw_context *brw, - struct brw_perf_monitor_object *monitor) -{ - struct gl_context *ctx = &brw->ctx; - const int num_counters = - ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters; - - monitor->pipeline_stats_results = calloc(num_counters, sizeof(uint64_t)); - if (monitor->pipeline_stats_results == NULL) { - _mesa_error_no_memory(__func__); - return; - } - - drm_intel_bo_map(monitor->pipeline_stats_bo, false); - uint64_t *start = monitor->pipeline_stats_bo->virtual; - uint64_t *end = start + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint64_t)); - - for (int i = 0; i < num_counters; i++) { - monitor->pipeline_stats_results[i] = end[i] - start[i]; - } - drm_intel_bo_unmap(monitor->pipeline_stats_bo); - drm_intel_bo_unreference(monitor->pipeline_stats_bo); - monitor->pipeline_stats_bo = NULL; -} - -/******************************************************************************/ - -static bool -monitor_needs_oa(struct brw_context *brw, - struct gl_perf_monitor_object *m) -{ - return m->ActiveGroups[OA_COUNTERS]; -} - -/** - * Enable the Observability Architecture counters by whacking OACONTROL. - */ -static void -start_oa_counters(struct brw_context *brw) -{ - unsigned counter_format; - - /* Pick the counter format which gives us all the counters. */ - switch (brw->gen) { - case 5: - return; /* Ironlake counters are always running. */ - case 6: - counter_format = 0b001; - break; - case 7: - counter_format = 0b101; - break; - default: - unreachable("Tried to enable OA counters on an unsupported generation."); - } - - BEGIN_BATCH(3); - OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); - OUT_BATCH(OACONTROL); - OUT_BATCH(counter_format << OACONTROL_COUNTER_SELECT_SHIFT | - OACONTROL_ENABLE_COUNTERS); - ADVANCE_BATCH(); -} - -/** - * Disable OA counters. - */ -static void -stop_oa_counters(struct brw_context *brw) -{ - /* Ironlake counters never stop. */ - if (brw->gen == 5) - return; - - BEGIN_BATCH(3); - OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); - OUT_BATCH(OACONTROL); - OUT_BATCH(0); - ADVANCE_BATCH(); -} - -/** - * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot, - * including the required PIPE_CONTROL flushes. - * - * Sandybridge is the worst case scenario: brw_emit_mi_flush expands to four - * PIPE_CONTROLs which are 5 DWords each. We have to flush before and after - * MI_REPORT_PERF_COUNT, so multiply by two. Finally, add the 3 DWords for - * MI_REPORT_PERF_COUNT itself. - */ -#define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (4 * 5) + 3) - -/** - * Emit an MI_REPORT_PERF_COUNT command packet. - * - * This writes the current OA counter values to buffer. - */ -static void -emit_mi_report_perf_count(struct brw_context *brw, - drm_intel_bo *bo, - uint32_t offset_in_bytes, - uint32_t report_id) -{ - assert(offset_in_bytes % 64 == 0); - - /* Make sure the commands to take a snapshot fits in a single batch. */ - intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4, - RENDER_RING); - int batch_used = USED_BATCH(brw->batch); - - /* Reports apparently don't always get written unless we flush first. */ - brw_emit_mi_flush(brw); - - if (brw->gen == 5) { - /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all - * the counters. The report ID is ignored in the second set. 
- */ - BEGIN_BATCH(6); - OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_0); - OUT_RELOC(bo, - I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - offset_in_bytes); - OUT_BATCH(report_id); - - OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_1); - OUT_RELOC(bo, - I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - offset_in_bytes + 64); - OUT_BATCH(report_id); - ADVANCE_BATCH(); - } else if (brw->gen == 6) { - BEGIN_BATCH(3); - OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT); - OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - offset_in_bytes | MI_COUNTER_ADDRESS_GTT); - OUT_BATCH(report_id); - ADVANCE_BATCH(); - } else if (brw->gen == 7) { - BEGIN_BATCH(3); - OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT); - OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - offset_in_bytes); - OUT_BATCH(report_id); - ADVANCE_BATCH(); - } else { - unreachable("Unsupported generation for performance counters."); - } - - /* Reports apparently don't always get written unless we flush after. */ - brw_emit_mi_flush(brw); - - (void) batch_used; - assert(USED_BATCH(brw->batch) - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4); -} - -/** - * Add a monitor to the global list of "unresolved monitors." - * - * Monitors are "unresolved" if they refer to OA counter snapshots in - * bookend_bo. Results (even partial ones) must be gathered for all - * unresolved monitors before it's safe to discard bookend_bo. - */ -static void -add_to_unresolved_monitor_list(struct brw_context *brw, - struct brw_perf_monitor_object *monitor) -{ - if (brw->perfmon.unresolved_elements >= - brw->perfmon.unresolved_array_size) { - brw->perfmon.unresolved_array_size *= 2; - brw->perfmon.unresolved = reralloc(brw, brw->perfmon.unresolved, - struct brw_perf_monitor_object *, - brw->perfmon.unresolved_array_size); - } - - brw->perfmon.unresolved[brw->perfmon.unresolved_elements++] = monitor; -} - -/** - * If possible, throw away the contents of bookend BO. - * - * When all monitoring stops, and no monitors need data from bookend_bo to - * compute results, we can discard it and start writing snapshots at the - * beginning again. This helps reduce the amount of buffer wraparound. - */ -static void -clean_bookend_bo(struct brw_context *brw) -{ - if (brw->perfmon.unresolved_elements == 0) { - DBG("***Resetting bookend snapshots to 0\n"); - brw->perfmon.bookend_snapshots = 0; - } -} - -/** - * Remove a monitor from the global list of "unresolved monitors." - * - * This can happen when: - * - We finish computing a completed monitor's results. - * - We discard unwanted monitor results. - * - A monitor's results can be computed without relying on bookend_bo. - */ -static void -drop_from_unresolved_monitor_list(struct brw_context *brw, - struct brw_perf_monitor_object *monitor) -{ - for (int i = 0; i < brw->perfmon.unresolved_elements; i++) { - if (brw->perfmon.unresolved[i] == monitor) { - int last_elt = --brw->perfmon.unresolved_elements; - - if (i == last_elt) { - brw->perfmon.unresolved[i] = NULL; - } else { - brw->perfmon.unresolved[i] = brw->perfmon.unresolved[last_elt]; - } - - clean_bookend_bo(brw); - return; - } - } -} - -/** - * Given pointers to starting and ending OA snapshots, add the deltas for each - * counter to the results. - */ -static void -add_deltas(struct brw_context *brw, - struct brw_perf_monitor_object *monitor, - uint32_t *start, uint32_t *end) -{ - /* Look for expected report ID values to ensure data is present. 
*/ - assert(start[0] == REPORT_ID); - assert(end[0] == REPORT_ID); - - /* Subtract each counter's ending and starting values, then add the - * difference to the counter's value so far. - */ - for (int i = 3; i < brw->perfmon.entries_per_oa_snapshot; i++) { - /* When debugging, it's useful to note when the ending value is less than - * the starting value; aggregating counters should always increase in - * value (or remain unchanged). This happens periodically due to - * wraparound, but can also indicate serious problems. - */ -#ifdef DEBUG - if (end[i] < start[i]) { - int counter = brw->perfmon.oa_snapshot_layout[i]; - if (counter >= 0) { - DBG("WARNING: \"%s\" ending value was less than the starting " - "value: %u < %u (end - start = %u)\n", - brw->ctx.PerfMonitor.Groups[0].Counters[counter].Name, - end[i], start[i], end[i] - start[i]); - } - } -#endif - monitor->oa_results[i] += end[i] - start[i]; - } -} - -/** - * Gather OA counter results (partial or full) from a series of snapshots. - * - * Monitoring can start or stop at any time, likely at some point mid-batch. - * We write snapshots for both events, storing them in monitor->oa_bo. - * - * Ideally, we would simply subtract those two snapshots to obtain the final - * counter results. Unfortunately, our hardware doesn't preserve their values - * across context switches or GPU sleep states. In order to support multiple - * concurrent OA clients, as well as reliable data across power management, - * we have to take snapshots at the start and end of batches as well. - * - * This results in a three-part sequence of (start, end) intervals: - * - The "head" is from the BeginPerfMonitor snapshot to the end of the first - * batchbuffer. - * - The "middle" is a series of (batch start, batch end) snapshots which - * bookend any batchbuffers between the ones which start/end monitoring. - * - The "tail" is from the start of the last batch where monitoring was - * active to the EndPerfMonitor snapshot. - * - * Due to wrapping in the bookend BO, we may have to accumulate partial results. - * If so, we handle the "head" and any "middle" results so far. When monitoring - * eventually ends, we handle additional "middle" batches and the "tail." - */ -static void -gather_oa_results(struct brw_context *brw, - struct brw_perf_monitor_object *monitor, - uint32_t *bookend_buffer) -{ - struct gl_perf_monitor_object *m = &monitor->base; - assert(monitor->oa_bo != NULL); - - drm_intel_bo_map(monitor->oa_bo, false); - uint32_t *monitor_buffer = monitor->oa_bo->virtual; - - /* If monitoring was entirely contained within a single batch, then the - * bookend BO is irrelevant. Just subtract monitor->bo's two snapshots. - */ - if (monitor->oa_middle_start == -1) { - add_deltas(brw, monitor, - monitor_buffer, - monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / - sizeof(uint32_t))); - drm_intel_bo_unmap(monitor->oa_bo); - return; - } - - const ptrdiff_t snapshot_size = brw->perfmon.entries_per_oa_snapshot; - - /* First, add the contributions from the "head" interval: - * (snapshot taken at BeginPerfMonitor time, - * snapshot taken at the end of the first batch after monitoring began) - */ - if (monitor->oa_head_end != -1) { - assert(monitor->oa_head_end < brw->perfmon.bookend_snapshots); - add_deltas(brw, monitor, - monitor_buffer, - bookend_buffer + snapshot_size * monitor->oa_head_end); - - /* Make sure we don't count the "head" again in the future. */ - monitor->oa_head_end = -1; - } - - /* Next, count the contributions from the "middle" batches. 
These are - * (batch begin, batch end) deltas while monitoring was active. - */ - int last_snapshot; - if (m->Ended) - last_snapshot = monitor->oa_tail_start; - else - last_snapshot = brw->perfmon.bookend_snapshots; - - for (int s = monitor->oa_middle_start; s < last_snapshot; s += 2) { - add_deltas(brw, monitor, - bookend_buffer + snapshot_size * s, - bookend_buffer + snapshot_size * (s + 1)); - } - - /* Finally, if the monitor has ended, we need to count the contributions of - * the "tail" interval: - * (start of the batch where monitoring ended, EndPerfMonitor snapshot) - */ - if (m->Ended) { - assert(monitor->oa_tail_start != -1); - add_deltas(brw, monitor, - bookend_buffer + snapshot_size * monitor->oa_tail_start, - monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / - sizeof(uint32_t))); - } - - drm_intel_bo_unmap(monitor->oa_bo); - - /* If the monitor has ended, then we've gathered all the results, and - * can free the monitor's OA BO. - */ - if (m->Ended) { - drm_intel_bo_unreference(monitor->oa_bo); - monitor->oa_bo = NULL; - - /* The monitor's OA result is now resolved. */ - DBG("Marking %d resolved - results gathered\n", m->Name); - drop_from_unresolved_monitor_list(brw, monitor); - } -} - -/** - * Handle running out of space in the bookend BO. - * - * When we run out of space in the bookend BO, we need to gather up partial - * results for every unresolved monitor. This allows us to free the snapshot - * data in bookend_bo, freeing up the space for reuse. We call this "wrapping." - * - * This will completely compute the result for any unresolved monitors that - * have ended. - */ -static void -wrap_bookend_bo(struct brw_context *brw) -{ - DBG("****Wrap bookend BO****\n"); - /* Note that wrapping will only occur at the start of a batch, since that's - * where we reserve space. So the current batch won't reference bookend_bo - * or any monitor BOs. This means we don't need to worry about - * synchronization. - * - * Also, EndPerfMonitor guarantees that only monitors which span multiple - * batches exist in the unresolved monitor list. - */ - assert(brw->perfmon.oa_users > 0); - - drm_intel_bo_map(brw->perfmon.bookend_bo, false); - uint32_t *bookend_buffer = brw->perfmon.bookend_bo->virtual; - for (int i = 0; i < brw->perfmon.unresolved_elements; i++) { - struct brw_perf_monitor_object *monitor = brw->perfmon.unresolved[i]; - struct gl_perf_monitor_object *m = &monitor->base; - - gather_oa_results(brw, monitor, bookend_buffer); - - if (m->Ended) { - /* gather_oa_results() dropped the monitor from the unresolved list, - * throwing our indices off by one. - */ - --i; - } else { - /* When we create the new bookend_bo, snapshot #0 will be the - * beginning of another "middle" BO. - */ - monitor->oa_middle_start = 0; - assert(monitor->oa_head_end == -1); - assert(monitor->oa_tail_start == -1); - } - } - drm_intel_bo_unmap(brw->perfmon.bookend_bo); - - brw->perfmon.bookend_snapshots = 0; -} - -/* This is fairly arbitrary; the trade off is memory usage vs. extra overhead - * from wrapping. On Gen7, 32768 should be enough for 128 snapshots before - * wrapping (since each is 256 bytes). - */ -#define BOOKEND_BO_SIZE_BYTES 32768 - -/** - * Check whether bookend_bo has space for a given number of snapshots. - */ -static bool -has_space_for_bookend_snapshots(struct brw_context *brw, int snapshots) -{ - int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t); - - /* There are brw->perfmon.bookend_snapshots - 1 existing snapshots. 
*/ - int total_snapshots = (brw->perfmon.bookend_snapshots - 1) + snapshots; - - return total_snapshots * snapshot_bytes < BOOKEND_BO_SIZE_BYTES; -} - -/** - * Write an OA counter snapshot to bookend_bo. - */ -static void -emit_bookend_snapshot(struct brw_context *brw) -{ - int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t); - int offset_in_bytes = brw->perfmon.bookend_snapshots * snapshot_bytes; - - emit_mi_report_perf_count(brw, brw->perfmon.bookend_bo, offset_in_bytes, - REPORT_ID); - ++brw->perfmon.bookend_snapshots; -} - -/******************************************************************************/ - -/** - * Initialize a monitor to sane starting state; throw away old buffers. - */ -static void -reinitialize_perf_monitor(struct brw_context *brw, - struct brw_perf_monitor_object *monitor) -{ - if (monitor->oa_bo) { - drm_intel_bo_unreference(monitor->oa_bo); - monitor->oa_bo = NULL; - } - - /* Since the results are now invalid, we don't need to hold on to any - * snapshots in bookend_bo. The monitor is effectively "resolved." - */ - drop_from_unresolved_monitor_list(brw, monitor); - - monitor->oa_head_end = -1; - monitor->oa_middle_start = -1; - monitor->oa_tail_start = -1; - - free(monitor->oa_results); - monitor->oa_results = NULL; - - if (monitor->pipeline_stats_bo) { - drm_intel_bo_unreference(monitor->pipeline_stats_bo); - monitor->pipeline_stats_bo = NULL; - } - - free(monitor->pipeline_stats_results); - monitor->pipeline_stats_results = NULL; -} - -/** - * Driver hook for glBeginPerformanceMonitorAMD(). - */ -static GLboolean -brw_begin_perf_monitor(struct gl_context *ctx, - struct gl_perf_monitor_object *m) -{ - struct brw_context *brw = brw_context(ctx); - struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); - - DBG("Begin(%d)\n", m->Name); - - reinitialize_perf_monitor(brw, monitor); - - if (monitor_needs_oa(brw, m)) { - /* If the global OA bookend BO doesn't exist, allocate it. This should - * only happen once, but we delay until BeginPerfMonitor time to avoid - * wasting memory for contexts that don't use performance monitors. - */ - if (!brw->perfmon.bookend_bo) { - brw->perfmon.bookend_bo = drm_intel_bo_alloc(brw->bufmgr, - "OA bookend BO", - BOOKEND_BO_SIZE_BYTES, 64); - } - - monitor->oa_bo = - drm_intel_bo_alloc(brw->bufmgr, "perf. monitor OA bo", 4096, 64); -#ifdef DEBUG - /* Pre-filling the BO helps debug whether writes landed. */ - drm_intel_bo_map(monitor->oa_bo, true); - memset((char *) monitor->oa_bo->virtual, 0xff, 4096); - drm_intel_bo_unmap(monitor->oa_bo); -#endif - - /* Allocate storage for accumulated OA counter values. */ - monitor->oa_results = - calloc(brw->perfmon.entries_per_oa_snapshot, sizeof(uint32_t)); - - /* If the OA counters aren't already on, enable them. */ - if (brw->perfmon.oa_users == 0) { - /* Ensure the OACONTROL enable and snapshot land in the same batch. */ - int space = (MI_REPORT_PERF_COUNT_BATCH_DWORDS + 3) * 4; - intel_batchbuffer_require_space(brw, space, RENDER_RING); - start_oa_counters(brw); - } - - /* Take a starting OA counter snapshot. */ - emit_mi_report_perf_count(brw, monitor->oa_bo, 0, REPORT_ID); - - monitor->oa_head_end = brw->perfmon.bookend_snapshots; - monitor->oa_middle_start = brw->perfmon.bookend_snapshots + 1; - monitor->oa_tail_start = -1; - - /* Add the monitor to the unresolved list. 
*/ - add_to_unresolved_monitor_list(brw, monitor); - - ++brw->perfmon.oa_users; - } - - if (monitor_needs_statistics_registers(brw, m)) { - monitor->pipeline_stats_bo = - drm_intel_bo_alloc(brw->bufmgr, "perf. monitor stats bo", 4096, 64); - - /* Take starting snapshots. */ - snapshot_statistics_registers(brw, monitor, 0); - } - - return true; -} - -/** - * Driver hook for glEndPerformanceMonitorAMD(). - */ -static void -brw_end_perf_monitor(struct gl_context *ctx, - struct gl_perf_monitor_object *m) -{ - struct brw_context *brw = brw_context(ctx); - struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); - - DBG("End(%d)\n", m->Name); - - if (monitor_needs_oa(brw, m)) { - /* Take an ending OA counter snapshot. */ - emit_mi_report_perf_count(brw, monitor->oa_bo, - SECOND_SNAPSHOT_OFFSET_IN_BYTES, REPORT_ID); - - --brw->perfmon.oa_users; - - if (brw->perfmon.oa_users == 0) - stop_oa_counters(brw); - - if (monitor->oa_head_end == brw->perfmon.bookend_snapshots) { - assert(monitor->oa_head_end != -1); - /* We never actually wrote the snapshot for the end of the first batch - * after BeginPerfMonitor. This means that monitoring was contained - * entirely within a single batch, so we can ignore bookend_bo and - * just compare the monitor's begin/end snapshots directly. - */ - monitor->oa_head_end = -1; - monitor->oa_middle_start = -1; - monitor->oa_tail_start = -1; - - /* We can also mark it resolved since it won't depend on bookend_bo. */ - DBG("Marking %d resolved - entirely in one batch\n", m->Name); - drop_from_unresolved_monitor_list(brw, monitor); - } else { - /* We've written at least one batch end snapshot, so the monitoring - * spanned multiple batches. Mark which snapshot corresponds to the - * start of the current batch. - */ - monitor->oa_tail_start = brw->perfmon.bookend_snapshots - 1; - } - } - - if (monitor_needs_statistics_registers(brw, m)) { - /* Take ending snapshots. */ - snapshot_statistics_registers(brw, monitor, - SECOND_SNAPSHOT_OFFSET_IN_BYTES); - } -} - -/** - * Reset a performance monitor, throwing away any results. - */ -static void -brw_reset_perf_monitor(struct gl_context *ctx, - struct gl_perf_monitor_object *m) -{ - struct brw_context *brw = brw_context(ctx); - struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); - - reinitialize_perf_monitor(brw, monitor); - - if (m->Active) { - brw_begin_perf_monitor(ctx, m); - } -} - -/** - * Is a performance monitor result available? - */ -static GLboolean -brw_is_perf_monitor_result_available(struct gl_context *ctx, - struct gl_perf_monitor_object *m) -{ - struct brw_context *brw = brw_context(ctx); - struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); - - bool oa_available = true; - bool stats_available = true; - - if (monitor_needs_oa(brw, m)) { - oa_available = !monitor->oa_bo || - (!drm_intel_bo_references(brw->batch.bo, monitor->oa_bo) && - !drm_intel_bo_busy(monitor->oa_bo)); - } - - if (monitor_needs_statistics_registers(brw, m)) { - stats_available = !monitor->pipeline_stats_bo || - (!drm_intel_bo_references(brw->batch.bo, monitor->pipeline_stats_bo) && - !drm_intel_bo_busy(monitor->pipeline_stats_bo)); - } - - return oa_available && stats_available; -} - -/** - * Get the performance monitor result. 
- */ -static void -brw_get_perf_monitor_result(struct gl_context *ctx, - struct gl_perf_monitor_object *m, - GLsizei data_size, - GLuint *data, - GLint *bytes_written) -{ - struct brw_context *brw = brw_context(ctx); - struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); - const GLuint *const data_end = (GLuint *)((uint8_t *) data + data_size); - - DBG("GetResult(%d)\n", m->Name); - brw_dump_perf_monitors(brw); - - /* This hook should only be called when results are available. */ - assert(m->Ended); - - /* Copy data to the supplied array (data). - * - * The output data format is: <group ID, counter ID, value> for each - * active counter. The API allows counters to appear in any order. - */ - GLsizei offset = 0; - - if (monitor_needs_oa(brw, m)) { - /* Gather up the results from the BO, unless we already did due to the - * bookend BO wrapping. - */ - if (monitor->oa_bo) { - /* Since the result is available, all the necessary snapshots will - * have been written to the bookend BO. If other monitors are - * active, the bookend BO may be busy or referenced by the current - * batch, but only for writing snapshots beyond oa_tail_start, - * which we don't care about. - * - * Using an unsynchronized mapping avoids stalling for an - * indeterminate amount of time. - */ - drm_intel_gem_bo_map_unsynchronized(brw->perfmon.bookend_bo); - - gather_oa_results(brw, monitor, brw->perfmon.bookend_bo->virtual); - - drm_intel_bo_unmap(brw->perfmon.bookend_bo); - } - - for (int i = 0; i < brw->perfmon.entries_per_oa_snapshot; i++) { - int group = OA_COUNTERS; - int counter = brw->perfmon.oa_snapshot_layout[i]; - - /* We always capture all the OA counters, but the application may - * have only asked for a subset. Skip unwanted counters. - */ - if (counter < 0 || !BITSET_TEST(m->ActiveCounters[group], counter)) - continue; - - if (data + offset + 3 <= data_end) { - data[offset++] = group; - data[offset++] = counter; - data[offset++] = monitor->oa_results[i]; - } - } - - clean_bookend_bo(brw); - } - - if (monitor_needs_statistics_registers(brw, m)) { - const int num_counters = - ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters; - - if (!monitor->pipeline_stats_results) { - gather_statistics_results(brw, monitor); - - /* Check if we did really get the results */ - if (!monitor->pipeline_stats_results) { - if (bytes_written) { - *bytes_written = 0; - } - return; - } - } - - for (int i = 0; i < num_counters; i++) { - if (BITSET_TEST(m->ActiveCounters[PIPELINE_STATS_COUNTERS], i)) { - if (data + offset + 4 <= data_end) { - data[offset++] = PIPELINE_STATS_COUNTERS; - data[offset++] = i; - *((uint64_t *) (&data[offset])) = monitor->pipeline_stats_results[i]; - offset += 2; - } - } - } - } - - if (bytes_written) - *bytes_written = offset * sizeof(uint32_t); -} - -/** - * Create a new performance monitor object. - */ -static struct gl_perf_monitor_object * -brw_new_perf_monitor(struct gl_context *ctx) -{ - (void) ctx; - return calloc(1, sizeof(struct brw_perf_monitor_object)); -} - -/** - * Delete a performance monitor object. - */ -static void -brw_delete_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m) -{ - struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); - DBG("Delete(%d)\n", m->Name); - reinitialize_perf_monitor(brw_context(ctx), monitor); - free(monitor); -} - -/******************************************************************************/ - -/** - * Called at the start of every render ring batch. 
- * - * Enable OA counters and emit the "start of batchbuffer" bookend OA snapshot. - * Since it's a new batch, there will be plenty of space for the commands. - */ -void -brw_perf_monitor_new_batch(struct brw_context *brw) -{ - assert(brw->batch.ring == RENDER_RING); - assert(brw->gen < 6 || USED_BATCH(brw->batch) == 0); - - if (brw->perfmon.oa_users == 0) - return; - - start_oa_counters(brw); - - /* Make sure bookend_bo has enough space for a pair of snapshots. - * If not, "wrap" the BO: gather up any results so far, and start from - * the beginning of the buffer. Reserving a pair guarantees that wrapping - * will only happen at the beginning of a batch, where it's safe to map BOs - * (as the batch is empty and can't refer to any of them yet). - */ - if (!has_space_for_bookend_snapshots(brw, 2)) - wrap_bookend_bo(brw); - - DBG("Bookend Begin Snapshot (%d)\n", brw->perfmon.bookend_snapshots); - emit_bookend_snapshot(brw); -} - -/** - * Called at the end of every render ring batch. - * - * Emit the "end of batchbuffer" bookend OA snapshot and disable the counters. - * - * This relies on there being enough space in BATCH_RESERVED. - */ -void -brw_perf_monitor_finish_batch(struct brw_context *brw) -{ - assert(brw->batch.ring == RENDER_RING); - - if (brw->perfmon.oa_users == 0) - return; - - DBG("Bookend End Snapshot (%d)\n", brw->perfmon.bookend_snapshots); - - /* Not safe to wrap; should've reserved space already. */ - assert(has_space_for_bookend_snapshots(brw, 1)); - - emit_bookend_snapshot(brw); - - stop_oa_counters(brw); -} - -/******************************************************************************/ - -void -brw_init_performance_monitors(struct brw_context *brw) -{ - struct gl_context *ctx = &brw->ctx; - - ctx->Driver.NewPerfMonitor = brw_new_perf_monitor; - ctx->Driver.DeletePerfMonitor = brw_delete_perf_monitor; - ctx->Driver.BeginPerfMonitor = brw_begin_perf_monitor; - ctx->Driver.EndPerfMonitor = brw_end_perf_monitor; - ctx->Driver.ResetPerfMonitor = brw_reset_perf_monitor; - ctx->Driver.IsPerfMonitorResultAvailable = brw_is_perf_monitor_result_available; - ctx->Driver.GetPerfMonitorResult = brw_get_perf_monitor_result; - - if (brw->gen == 5) { - ctx->PerfMonitor.Groups = gen5_groups; - ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen5_groups); - brw->perfmon.oa_snapshot_layout = gen5_oa_snapshot_layout; - brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen5_oa_snapshot_layout); - } else if (brw->gen == 6) { - ctx->PerfMonitor.Groups = gen6_groups; - ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen6_groups); - brw->perfmon.oa_snapshot_layout = gen6_oa_snapshot_layout; - brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen6_oa_snapshot_layout); - brw->perfmon.statistics_registers = gen6_statistics_register_addresses; - } else if (brw->gen == 7) { - ctx->PerfMonitor.Groups = gen7_groups; - ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen7_groups); - brw->perfmon.oa_snapshot_layout = gen7_oa_snapshot_layout; - brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen7_oa_snapshot_layout); - brw->perfmon.statistics_registers = gen7_statistics_register_addresses; - } - - brw->perfmon.unresolved = - ralloc_array(brw, struct brw_perf_monitor_object *, 1); - brw->perfmon.unresolved_elements = 0; - brw->perfmon.unresolved_array_size = 1; -} diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c index f136fae6cc3..65c27731cd2 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c @@ -175,8 
+175,7 @@ do_batch_dump(struct brw_context *brw) void intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw) { - /* We may need to enable and snapshot OA counters. */ - brw_perf_monitor_new_batch(brw); + /* Un-used currently */ } /** @@ -211,9 +210,6 @@ brw_new_batch(struct brw_context *brw) */ if (INTEL_DEBUG & DEBUG_SHADER_TIME) brw_collect_and_report_shader_time(brw); - - if (INTEL_DEBUG & DEBUG_PERFMON) - brw_dump_perf_monitors(brw); } /** @@ -241,9 +237,6 @@ brw_finish_batch(struct brw_context *brw) if (brw->gen >= 7) gen7_restore_default_l3_config(brw); - /* We may also need to snapshot and disable OA counters. */ - brw_perf_monitor_finish_batch(brw); - if (brw->is_haswell) { /* From the Haswell PRM, Volume 2b, Command Reference: Instructions, * 3DSTATE_CC_STATE_POINTERS > "Note": diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index bbb794240c0..1ecefc19449 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -100,71 +100,6 @@ can_do_pipelined_register_writes(struct brw_context *brw) return success; } -static bool -can_write_oacontrol(struct brw_context *brw) -{ - if (brw->gen < 6 || brw->gen >= 8) - return false; - - static int result = -1; - if (result != -1) - return result; - - /* Set "Select Context ID" to a particular address (which is likely not a - * context), but leave all counting disabled. This should be harmless. - */ - const int expected_value = 0x31337000; - const int offset = 110; - - uint32_t *data; - /* Set a value in a BO to a known quantity. The workaround BO already - * exists and doesn't contain anything important, so we may as well use it. - */ - drm_intel_bo_map(brw->workaround_bo, true); - data = brw->workaround_bo->virtual; - data[offset] = 0xffffffff; - drm_intel_bo_unmap(brw->workaround_bo); - - /* Write OACONTROL. */ - BEGIN_BATCH(3); - OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); - OUT_BATCH(OACONTROL); - OUT_BATCH(expected_value); - ADVANCE_BATCH(); - - brw_emit_mi_flush(brw); - - /* Save the register's value back to the buffer. */ - BEGIN_BATCH(3); - OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2)); - OUT_BATCH(OACONTROL); - OUT_RELOC(brw->workaround_bo, - I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - offset * sizeof(uint32_t)); - ADVANCE_BATCH(); - - brw_emit_mi_flush(brw); - - /* Set OACONTROL back to zero (everything off). */ - BEGIN_BATCH(3); - OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); - OUT_BATCH(OACONTROL); - OUT_BATCH(0); - ADVANCE_BATCH(); - - intel_batchbuffer_flush(brw); - - /* Check whether the value got written. */ - drm_intel_bo_map(brw->workaround_bo, false); - data = brw->workaround_bo->virtual; - bool success = data[offset] == expected_value; - drm_intel_bo_unmap(brw->workaround_bo); - - result = success; - - return success; -} - /** * Initializes potential list of extensions if ctx == NULL, or actually enables * extensions for a context. @@ -290,11 +225,6 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_texture_query_levels = ctx->Const.GLSLVersion >= 130; ctx->Extensions.ARB_texture_query_lod = true; ctx->Extensions.EXT_timer_query = true; - - if (brw->gen == 5 || can_write_oacontrol(brw)) { - ctx->Extensions.AMD_performance_monitor = true; - ctx->Extensions.INTEL_performance_query = true; - } } if (brw->gen >= 6) { |
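The deleted brw_get_perf_monitor_result packed results as <group ID, counter ID, value> records: one 32-bit word of value for GL_UNSIGNED_INT counters (the raw OA group) and two words for GL_UNSIGNED_INT64_AMD (the pipeline statistics group). A matching client-side unpacking might have looked like the hedged sketch below; it assumes the monitor from the earlier sketch has ended, and handles only the two value types this driver produced.

#define GL_GLEXT_PROTOTYPES
#include <GL/gl.h>
#include <GL/glext.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Walk the packed result stream once the monitor's result is available.
 * The value width follows the counter type, matching the packing in the
 * deleted brw_get_perf_monitor_result(): one dword for GL_UNSIGNED_INT
 * (raw OA counters), two dwords for GL_UNSIGNED_INT64_AMD (pipeline
 * statistics).
 */
static void read_monitor(GLuint monitor)
{
   GLuint avail = 0;
   glGetPerfMonitorCounterDataAMD(monitor, GL_PERFMON_RESULT_AVAILABLE_AMD,
                                  sizeof(avail), &avail, NULL);
   if (!avail)
      return;

   GLuint size = 0;
   glGetPerfMonitorCounterDataAMD(monitor, GL_PERFMON_RESULT_SIZE_AMD,
                                  sizeof(size), &size, NULL);

   GLuint *data = malloc(size);
   GLint written = 0;
   glGetPerfMonitorCounterDataAMD(monitor, GL_PERFMON_RESULT_AMD,
                                  size, data, &written);

   for (GLint i = 0; i < written / (GLint) sizeof(GLuint);) {
      GLuint group = data[i++];
      GLuint counter = data[i++];
      GLenum type = 0;

      glGetPerfMonitorCounterInfoAMD(group, counter, GL_COUNTER_TYPE_AMD,
                                     &type);
      if (type == GL_UNSIGNED_INT64_AMD) {
         uint64_t value;
         memcpy(&value, &data[i], sizeof(value));
         i += 2;
         printf("group %u, counter %u: %" PRIu64 "\n", group, counter, value);
      } else {                            /* GL_UNSIGNED_INT */
         printf("group %u, counter %u: %u\n", group, counter, data[i++]);
      }
   }
   free(data);
}

The GL_PERFMON_RESULT_AVAILABLE_AMD query above maps onto the deleted brw_is_perf_monitor_result_available(), which reported true only once neither the monitor's OA BO nor its pipeline-stats BO was still busy or referenced by the current batch.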