author     Lionel Landwerlin <[email protected]>   2019-05-20 07:56:18 +0100
committer  Lionel Landwerlin <[email protected]>   2019-12-13 11:27:22 +0200
commit     bd888bc1d65cefbd4e3fc0a40d416c75d9632951 (patch)
tree       c566453a19e10d1e1483d63c069406f2498b01e2
parent     a575b3cd5c1e61a7e92fa2521ced95d24b64f392 (diff)
i965/iris: perf-queries: don't invalidate/flush 3d pipeline
Our current implementation of performance queries is fairly harsh
because it completely flushes and invalidates the 3d pipeline caches
at the beginning and end of each query. An argument can be made that
this is how performance should be measured but it probably doesn't
reflect what the application is actually doing and the actual cost of
draw calls.
A more appropriate approach is to just stall the pipeline at
scoreboard, so that we measure the effect of a draw call without
having the pipeline in a completely pristine state for every draw
call.
v2: Use end of pipe PIPE_CONTROL instruction for Iris (Ken)
Signed-off-by: Lionel Landwerlin <[email protected]>
Reviewed-by: Kenneth Graunke <[email protected]>
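
A minimal sketch of the core of the change, condensed from the Iris hunks in the diff below (the function and flag names are taken from the patch; the surrounding driver headers and types are assumed, so this is not standalone code):

/* Before: full flush + invalidate of the 3D pipeline caches around each
 * performance query. */
static void
iris_perf_emit_mi_flush(struct iris_context *ice)
{
   const int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH |
                     PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                     PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                     PIPE_CONTROL_DATA_CACHE_FLUSH |
                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                     PIPE_CONTROL_VF_CACHE_INVALIDATE |
                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                     PIPE_CONTROL_CS_STALL;
   iris_emit_pipe_control_flush(&ice->batches[IRIS_BATCH_RENDER],
                                "OA metrics", flags);
}

/* After: an end-of-pipe sync that only stalls at the pixel scoreboard,
 * leaving the caches warm so the measured draws behave as they normally
 * would in the application. */
static void
iris_perf_emit_stall_at_pixel_scoreboard(struct iris_context *ice)
{
   iris_emit_end_of_pipe_sync(&ice->batches[IRIS_BATCH_RENDER],
                              "OA metrics",
                              PIPE_CONTROL_STALL_AT_SCOREBOARD);
}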
-rw-r--r--   src/gallium/drivers/iris/iris_perf.c               | 18
-rw-r--r--   src/intel/perf/gen_perf.c                           | 16
-rw-r--r--   src/intel/perf/gen_perf.h                           |  2
-rw-r--r--   src/mesa/drivers/dri/i965/brw_performance_query.c   | 10
4 files changed, 21 insertions, 25 deletions
diff --git a/src/gallium/drivers/iris/iris_perf.c b/src/gallium/drivers/iris/iris_perf.c
index 7c0378aacee..1e5ec8140dc 100644
--- a/src/gallium/drivers/iris/iris_perf.c
+++ b/src/gallium/drivers/iris/iris_perf.c
@@ -31,18 +31,11 @@ iris_oa_bo_alloc(void *bufmgr, const char *name, uint64_t size)
 }
 
 static void
-iris_perf_emit_mi_flush(struct iris_context *ice)
+iris_perf_emit_stall_at_pixel_scoreboard(struct iris_context *ice)
 {
-   const int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                     PIPE_CONTROL_INSTRUCTION_INVALIDATE |
-                     PIPE_CONTROL_CONST_CACHE_INVALIDATE |
-                     PIPE_CONTROL_DATA_CACHE_FLUSH |
-                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                     PIPE_CONTROL_VF_CACHE_INVALIDATE |
-                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
-                     PIPE_CONTROL_CS_STALL;
-   iris_emit_pipe_control_flush(&ice->batches[IRIS_BATCH_RENDER],
-                                "OA metrics", flags);
+   iris_emit_end_of_pipe_sync(&ice->batches[IRIS_BATCH_RENDER],
+                              "OA metrics",
+                              PIPE_CONTROL_STALL_AT_SCOREBOARD);
 }
 
 static void
@@ -106,7 +99,8 @@ iris_perf_init_vtbl(struct gen_perf_config *perf_cfg)
    perf_cfg->vtbl.bo_unreference = (bo_unreference_t)iris_bo_unreference;
    perf_cfg->vtbl.bo_map = (bo_map_t)iris_bo_map;
    perf_cfg->vtbl.bo_unmap = (bo_unmap_t)iris_bo_unmap;
-   perf_cfg->vtbl.emit_mi_flush = (emit_mi_flush_t)iris_perf_emit_mi_flush;
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard =
+      (emit_mi_flush_t)iris_perf_emit_stall_at_pixel_scoreboard;
    perf_cfg->vtbl.emit_mi_report_perf_count =
       (emit_mi_report_t)iris_perf_emit_mi_report_perf_count;
diff --git a/src/intel/perf/gen_perf.c b/src/intel/perf/gen_perf.c
index daa092c88c9..9e987d599d7 100644
--- a/src/intel/perf/gen_perf.c
+++ b/src/intel/perf/gen_perf.c
@@ -1716,15 +1716,9 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx,
     * end snapshot - otherwise the results won't be a complete representation
     * of the work.
     *
-    * Theoretically there could be opportunities to minimize how much of the
-    * GPU pipeline is drained, or that we stall for, when we know what specific
-    * units the performance counters being queried relate to but we don't
-    * currently attempt to be clever here.
-    *
-    * Note: with our current simple approach here then for back-to-back queries
-    * we will redundantly emit duplicate commands to synchronize the command
-    * streamer with the rest of the GPU pipeline, but we assume that in HW the
-    * second synchronization is effectively a NOOP.
+    * To achieve this, we stall the pipeline at pixel scoreboard (prevent any
+    * additional work to be processed by the pipeline until all pixels of the
+    * previous draw has be completed).
     *
     * N.B. The final results are based on deltas of counters between (inside)
     * Begin/End markers so even though the total wall clock time of the
@@ -1738,7 +1732,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx,
     * This is our Begin synchronization point to drain current work on the
     * GPU before we capture our first counter snapshot...
     */
-   perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
 
    switch (queryinfo->kind) {
    case GEN_PERF_QUERY_TYPE_OA:
@@ -1920,7 +1914,7 @@ gen_perf_end_query(struct gen_perf_context *perf_ctx,
     * For more details see comment in brw_begin_perf_query for
     * corresponding flush.
     */
-   perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
 
    switch (query->queryinfo->kind) {
    case GEN_PERF_QUERY_TYPE_OA:
diff --git a/src/intel/perf/gen_perf.h b/src/intel/perf/gen_perf.h
index 46d37e07c25..2cd246a1dca 100644
--- a/src/intel/perf/gen_perf.h
+++ b/src/intel/perf/gen_perf.h
@@ -219,7 +219,7 @@ struct gen_perf_config {
       bool (*batch_references)(void *batch, void *bo);
       void (*bo_wait_rendering)(void *bo);
       int (*bo_busy)(void *bo);
-      void (*emit_mi_flush)(void *ctx);
+      void (*emit_stall_at_pixel_scoreboard)(void *ctx);
       void (*emit_mi_report_perf_count)(void *ctx,
                                         void *bo,
                                         uint32_t offset_in_bytes,
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
index 0e5459e5e5e..cfd3efe374e 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
@@ -459,6 +459,13 @@ brw_oa_batchbuffer_flush(void *c, const char *file, int line)
    _intel_batchbuffer_flush_fence(ctx, -1, NULL, file, line);
 }
 
+static void
+brw_oa_emit_stall_at_pixel_scoreboard(void *c)
+{
+   struct brw_context *brw = c;
+   brw_emit_end_of_pipe_sync(brw, PIPE_CONTROL_STALL_AT_SCOREBOARD);
+}
+
 typedef void (*capture_frequency_stat_register_t)(void *, void *,
                                                   uint32_t );
 typedef void (*store_register_mem64_t)(void *ctx, void *bo,
                                        uint32_t reg, uint32_t offset);
@@ -487,7 +494,8 @@ brw_init_perf_query_info(struct gl_context *ctx)
    perf_cfg->vtbl.bo_unreference = (bo_unreference_t)brw_bo_unreference;
    perf_cfg->vtbl.bo_map = (bo_map_t)brw_bo_map;
    perf_cfg->vtbl.bo_unmap = (bo_unmap_t)brw_bo_unmap;
-   perf_cfg->vtbl.emit_mi_flush = (emit_mi_flush_t)brw_emit_mi_flush;
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard =
+      (emit_mi_flush_t)brw_oa_emit_stall_at_pixel_scoreboard;
    perf_cfg->vtbl.emit_mi_report_perf_count =
       (emit_mi_report_t)brw_oa_emit_mi_report_perf_count;
    perf_cfg->vtbl.batchbuffer_flush = brw_oa_batchbuffer_flush;
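
For context, a rough sketch of how the renamed vtbl hook brackets a performance query, paraphrased from the gen_perf.c hunks above. The wrapper function name is illustrative only, the snapshot steps are elided, and the real code lives inside gen_perf_begin_query()/gen_perf_end_query() with a gen_perf_context rather than a bare driver pointer:

/* Illustrative only; assumes gen_perf.h from this patch is available. */
static void
sketch_perf_query_bracketing(struct gen_perf_config *perf_cfg, void *driver_ctx)
{
   /* Begin: stall at the pixel scoreboard so outstanding draws finish
    * before the first counter snapshot is taken. */
   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(driver_ctx);
   /* ... capture the Begin OA snapshot (MI_REPORT_PERF_COUNT) ... */

   /* The application's measured draw calls happen here, with the caches
    * left in whatever state the workload itself produces. */

   /* End: the same stall, so the End snapshot covers all measured work. */
   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(driver_ctx);
   /* ... capture the End OA snapshot ... */
}

Because both drivers register the callback through the same emit_stall_at_pixel_scoreboard slot, the shared query code stays identical; only the per-driver implementation of the synchronization changes.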