From bd888bc1d65cefbd4e3fc0a40d416c75d9632951 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Mon, 20 May 2019 07:56:18 +0100 Subject: i965/iris: perf-queries: don't invalidate/flush 3d pipeline Our current implementation of performance queries is fairly harsh because it completely flushes and invalidates the 3d pipeline caches at the beginning and end of each query. An argument can be made that this is how performance should be measured but it probably doesn't reflect what the application is actually doing and the actual cost of draw calls. A more appropriate approach is to just stall the pipeline at scoreboard, so that we measure the effect of a draw call without having the pipeline in a completely pristine state for every draw call. v2: Use end of pipe PIPE_CONTROL instruction for Iris (Ken) Signed-off-by: Lionel Landwerlin Reviewed-by: Kenneth Graunke --- src/intel/perf/gen_perf.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'src/intel/perf/gen_perf.c') diff --git a/src/intel/perf/gen_perf.c b/src/intel/perf/gen_perf.c index daa092c88c9..9e987d599d7 100644 --- a/src/intel/perf/gen_perf.c +++ b/src/intel/perf/gen_perf.c @@ -1716,15 +1716,9 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx, * end snapshot - otherwise the results won't be a complete representation * of the work. * - * Theoretically there could be opportunities to minimize how much of the - * GPU pipeline is drained, or that we stall for, when we know what specific - * units the performance counters being queried relate to but we don't - * currently attempt to be clever here. - * - * Note: with our current simple approach here then for back-to-back queries - * we will redundantly emit duplicate commands to synchronize the command - * streamer with the rest of the GPU pipeline, but we assume that in HW the - * second synchronization is effectively a NOOP. 
+ * To achieve this, we stall the pipeline at pixel scoreboard (prevent any + * additional work from being processed by the pipeline until all pixels of the + * previous draw have been completed). * * N.B. The final results are based on deltas of counters between (inside) * Begin/End markers so even though the total wall clock time of the @@ -1738,7 +1732,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx, * This is our Begin synchronization point to drain current work on the * GPU before we capture our first counter snapshot... */ - perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx); + perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx); switch (queryinfo->kind) { case GEN_PERF_QUERY_TYPE_OA: @@ -1920,7 +1914,7 @@ gen_perf_end_query(struct gen_perf_context *perf_ctx, * For more details see comment in brw_begin_perf_query for * corresponding flush. */ - perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx); + perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx); switch (query->queryinfo->kind) { case GEN_PERF_QUERY_TYPE_OA: -- cgit v1.2.3