i965: Reimplement ARB_transform_feedback2 on Haswell and later.

My old implementation accumulated <start, end> pairs in a buffer, and eventually processed that data on the CPU. This meant flushing the batchbuffer and waiting for it to completely execute before we could map it, resulting in really long stalls. We could also run out of space in the buffer, and be forced to do this flush-and-wait early.

Instead, we can use Haswell's MI_MATH command to do the (end - start) subtraction, as well as the multiplication by 2 or 3 to convert from the number of primitives written to the number of vertices written.

We still need to CS stall to read the counters, but otherwise everything is completely pipelined - there's no CPU<->GPU synchronization required. It also uses only 80 bytes in the buffer, no matter what.

Improves performance in Manhattan on Skylake GT3e at 800x600 by 6.1086% +/- 0.954166% (n=9). At 1920x1080, improves performance by 2.82103% +/- 0.148596% (n=84).

v2: Fix number of primitives -> number of vertices calculation for GL_TRIANGLES (I was multiplying by 4 instead of 3.) Caught by Jordan Justen.

Signed-off-by: Kenneth Graunke <[email protected]>
Reviewed-by: Jordan Justen <[email protected]>
author: Kenneth Graunke <[email protected]> 2016-05-04 23:44:25 -0700
committer: Kenneth Graunke <[email protected]> 2016-05-09 15:00:01 -0700
commit: 96d43f2d087e23ab692d43fc48fe1be30e923ae0 (patch)
tree: df3d1532a3993a98e0141003d63e4fc4a06cd2b7 /src/mesa/drivers/dri/i965/brw_draw.c
parent: fdb6c1887f7b61ef49fb89e0b0928f65b2edf29b (diff)
1 files changed, 30 insertions, 8 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index afa8a4e9eae..9d034cfdb33 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -153,7 +153,9 @@ trim(GLenum prim, GLuint length)
 static void
 brw_emit_prim(struct brw_context *brw,
               const struct _mesa_prim *prim,
-              uint32_t hw_prim)
+              uint32_t hw_prim,
+              struct brw_transform_feedback_object *xfb_obj,
+              unsigned stream)
 {
    int verts_per_instance;
    int vertex_access_type;
@@ -185,7 +187,7 @@ brw_emit_prim(struct brw_context *brw,
       verts_per_instance = prim->count;
 
    /* If nothing to emit, just return. */
-   if (verts_per_instance == 0 && !prim->is_indirect)
+   if (verts_per_instance == 0 && !prim->is_indirect && !xfb_obj)
       return;
 
    /* If we're set to always flush, do it before and after the primitive emit.
@@ -197,7 +199,25 @@ brw_emit_prim(struct brw_context *brw,
       brw_emit_mi_flush(brw);
 
    /* If indirect, emit a bunch of loads from the indirect BO. */
-   if (prim->is_indirect) {
+   if (xfb_obj) {
+      indirect_flag = GEN7_3DPRIM_INDIRECT_PARAMETER_ENABLE;
+
+      brw_load_register_mem(brw, GEN7_3DPRIM_VERTEX_COUNT,
+                            xfb_obj->prim_count_bo,
+                            I915_GEM_DOMAIN_VERTEX, 0,
+                            stream * sizeof(uint32_t));
+      BEGIN_BATCH(9);
+      OUT_BATCH(MI_LOAD_REGISTER_IMM | (9 - 2));
+      OUT_BATCH(GEN7_3DPRIM_INSTANCE_COUNT);
+      OUT_BATCH(prim->num_instances);
+      OUT_BATCH(GEN7_3DPRIM_START_VERTEX);
+      OUT_BATCH(0);
+      OUT_BATCH(GEN7_3DPRIM_BASE_VERTEX);
+      OUT_BATCH(0);
+      OUT_BATCH(GEN7_3DPRIM_START_INSTANCE);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   } else if (prim->is_indirect) {
       struct gl_buffer_object *indirect_buffer = brw->ctx.DrawIndirectBuffer;
       drm_intel_bo *bo = intel_bufferobj_buffer(brw,
             intel_buffer_object(indirect_buffer),
@@ -382,6 +402,8 @@ brw_try_draw_prims(struct gl_context *ctx,
                    const struct _mesa_index_buffer *ib,
                    GLuint min_index,
                    GLuint max_index,
+                   struct brw_transform_feedback_object *xfb_obj,
+                   unsigned stream,
                    struct gl_buffer_object *indirect)
 {
    struct brw_context *brw = brw_context(ctx);
@@ -531,7 +553,7 @@ retry:
 	 brw_upload_render_state(brw);
       }
 
-      brw_emit_prim(brw, &prims[i], brw->primitive);
+      brw_emit_prim(brw, &prims[i], brw->primitive, xfb_obj, stream);
 
       brw->no_batch_wrap = false;
 
@@ -573,14 +595,14 @@ brw_draw_prims(struct gl_context *ctx,
                GLboolean index_bounds_valid,
                GLuint min_index,
                GLuint max_index,
-               struct gl_transform_feedback_object *unused_tfb_object,
+               struct gl_transform_feedback_object *gl_xfb_obj,
                unsigned stream,
                struct gl_buffer_object *indirect)
 {
    struct brw_context *brw = brw_context(ctx);
    const struct gl_client_array **arrays = ctx->Array._DrawArrays;
-
-   assert(unused_tfb_object == NULL);
+   struct brw_transform_feedback_object *xfb_obj =
+      (struct brw_transform_feedback_object *) gl_xfb_obj;
 
    if (!brw_check_conditional_render(brw))
       return;
@@ -619,7 +641,7 @@ brw_draw_prims(struct gl_context *ctx,
     * to it.
     */
    brw_try_draw_prims(ctx, arrays, prims, nr_prims, ib, min_index, max_index,
-                      indirect);
+                      xfb_obj, stream, indirect);
 }
 
 void
author	Kenneth Graunke <[email protected]>	2016-05-04 23:44:25 -0700
committer	Kenneth Graunke <[email protected]>	2016-05-09 15:00:01 -0700
commit	96d43f2d087e23ab692d43fc48fe1be30e923ae0 (patch)
tree	df3d1532a3993a98e0141003d63e4fc4a06cd2b7 /src/mesa/drivers/dri/i965/brw_draw.c
parent	fdb6c1887f7b61ef49fb89e0b0928f65b2edf29b (diff)