i965: Emit VF cache invalidates for 48-bit addressing bugs with softpin.

We'd like to start using soft-pin to assign BO addresses up front, and never move them again. Our previous plan for dealing with 48-bit VF cache bugs was to relocate vertex buffers to the low 4GB, so we'd never have addresses that alias in the low 32 bits. But that requires moving buffers dynamically. This patch tracks the last seen BO address for each vertex/index buffer, and emits a VF cache invalidate if the high bits change. (Ideally, we won't hit this case very often.) This should work for the soft-pin case, but unfortunately won't work in the relocation case, as we don't actually know the addresses. So, we have to use both methods. v2: Mention that the cache uses a <VertexBufferIndex, Address> tuple more explicitly (suggested by Scott). Mention "single batch" too (suggested by Chris). Reviewed-by: Scott D Phillips <[email protected]>
author: Kenneth Graunke <[email protected]> 2018-04-09 15:39:56 -0700
committer: Kenneth Graunke <[email protected]> 2018-05-22 10:02:28 -0700
commit: 92f01fc5f914fd500497d0c3aed75f3ac8dc054d (patch)
tree: 4f770b126dd1bf91399ff4bb0e982304bc6a3798 /src/mesa
parent: c7259259d4c0df9ba339f4927891c855c7f91924 (diff)
2 files changed, 69 insertions, 0 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 0844400bc53..773f104824d 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -966,6 +966,9 @@ struct brw_context
        * These bitfields indicate which workarounds are needed.
        */
       uint8_t attrib_wa_flags[VERT_ATTRIB_MAX];
+
+      /* High bits of the last seen vertex buffer address (for workarounds). */
+      uint16_t last_bo_high_bits[33];
    } vb;
 
    struct {
@@ -986,6 +989,9 @@ struct brw_context
        * referencing the same index buffer.
        */
       unsigned int start_vertex_offset;
+
+      /* High bits of the last seen index buffer address (for workarounds). */
+      uint16_t last_bo_high_bits;
    } ib;
 
    /* Active vertex program:
diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c
index 6178bfa3f88..4f44b9965e6 100644
--- a/src/mesa/drivers/dri/i965/genX_state_upload.c
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@@ -480,6 +480,65 @@ upload_format_size(uint32_t upload_format)
    }
 }
 
+static UNUSED uint16_t
+pinned_bo_high_bits(struct brw_bo *bo)
+{
+   return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0;
+}
+
+/* The VF cache designers apparently cut corners, and made the cache key's
+ * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits
+ * of the address.  If you happen to have two vertex buffers which get placed
+ * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
+ * collisions.  (These collisions can happen within a single batch.)
+ *
+ * In the soft-pin world, we'd like to assign addresses up front, and never
+ * move buffers.  So, we need to do a VF cache invalidate if the buffer for
+ * a particular VB slot has different [48:32] address bits than the last one.
+ *
+ * In the relocation world, we have no idea what the addresses will be, so
+ * we can't apply this workaround.  Instead, we tell the kernel to move it
+ * to the low 4GB regardless.
+ */
+static void
+vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw)
+{
+#if GEN_GEN >= 8
+   bool need_invalidate = true;
+   unsigned i;
+
+   for (i = 0; i < brw->vb.nr_buffers; i++) {
+      uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo);
+
+      if (high_bits != brw->vb.last_bo_high_bits[i]) {
+         need_invalidate = true;
+         brw->vb.last_bo_high_bits[i] = high_bits;
+      }
+   }
+
+   /* Don't bother with draw parameter buffers - those are generated by
+    * the driver so we can select a consistent memory zone.
+    */
+
+   if (need_invalidate) {
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+   }
+#endif
+}
+
+static void
+vf_invalidate_for_ib_48bit_transition(struct brw_context *brw)
+{
+#if GEN_GEN >= 8
+   uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo);
+
+   if (high_bits != brw->ib.last_bo_high_bits) {
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+      brw->ib.last_bo_high_bits = high_bits;
+   }
+#endif
+}
+
 static void
 genX(emit_vertices)(struct brw_context *brw)
 {
@@ -594,6 +653,8 @@ genX(emit_vertices)(struct brw_context *brw)
    const unsigned nr_buffers = brw->vb.nr_buffers +
       uses_draw_params + uses_derived_draw_params;
 
+   vf_invalidate_for_vb_48bit_transitions(brw);
+
    if (nr_buffers) {
       assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
 
@@ -886,6 +947,8 @@ genX(emit_index_buffer)(struct brw_context *brw)
    if (index_buffer == NULL)
       return;
 
+   vf_invalidate_for_ib_48bit_transition(brw);
+
    brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
 #if GEN_GEN < 8 && !GEN_IS_HASWELL
       ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
author	Kenneth Graunke <[email protected]>	2018-04-09 15:39:56 -0700
committer	Kenneth Graunke <[email protected]>	2018-05-22 10:02:28 -0700
commit	92f01fc5f914fd500497d0c3aed75f3ac8dc054d (patch)
tree	4f770b126dd1bf91399ff4bb0e982304bc6a3798 /src/mesa
parent	c7259259d4c0df9ba339f4927891c855c7f91924 (diff)