aboutsummaryrefslogtreecommitdiffstats
path: root/src/intel
diff options
context:
space:
mode:
authorJason Ekstrand <[email protected]>2019-11-25 21:55:51 -0600
committerJason Ekstrand <[email protected]>2019-12-05 10:59:10 -0600
commit46af0ecc1d1f060786a1c2dfede1f936b407fbf6 (patch)
tree3f67744e5239b8086c572f6ebf3fc2a9771c0500 /src/intel
parent1b5cb92b623119243fb668bc70fd970e86d5fd58 (diff)
anv: Use PIPE_CONTROL flushes to implement the gen8 VF cache WA
Reviewed-by: Lionel Landwerlin <[email protected]>
Diffstat (limited to 'src/intel')
-rw-r--r--src/intel/vulkan/anv_device.c14
-rw-r--r--src/intel/vulkan/anv_genX.h8
-rw-r--r--src/intel/vulkan/anv_private.h26
-rw-r--r--src/intel/vulkan/genX_blorp_exec.c33
-rw-r--r--src/intel/vulkan/genX_cmd_buffer.c180
-rw-r--r--src/intel/vulkan/genX_gpu_memcpy.c4
6 files changed, 245 insertions, 20 deletions
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 2c48003b276..be4d23356e3 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -141,8 +141,12 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
}
}
+ /* We only allow 48-bit addresses with softpin because knowing the actual
+ * address is required for the vertex cache flush workaround.
+ */
device->supports_48bit_addresses = (device->info.gen >= 8) &&
- gtt_size > (4ULL << 30 /* GiB */);
+ device->has_softpin &&
+ gtt_size > (4ULL << 30 /* GiB */);
uint64_t heap_size = anv_compute_heap_size(fd, gtt_size);
@@ -471,10 +475,6 @@ anv_physical_device_init(struct anv_physical_device *device,
goto fail;
}
- result = anv_physical_device_init_heaps(device, fd);
- if (result != VK_SUCCESS)
- goto fail;
-
device->has_softpin = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN);
device->has_exec_async = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_ASYNC);
device->has_exec_capture = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_CAPTURE);
@@ -484,6 +484,10 @@ anv_physical_device_init(struct anv_physical_device *device,
anv_gem_supports_syncobj_wait(fd);
device->has_context_priority = anv_gem_has_context_priority(fd);
+ result = anv_physical_device_init_heaps(device, fd);
+ if (result != VK_SUCCESS)
+ goto fail;
+
device->use_softpin = device->has_softpin &&
device->supports_48bit_addresses;
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index 0274fe8b3a8..8c2a0e40099 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -44,6 +44,14 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer);
+void genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
+ int vb_index,
+ struct anv_address vb_address,
+ uint32_t vb_size);
+void genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t access_type,
+ uint64_t vb_used);
+
void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
unsigned width, unsigned height,
unsigned scale);
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index fd6f0fdb104..2abbb866b2f 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2503,6 +2503,27 @@ struct anv_attachment_state {
struct anv_image_view * image_view;
};
+/** State tracking for vertex buffer flushes
+ *
+ * On Gen8-9, the VF cache only considers the bottom 32 bits of memory
+ * addresses. If you happen to have two vertex buffers which get placed
+ * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
+ * collisions. In order to solve this problem, we track vertex address ranges
+ * which are live in the cache and invalidate the cache if the span of one
+ * ever exceeds 4 GiB (i.e. can no longer be told apart using 32 bits).
+ *
+ * A range is the half-open interval [start, end). The tracking code resets
+ * ranges to all-zero, so a range with end == start holds no addresses.
+ */
+struct anv_vb_cache_range {
+ /* Virtual address at which the live vertex buffer cache range starts for
+ * this vertex buffer index. Inclusive.
+ */
+ uint64_t start;
+
+ /* Virtual address of the byte after where vertex buffer cache range ends.
+ * This is exclusive such that end - start is the size of the range in
+ * bytes.
+ */
+ uint64_t end;
+};
+
/** State tracking for particular pipeline bind point
*
* This struct is the base struct for anv_cmd_graphics_state and
@@ -2531,6 +2552,11 @@ struct anv_cmd_graphics_state {
anv_cmd_dirty_mask_t dirty;
uint32_t vb_dirty;
+ struct anv_vb_cache_range ib_bound_range;
+ struct anv_vb_cache_range ib_dirty_range;
+ struct anv_vb_cache_range vb_bound_ranges[33];
+ struct anv_vb_cache_range vb_dirty_ranges[33];
+
struct anv_dynamic_state dynamic;
struct {
diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
index 79e18d95282..302acb54461 100644
--- a/src/intel/vulkan/genX_blorp_exec.c
+++ b/src/intel/vulkan/genX_blorp_exec.c
@@ -139,19 +139,6 @@ blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
struct blorp_address *addr)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
-
- /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
- *
- * "The VF cache needs to be invalidated before binding and then using
- * Vertex Buffers that overlap with any previously bound Vertex Buffer
- * (at a 64B granularity) since the last invalidation. A VF cache
- * invalidate is performed by setting the "VF Cache Invalidation Enable"
- * bit in PIPE_CONTROL."
- *
- * This restriction first appears in the Skylake PRM but the internal docs
- * also list it as being an issue on Broadwell. In order to avoid this
- * problem, we align all vertex buffer allocations to 64 bytes.
- */
struct anv_state vb_state =
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 64);
@@ -170,9 +157,25 @@ blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
uint32_t *sizes,
unsigned num_vbs)
{
- /* anv forces all vertex buffers into the low 4GB so there are never any
- * transitions that require a VF invalidation.
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+
+ for (unsigned i = 0; i < num_vbs; i++) {
+ struct anv_address anv_addr = {
+ .bo = addrs[i].buffer,
+ .offset = addrs[i].offset,
+ };
+ genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer,
+ i, anv_addr, sizes[i]);
+ }
+
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ /* Technically, we should call this *after* 3DPRIMITIVE but it doesn't
+ * really matter for blorp because we never call apply_pipe_flushes after
+ * this point.
*/
+ genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, SEQUENTIAL,
+ (1 << num_vbs) - 1);
}
#if GEN_GEN >= 8
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 677377ea302..c764011eb5f 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -1392,6 +1392,10 @@ genX(BeginCommandBuffer)(
* executing anything. The chances are fairly high that they will use
* blorp at least once per primary command buffer so it shouldn't be
* wasted.
+ *
+ * There is also a workaround on gen8-9 which requires us to invalidate the
+ * VF cache occasionally. It's easier if we can assume we start with a
+ * fresh cache (see also genX(cmd_buffer_set_binding_for_gen8_vb_flush)()).
*/
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
@@ -1598,6 +1602,14 @@ genX(CmdExecuteCommands)(
anv_cmd_buffer_add_secondary(primary, secondary);
}
+ /* The secondary isn't counted in our VF cache tracking so we need to
+ * invalidate the whole thing.
+ */
+ if (GEN_GEN >= 8 && GEN_GEN <= 9) {
+ primary->state.pending_pipe_bits |=
+ ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+ }
+
/* The secondary may have selected a different pipeline (3D or compute) and
* may have changed the current L3$ configuration. Reset our tracking
* variables to invalid values to ensure that we re-emit these in the case
@@ -1836,6 +1848,18 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
}
+ if ((GEN_GEN >= 8 && GEN_GEN <= 9) &&
+ (bits & ANV_PIPE_CS_STALL_BIT) &&
+ (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
+ /* If we are doing a VF cache invalidate AND a CS stall (it must be
+ * both) then we can reset our vertex cache tracking.
+ */
+ memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
+ sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
+ memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
+ sizeof(cmd_buffer->state.gfx.ib_dirty_range));
+ }
+
if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
#if GEN_GEN >= 12
@@ -2830,6 +2854,12 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
#endif
};
+#if GEN_GEN >= 8 && GEN_GEN <= 9
+ genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, vb,
+ state.BufferStartingAddress,
+ state.BufferSize);
+#endif
+
GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
i++;
}
@@ -2967,6 +2997,9 @@ emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
.EndAddress = anv_address_add(addr, size),
#endif
});
+
+ genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer,
+ index, addr, size);
}
static void
@@ -3014,6 +3047,25 @@ emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
}
+/* Record, after a 3DPRIMITIVE, which vertex buffers the draw may have pulled
+ * into the VF cache.
+ *
+ * The set of used buffers is the pipeline's vb_used mask plus the internal
+ * buffers at ANV_SVGS_VB_INDEX (when the vertex shader uses firstvertex or
+ * baseinstance) and ANV_DRAWID_VB_INDEX (when it uses drawid).
+ *
+ * access_type is the vertex access type of the draw (SEQUENTIAL or RANDOM)
+ * and is forwarded unchanged; the callee decides from it whether an index
+ * buffer was used.
+ */
+static void
+update_dirty_vbs_for_gen8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
+                                   uint32_t access_type)
+{
+   struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline;
+   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+
+   uint64_t vb_used = pipeline->vb_used;
+   if (vs_prog_data->uses_firstvertex ||
+       vs_prog_data->uses_baseinstance)
+      vb_used |= 1ull << ANV_SVGS_VB_INDEX;
+   if (vs_prog_data->uses_drawid)
+      vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
+
+   /* Pass the access type itself, not the result of comparing it against
+    * RANDOM: the callee performs that comparison, and handing it a boolean
+    * only works as long as RANDOM happens to be encoded as 1.
+    */
+   genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer,
+                                                       access_type,
+                                                       vb_used);
+}
+
void genX(CmdDraw)(
VkCommandBuffer commandBuffer,
uint32_t vertexCount,
@@ -3059,6 +3111,8 @@ void genX(CmdDraw)(
prim.StartInstanceLocation = firstInstance;
prim.BaseVertexLocation = 0;
}
+
+ update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
}
void genX(CmdDrawIndexed)(
@@ -3107,6 +3161,8 @@ void genX(CmdDrawIndexed)(
prim.StartInstanceLocation = firstInstance;
prim.BaseVertexLocation = vertexOffset;
}
+
+ update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM);
}
/* Auto-Draw / Indirect Registers */
@@ -3179,6 +3235,8 @@ void genX(CmdDrawIndirectByteCountEXT)(
prim.VertexAccessType = SEQUENTIAL;
prim.PrimitiveTopologyType = pipeline->topology;
}
+
+ update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */
}
@@ -3263,6 +3321,8 @@ void genX(CmdDrawIndirect)(
prim.PrimitiveTopologyType = pipeline->topology;
}
+ update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
+
offset += stride;
}
}
@@ -3311,6 +3371,8 @@ void genX(CmdDrawIndexedIndirect)(
prim.PrimitiveTopologyType = pipeline->topology;
}
+ update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM);
+
offset += stride;
}
}
@@ -3465,6 +3527,8 @@ void genX(CmdDrawIndirectCountKHR)(
prim.PrimitiveTopologyType = pipeline->topology;
}
+ update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
+
offset += stride;
}
}
@@ -3530,6 +3594,8 @@ void genX(CmdDrawIndexedIndirectCountKHR)(
prim.PrimitiveTopologyType = pipeline->topology;
}
+ update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM);
+
offset += stride;
}
}
@@ -4115,6 +4181,120 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
}
}
+/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
+ *
+ *    "The VF cache needs to be invalidated before binding and then using
+ *    Vertex Buffers that overlap with any previously bound Vertex Buffer
+ *    (at a 64B granularity) since the last invalidation.  A VF cache
+ *    invalidate is performed by setting the "VF Cache Invalidation Enable"
+ *    bit in PIPE_CONTROL."
+ *
+ * This is implemented by carefully tracking all vertex and index buffer
+ * bindings and flushing if the cache ever ends up with a range in the cache
+ * that would exceed 4 GiB.  This is implemented in three parts:
+ *
+ *    1. genX(cmd_buffer_set_binding_for_gen8_vb_flush)() which must be called
+ *       every time a vertex or index buffer binding is emitted and informs
+ *       the tracking code of the new binding.  If this new binding would
+ *       cause the cache to have a too-large range on the next draw call, a
+ *       pipeline stall and VF cache invalidate are added to
+ *       pending_pipe_bits.
+ *
+ *    2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
+ *       empty whenever we emit a VF invalidate together with a CS stall.
+ *
+ *    3. genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)() must be called
+ *       after every 3DPRIMITIVE and copies the bound range into the dirty
+ *       range for each used buffer.  This has to be a separate step because
+ *       we don't always re-bind all buffers and so 1. can't know which
+ *       buffers are actually bound.
+ *
+ * Parameters:
+ *    vb_index    -1 selects the index buffer tracking slot; any other value
+ *                must be a valid index into vb_bound_ranges/vb_dirty_ranges.
+ *    vb_address  address of the binding; when vb_size is non-zero its BO
+ *                must be pinned (EXEC_OBJECT_PINNED).
+ *    vb_size     size of the binding in bytes; 0 marks the slot as unbound.
+ *
+ * This is a no-op outside gen8-9 or when softpin is not in use.
+ */
+void
+genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
+                                               int vb_index,
+                                               struct anv_address vb_address,
+                                               uint32_t vb_size)
+{
+   if (GEN_GEN < 8 || GEN_GEN > 9 ||
+       !cmd_buffer->device->instance->physicalDevice.use_softpin)
+      return;
+
+   struct anv_vb_cache_range *bound, *dirty;
+   if (vb_index == -1) {
+      bound = &cmd_buffer->state.gfx.ib_bound_range;
+      dirty = &cmd_buffer->state.gfx.ib_dirty_range;
+   } else {
+      assert(vb_index >= 0);
+      assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
+      assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
+      bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
+      dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
+   }
+
+   /* An empty binding contributes nothing to the cache; record it as the
+    * empty range and leave the dirty range alone.
+    */
+   if (vb_size == 0) {
+      bound->start = 0;
+      bound->end = 0;
+      return;
+   }
+
+   assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED));
+   bound->start = gen_48b_address(anv_address_physical(vb_address));
+   bound->end = bound->start + vb_size;
+   assert(bound->end > bound->start); /* No overflow */
+
+   /* Align everything to a cache line */
+   bound->start &= ~(64ull - 1ull);
+   bound->end = align_u64(bound->end, 64);
+
+   /* Compute the dirty range.  The tracking is reset to all-zero after an
+    * invalidate, so start == end means "nothing live in the cache" and the
+    * new dirty range is exactly the bound range.  Taking MIN2() against the
+    * zeroed start instead would stretch the range down to address 0 and
+    * request a flush for every buffer that ends above 4 GiB.
+    */
+   if (dirty->start == dirty->end) {
+      *dirty = *bound;
+   } else {
+      dirty->start = MIN2(dirty->start, bound->start);
+      dirty->end = MAX2(dirty->end, bound->end);
+   }
+
+   /* If our range is larger than 32 bits, we have to flush */
+   assert(bound->end - bound->start <= (1ull << 32));
+   if (dirty->end - dirty->start > (1ull << 32)) {
+      cmd_buffer->state.pending_pipe_bits |=
+         ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+   }
+}
+
+/* Grow *dirty so that it also covers *bound.
+ *
+ * Ranges are half-open [start, end).  A bound range with end <= start is
+ * unbound and is ignored.  A dirty range with start == end is empty (this is
+ * the all-zero state the tracking is reset to), in which case the result is
+ * simply the bound range rather than a range stretched down to address 0.
+ */
+static void
+gen8_vb_flush_merge_dirty_range(struct anv_vb_cache_range *dirty,
+                                const struct anv_vb_cache_range *bound)
+{
+   if (bound->end <= bound->start)
+      return;
+
+   if (dirty->start == dirty->end) {
+      *dirty = *bound;
+      return;
+   }
+
+   dirty->start = MIN2(dirty->start, bound->start);
+   dirty->end = MAX2(dirty->end, bound->end);
+}
+
+/* Fold the currently bound buffer ranges into the dirty (live in the VF
+ * cache) ranges.  Must be called after every 3DPRIMITIVE.
+ *
+ * Parameters:
+ *    access_type  vertex access type of the draw; RANDOM means the draw was
+ *                 indexed, so the index buffer range is folded in as well.
+ *    vb_used      bitmask of vertex buffer indices used by the draw; bit i
+ *                 selects vb_bound_ranges[i] / vb_dirty_ranges[i].
+ *
+ * This is a no-op outside gen8-9 or when softpin is not in use.
+ */
+void
+genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
+                                                    uint32_t access_type,
+                                                    uint64_t vb_used)
+{
+   if (GEN_GEN < 8 || GEN_GEN > 9 ||
+       !cmd_buffer->device->instance->physicalDevice.use_softpin)
+      return;
+
+   if (access_type == RANDOM) {
+      /* We have an index buffer */
+      gen8_vb_flush_merge_dirty_range(&cmd_buffer->state.gfx.ib_dirty_range,
+                                      &cmd_buffer->state.gfx.ib_bound_range);
+   }
+
+   uint64_t mask = vb_used;
+   while (mask) {
+      int i = u_bit_scan64(&mask);
+      assert(i >= 0);
+      assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
+      assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
+
+      gen8_vb_flush_merge_dirty_range(&cmd_buffer->state.gfx.vb_dirty_ranges[i],
+                                      &cmd_buffer->state.gfx.vb_bound_ranges[i]);
+   }
+}
+
/**
* Update the pixel hashing modes that determine the balancing of PS threads
* across subslices and slices.
diff --git a/src/intel/vulkan/genX_gpu_memcpy.c b/src/intel/vulkan/genX_gpu_memcpy.c
index 5af7085393e..28de5def12d 100644
--- a/src/intel/vulkan/genX_gpu_memcpy.c
+++ b/src/intel/vulkan/genX_gpu_memcpy.c
@@ -78,6 +78,7 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
}
+ genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, 32, src, size);
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
genX(flush_pipeline_select_3d)(cmd_buffer);
@@ -229,5 +230,8 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
prim.BaseVertexLocation = 0;
}
+ genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, SEQUENTIAL,
+ 1ull << 32);
+
cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;
}