aboutsummaryrefslogtreecommitdiffstats
path: root/src/gallium
diff options
context:
space:
mode:
author Marek Olšák <[email protected]> 2020-06-01 15:56:12 -0400
committer Marek Olšák <[email protected]> 2020-06-09 00:45:26 -0400
commit 90c34aed1d2f814ff8baca87b338d250257ae1d0 (patch)
tree 3fd1f6e285860e3ef2c62067e851c45f849d5825 /src/gallium
parent 88e8f1a38d838753542461cea56d1c1b1a5cfc5d (diff)
gallium/u_vbuf: add a faster path for uploading non-interleaved attribs
+1% higher FPS in torcs.

Reviewed-by: Alyssa Rosenzweig <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5304>
Diffstat (limited to 'src/gallium')
-rw-r--r-- src/gallium/auxiliary/util/u_vbuf.c 117
1 files changed, 83 insertions, 34 deletions
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index 999fcb80135..7e2631c2e86 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -131,6 +131,9 @@ struct u_vbuf_elements {
* non-instanced. */
uint32_t noninstance_vb_mask_any;
+ /* Which buffers are used by multiple vertex attribs. */
+ uint32_t interleaved_vb_mask;
+
void *driver_cso;
};
@@ -802,6 +805,9 @@ u_vbuf_create_vertex_elements(struct u_vbuf *mgr, unsigned count,
ve->src_format_size[i] = util_format_get_blocksize(format);
+ if (used_buffers & vb_index_bit)
+ ve->interleaved_vb_mask |= vb_index_bit;
+
used_buffers |= vb_index_bit;
if (!ve->ve[i].instance_divisor) {
@@ -955,6 +961,49 @@ void u_vbuf_set_vertex_buffers(struct u_vbuf *mgr,
mgr->dirty_real_vb_mask |= ~mask;
}
+static ALWAYS_INLINE bool /* false: this attrib needs no upload; true: *offset and *size were written */
+get_upload_offset_size(struct u_vbuf *mgr,
+ const struct pipe_vertex_buffer *vb,
+ struct u_vbuf_elements *ve,
+ const struct pipe_vertex_element *velem,
+ unsigned vb_index, unsigned velem_index,
+ int start_vertex, unsigned num_vertices,
+ int start_instance, unsigned num_instances,
+ unsigned *offset, unsigned *size) /* out: start offset and size of the range read from the user buffer */
+{
+ /* Skip the buffers generated by translate and any buffer that is not a user buffer. NOTE(review): "1 << vb_index" is a signed shift; confirm vb_index can never reach 31. */
+ if ((1 << vb_index) & mgr->fallback_vbs_mask || !vb->is_user_buffer)
+ return false;
+
+ unsigned instance_div = velem->instance_divisor;
+ *offset = vb->buffer_offset + velem->src_offset; /* base offset; the strided cases below advance it to the first element used */
+
+ if (!vb->stride) {
+ /* Constant attrib (zero stride): the range covers exactly one element. */
+ *size = ve->src_format_size[velem_index];
+ } else if (instance_div) {
+ /* Per-instance attrib: the range starts at start_instance and covers "count" elements. */
+
+ /* Figure out how many instances we'll render given instance_div, i.e.
+ * num_instances / instance_div rounded up. We can't use the typical
+ * div_round_up() pattern because the CTS uses instance_div = ~0 for a
+ * test, which overflows div_round_up()'s addition.
+ * NOTE(review): num_instances == 0 gives count == 0, so "count - 1" below wraps; confirm callers exclude it. */
+ unsigned count = num_instances / instance_div;
+ if (count * instance_div != num_instances)
+ count++;
+
+ *offset += vb->stride * start_instance;
+ *size = vb->stride * (count - 1) + ve->src_format_size[velem_index]; /* (count - 1) full strides plus the last element itself */
+ } else {
+ /* Per-vertex attrib: the range starts at start_vertex. NOTE(review): num_vertices == 0 would wrap "num_vertices - 1"; confirm callers exclude it. */
+ *offset += vb->stride * start_vertex;
+ *size = vb->stride * (num_vertices - 1) + ve->src_format_size[velem_index]; /* (num_vertices - 1) full strides plus the last element itself */
+ }
+ return true;
+}
+
+
static enum pipe_error
u_vbuf_upload_buffers(struct u_vbuf *mgr,
int start_vertex, unsigned num_vertices,
@@ -965,51 +1014,51 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr,
unsigned nr_velems = ve->count;
const struct pipe_vertex_element *velems =
mgr->using_translate ? mgr->fallback_velems.velems : ve->ve;
+
+ /* Faster path when no vertex attribs are interleaved. */
+ if ((ve->interleaved_vb_mask & mgr->user_vb_mask) == 0) {
+ for (i = 0; i < nr_velems; i++) {
+ const struct pipe_vertex_element *velem = &velems[i];
+ unsigned index = velem->vertex_buffer_index;
+ struct pipe_vertex_buffer *vb = &mgr->vertex_buffer[index];
+ unsigned offset, size;
+
+ if (!get_upload_offset_size(mgr, vb, ve, velem, index, i, start_vertex,
+ num_vertices, start_instance, num_instances,
+ &offset, &size))
+ continue;
+
+ struct pipe_vertex_buffer *real_vb = &mgr->real_vertex_buffer[index];
+ const uint8_t *ptr = mgr->vertex_buffer[index].buffer.user;
+
+ u_upload_data(mgr->pipe->stream_uploader,
+ mgr->has_signed_vb_offset ? 0 : offset,
+ size, 4, ptr + offset, &real_vb->buffer_offset,
+ &real_vb->buffer.resource);
+ if (!real_vb->buffer.resource)
+ return PIPE_ERROR_OUT_OF_MEMORY;
+
+ real_vb->buffer_offset -= offset;
+ }
+ return PIPE_OK;
+ }
+
unsigned start_offset[PIPE_MAX_ATTRIBS];
unsigned end_offset[PIPE_MAX_ATTRIBS];
uint32_t buffer_mask = 0;
+ /* Slower path supporting interleaved vertex attribs using 2 loops. */
/* Determine how much data needs to be uploaded. */
for (i = 0; i < nr_velems; i++) {
const struct pipe_vertex_element *velem = &velems[i];
unsigned index = velem->vertex_buffer_index;
struct pipe_vertex_buffer *vb = &mgr->vertex_buffer[index];
- unsigned instance_div, first, size, index_bit;
+ unsigned first, size, index_bit;
- /* Skip the buffers generated by translate. */
- if ((1 << index) & mgr->fallback_vbs_mask) {
+ if (!get_upload_offset_size(mgr, vb, ve, velem, index, i, start_vertex,
+ num_vertices, start_instance, num_instances,
+ &first, &size))
continue;
- }
-
- if (!vb->is_user_buffer) {
- continue;
- }
-
- instance_div = velem->instance_divisor;
- first = vb->buffer_offset + velem->src_offset;
-
- if (!vb->stride) {
- /* Constant attrib. */
- size = ve->src_format_size[i];
- } else if (instance_div) {
- /* Per-instance attrib. */
-
- /* Figure out how many instances we'll render given instance_div. We
- * can't use the typical div_round_up() pattern because the CTS uses
- * instance_div = ~0 for a test, which overflows div_round_up()'s
- * addition.
- */
- unsigned count = num_instances / instance_div;
- if (count * instance_div != num_instances)
- count++;
-
- first += vb->stride * start_instance;
- size = vb->stride * (count - 1) + ve->src_format_size[i];
- } else {
- /* Per-vertex attrib. */
- first += vb->stride * start_vertex;
- size = vb->stride * (num_vertices - 1) + ve->src_format_size[i];
- }
index_bit = 1 << index;