draw: improve vertex fetch (v2)

The per-element fetch performs quite a few calculations that are constant per shader invocation. These can be moved outside both the per-element loop and the main shader loop (llvm can mostly figure out on its own that they are constant, but doing so can have a significant compile-time cost). Similarly, it looks easier to swap the fetch loops: the outer loop now runs per attrib and the inner loop fills in the per-vertex elements. This way the aos->soa conversion can also be done per attrib rather than only at the end, though again this doesn't really make much of a difference in the generated code. (This would also make it possible to vectorize the calculations leading up to the fetches.) There's also a minimal change that slightly simplifies the overflow math (the format block size is now subtracted from the buffer size once up front, instead of being added to the offset on every fetch). All in all, the generated code seems to look slightly simpler (depending on the actual vs), but more importantly I've seen a significant reduction in compile times for some vs (albeit with an old (3.3) llvm version, and the time reduction really only applies to the optimization passes run on the IR). v2: adapt to other draw change. No changes with piglit. Reviewed-by: Jose Fonseca <[email protected]>
author: Roland Scheidegger <[email protected]> 2016-10-15 03:53:48 +0200
committer: Roland Scheidegger <[email protected]> 2016-10-19 01:44:59 +0200
commit: aeceec54a86d26aad165a1ade67a8aba61ae080f (patch)
tree: 9d193becaaf3f48ae023ca881ae504baf5617d03 /src/gallium
parent: 0942fe548e935ccc849f44bd920649ef2b93a6a5 (diff)
3 files changed, 134 insertions, 86 deletions
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 3b5685626bb..2f82d9dee84 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -659,85 +659,42 @@ generate_vs(struct draw_llvm_variant *variant,
 static void
 generate_fetch(struct gallivm_state *gallivm,
                struct draw_context *draw,
-               LLVMValueRef vbuffers_ptr,
+               const struct util_format_description *format_desc,
+               LLVMValueRef vb_stride,
+               LLVMValueRef stride_fixed,
+               LLVMValueRef map_ptr,
+               LLVMValueRef buffer_size_adj,
+               LLVMValueRef ofbit,
                LLVMValueRef *res,
-               struct pipe_vertex_element *velem,
-               LLVMValueRef vbuf,
-               LLVMValueRef index,
-               LLVMValueRef instance_id,
-               LLVMValueRef start_instance)
+               LLVMValueRef index)
 {
-   const struct util_format_description *format_desc =
-      util_format_description(velem->src_format);
    LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
    LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef indices =
-      LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
-                   velem->vertex_buffer_index, 0);
-   LLVMValueRef vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr,
-                                           &indices, 1, "");
-   LLVMValueRef vb_stride = draw_jit_vbuffer_stride(gallivm, vbuf);
-   LLVMValueRef vb_buffer_offset = draw_jit_vbuffer_offset(gallivm, vbuf);
-   LLVMValueRef map_ptr = draw_jit_dvbuffer_map(gallivm, vbuffer_ptr);
-   LLVMValueRef buffer_size = draw_jit_dvbuffer_size(gallivm, vbuffer_ptr);
    LLVMValueRef stride;
    LLVMValueRef buffer_overflowed;
-   LLVMValueRef needed_buffer_size;
    LLVMValueRef temp_ptr =
       lp_build_alloca(gallivm,
                       lp_build_vec_type(gallivm, lp_float32_vec4_type()), "");
-   LLVMValueRef ofbit = NULL;
    struct lp_build_if_state if_ctx;
 
-   if (velem->src_format == PIPE_FORMAT_NONE) {
+   if (format_desc->format == PIPE_FORMAT_NONE) {
       *res = lp_build_const_vec(gallivm, lp_float32_vec4_type(), 0);
       return;
    }
 
-   if (velem->instance_divisor) {
-      /* Index is equal to the start instance plus the number of current 
-       * instance divided by the divisor. In this case we compute it as:
-       * index = start_instance + (instance_id  / divisor)
-       */
-      LLVMValueRef current_instance;
-      current_instance = LLVMBuildUDiv(builder, instance_id,
-                                       lp_build_const_int32(gallivm, velem->instance_divisor),
-                                       "instance_divisor");
-      index = lp_build_uadd_overflow(gallivm, start_instance,
-                                     current_instance, &ofbit);
-   }
-
    stride = lp_build_umul_overflow(gallivm, vb_stride, index, &ofbit);
-   stride = lp_build_uadd_overflow(gallivm, stride, vb_buffer_offset, &ofbit);
-   stride = lp_build_uadd_overflow(
-      gallivm, stride,
-      lp_build_const_int32(gallivm, velem->src_offset), &ofbit);
-   needed_buffer_size = lp_build_uadd_overflow(
-      gallivm, stride,
-      lp_build_const_int32(gallivm,
-                           util_format_get_blocksize(velem->src_format)),
-      &ofbit);
+   stride = lp_build_uadd_overflow(gallivm, stride, stride_fixed, &ofbit);
 
    buffer_overflowed = LLVMBuildICmp(builder, LLVMIntUGT,
-                                     needed_buffer_size, buffer_size,
+                                     stride, buffer_size_adj,
                                      "buffer_overflowed");
    buffer_overflowed = LLVMBuildOr(builder, buffer_overflowed, ofbit, "");
-#if 0
-   lp_build_printf(gallivm, "vbuf index = %u, vb_stride is %u\n",
-                   index, vb_stride);
-   lp_build_printf(gallivm, "   vb_buffer_offset = %u, src_offset is %u\n",
-                   vb_buffer_offset,
-                   lp_build_const_int32(gallivm, velem->src_offset));
-   lp_build_print_value(gallivm, "   blocksize = ",
-                        lp_build_const_int32(
-                           gallivm,
-                           util_format_get_blocksize(velem->src_format)));
-   lp_build_printf(gallivm, "   instance_id = %u\n", instance_id);
-   lp_build_printf(gallivm, "   stride = %u\n", stride);
-   lp_build_printf(gallivm, "   buffer size = %u\n", buffer_size);
-   lp_build_printf(gallivm, "   needed_buffer_size = %u\n", needed_buffer_size);
-   lp_build_print_value(gallivm, "   buffer overflowed = ", buffer_overflowed);
-#endif
+
+   if (0) {
+      lp_build_printf(gallivm, "   stride = %u\n", stride);
+      lp_build_printf(gallivm, "   buffer size adj = %u\n", buffer_size_adj);
+      lp_build_print_value(gallivm, "   buffer overflowed = ", buffer_overflowed);
+   }
 
    lp_build_if(&if_ctx, gallivm, buffer_overflowed);
    {
@@ -766,36 +723,34 @@ generate_fetch(struct gallivm_state *gallivm,
 
 static void
 convert_to_soa(struct gallivm_state *gallivm,
-               LLVMValueRef (*src_aos)[LP_MAX_VECTOR_WIDTH / 32],
+               LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
                LLVMValueRef (*dst_soa)[TGSI_NUM_CHANNELS],
-               unsigned num_attribs, const struct lp_type soa_type)
+               unsigned attrib, const struct lp_type soa_type)
 {
-   unsigned i, j, k;
+   unsigned j, k;
    struct lp_type aos_channel_type = soa_type;
 
+   LLVMValueRef aos_channels[TGSI_NUM_CHANNELS];
+   unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS;
+
    debug_assert(TGSI_NUM_CHANNELS == 4);
    debug_assert((soa_type.length % TGSI_NUM_CHANNELS) == 0);
 
    aos_channel_type.length >>= 1;
 
-   for (i = 0; i < num_attribs; ++i) {
-      LLVMValueRef aos_channels[TGSI_NUM_CHANNELS];
-      unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS;
-
-      for (j = 0; j < TGSI_NUM_CHANNELS; ++j) {
-         LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
+   for (j = 0; j < TGSI_NUM_CHANNELS; ++j) {
+      LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
 
-         assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
-
-         for (k = 0; k < pixels_per_channel; ++k) {
-            channel[k] = src_aos[i][j + TGSI_NUM_CHANNELS * k];
-         }
+      assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
 
-         aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
+      for (k = 0; k < pixels_per_channel; ++k) {
+         channel[k] = src_aos[j + TGSI_NUM_CHANNELS * k];
       }
 
-      lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa[i]);
+      aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
    }
+
+   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa[attrib]);
 }
 
 
@@ -1549,6 +1504,13 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
    LLVMValueRef io_ptr, vbuffers_ptr, vb_ptr;
    LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
    LLVMValueRef one = lp_build_const_int32(gallivm, 1);
+   LLVMValueRef vb_stride[PIPE_MAX_SHADER_INPUTS];
+   LLVMValueRef map_ptr[PIPE_MAX_SHADER_INPUTS];
+   LLVMValueRef buffer_size_adj[PIPE_MAX_SHADER_INPUTS];
+   LLVMValueRef stride_fixed[PIPE_MAX_SHADER_INPUTS];
+   LLVMValueRef ofbit[PIPE_MAX_SHADER_INPUTS];
+   LLVMValueRef instance_index[PIPE_MAX_SHADER_INPUTS];
+
    struct draw_context *draw = llvm->draw;
    const struct tgsi_shader_info *vs_info = &draw->vs.vertex_shader->info;
    unsigned i, j;
@@ -1693,14 +1655,67 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
 
    fetch_max = LLVMBuildSub(builder, end, one, "fetch_max");
 
+   /*
+    * Pre-calculate everything which is constant per shader invocation.
+    */
+   for (j = 0; j < draw->pt.nr_vertex_elements; ++j) {
+      LLVMValueRef vb_buffer_offset, buffer_size;
+      LLVMValueRef vb_info, vbuffer_ptr;
+      struct pipe_vertex_element *velem = &draw->pt.vertex_element[j];
+      LLVMValueRef vb_index =
+         lp_build_const_int32(gallivm, velem->vertex_buffer_index);
+      LLVMValueRef bsize = lp_build_const_int32(gallivm,
+                                                util_format_get_blocksize(velem->src_format));
+      LLVMValueRef src_offset = lp_build_const_int32(gallivm,
+                                                     velem->src_offset);
+
+      vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr, &vb_index, 1, "");
+      vb_info = LLVMBuildGEP(builder, vb_ptr, &vb_index, 1, "");
+      vb_stride[j] = draw_jit_vbuffer_stride(gallivm, vb_info);
+      vb_buffer_offset = draw_jit_vbuffer_offset(gallivm, vb_info);
+      map_ptr[j] = draw_jit_dvbuffer_map(gallivm, vbuffer_ptr);
+      buffer_size = draw_jit_dvbuffer_size(gallivm, vbuffer_ptr);
+
+      ofbit[j] = NULL;
+      stride_fixed[j] = lp_build_uadd_overflow(gallivm, vb_buffer_offset,
+                                               src_offset, &ofbit[j]);
+      buffer_size_adj[j] = lp_build_usub_overflow(gallivm, buffer_size, bsize,
+                                                   &ofbit[j]);
+
+      if (velem->instance_divisor) {
+         /* Index is equal to the start instance plus the number of current 
+          * instance divided by the divisor. In this case we compute it as:
+          * index = start_instance + (instance_id  / divisor)
+          */
+         LLVMValueRef current_instance;
+         current_instance = LLVMBuildUDiv(builder, system_values.instance_id,
+                                          lp_build_const_int32(gallivm,
+                                                               velem->instance_divisor),
+                                          "instance_divisor");
+         instance_index[j] = lp_build_uadd_overflow(gallivm, start_instance,
+                                                    current_instance, &ofbit[j]);
+      }
+
+      if (0) {
+         lp_build_printf(gallivm, "vbuf index = %u, vb_stride is %u\n",
+                         vb_index, vb_stride[j]);
+         lp_build_printf(gallivm, "   vb_buffer_offset = %u, src_offset is %u\n",
+                         vb_buffer_offset, src_offset);
+         lp_build_print_value(gallivm, "   blocksize = ", bsize);
+         lp_build_printf(gallivm, "   instance_id = %u\n", system_values.instance_id);
+         lp_build_printf(gallivm, "   buffer size = %u\n", buffer_size);
+      }
+   }
+
    lp_build_loop_begin(&lp_loop, gallivm, zero);
    {
       LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
-      LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][LP_MAX_VECTOR_WIDTH / 32] = { { 0 } };
+      LLVMValueRef aos_attribs[LP_MAX_VECTOR_WIDTH / 32] = { 0 };
       LLVMValueRef io;
       LLVMValueRef clipmask;   /* holds the clipmask value */
       LLVMValueRef true_index_array = lp_build_zero(gallivm,
                                                     lp_type_uint_vec(32, 32*vector_length));
+      LLVMValueRef true_indices[LP_MAX_VECTOR_WIDTH / 32];
       const LLVMValueRef (*ptr_aos)[TGSI_NUM_CHANNELS];
 
       io_itr = lp_loop.counter;
@@ -1760,22 +1775,25 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
             lp_build_endif(&if_ctx);
             true_index = LLVMBuildLoad(builder, index_ptr, "true_index");
          }
+         true_indices[i] = true_index;
          true_index_array = LLVMBuildInsertElement(
             gallivm->builder, true_index_array, true_index,
             lp_build_const_int32(gallivm, i), "");
+      }
+
+      for (j = 0; j < key->nr_vertex_elements; ++j) {
+         struct pipe_vertex_element *velem = &draw->pt.vertex_element[j];
+         const struct util_format_description *format_desc =
+            util_format_description(velem->src_format);
 
-         for (j = 0; j < key->nr_vertex_elements; ++j) {
-            struct pipe_vertex_element *velem = &draw->pt.vertex_element[j];
-            LLVMValueRef vb_index =
-               lp_build_const_int32(gallivm, velem->vertex_buffer_index);
-            LLVMValueRef vb = LLVMBuildGEP(builder, vb_ptr, &vb_index, 1, "");
-            generate_fetch(gallivm, draw, vbuffers_ptr,
-                           &aos_attribs[j][i], velem, vb, true_index,
-                           system_values.instance_id, start_instance);
+         for (i = 0; i < vector_length; ++i) {
+            generate_fetch(gallivm, draw, format_desc,
+                           vb_stride[j], stride_fixed[j], map_ptr[j],
+                           buffer_size_adj[j], ofbit[j], &aos_attribs[i],
+                           velem->instance_divisor ? instance_index[j] : true_indices[i]);
          }
+         convert_to_soa(gallivm, aos_attribs, inputs, j, vs_type);
       }
-      convert_to_soa(gallivm, aos_attribs, inputs,
-                     key->nr_vertex_elements, vs_type);
 
       /* In the paths with elts vertex id has to be unaffected by the
        * index bias and because indices inside our elements array have
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.c b/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.c
index 91247fdbc95..152ad573451 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.c
@@ -127,6 +127,30 @@ lp_build_uadd_overflow(struct gallivm_state *gallivm,
 }
 
 /**
+ * Performs unsigned subtraction of two integers and reports 
+ * overflow if detected.
+ *
+ * The values @a and @b must be of the same integer type. If
+ * an overflow is detected the IN/OUT @ofbit parameter is used:
+ * - if it's pointing to a null value, the overflow bit is simply
+ *   stored inside the variable it's pointing to,
+ * - if it's pointing to a valid value, then that variable,
+ *   which must be of i1 type, is ORed with the newly detected
+ *   overflow bit. This is done to allow chaining of a number of
+ *   overflow functions together without having to test the 
+ *   overflow bit after every single one.
+ */
+LLVMValueRef
+lp_build_usub_overflow(struct gallivm_state *gallivm,
+                       LLVMValueRef a,
+                       LLVMValueRef b,
+                       LLVMValueRef *ofbit)
+{
+   return build_binary_int_overflow(gallivm, "llvm.usub.with.overflow",
+                                    a, b, ofbit);
+}
+
+/**
  * Performs unsigned multiplication of  two integers and 
  * reports overflow if detected.
  *
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.h b/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.h
index 8c35a04181b..34ce00ee0e8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.h
@@ -49,6 +49,12 @@ lp_build_uadd_overflow(struct gallivm_state *gallivm,
                        LLVMValueRef *ofbit);
 
 LLVMValueRef
+lp_build_usub_overflow(struct gallivm_state *gallivm,
+                       LLVMValueRef a,
+                       LLVMValueRef b,
+                       LLVMValueRef *ofbit);
+
+LLVMValueRef
 lp_build_umul_overflow(struct gallivm_state *gallivm,
                        LLVMValueRef a,
                        LLVMValueRef b,
author	Roland Scheidegger <[email protected]>	2016-10-15 03:53:48 +0200
committer	Roland Scheidegger <[email protected]>	2016-10-19 01:44:59 +0200
commit	aeceec54a86d26aad165a1ade67a8aba61ae080f (patch)
tree	9d193becaaf3f48ae023ca881ae504baf5617d03 /src/gallium
parent	0942fe548e935ccc849f44bd920649ef2b93a6a5 (diff)