r600/sfn: Add lowering UBO access to r600 specific codes

r600 reads vec4 from the UBO, but the offsets in nir are evaluated to the component. If the offsets are not literal then all non-vec4 reads must resolve the component after reading a vec4 component (TODO: figure out whether there is a consistent way to deduct the component that is actually read). Signed-off-by: Gert Wollny <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3225>
author: Gert Wollny <[email protected]> 2019-12-27 17:49:27 +0100
committer: Marge Bot <[email protected]> 2020-02-10 19:09:08 +0000
commit: 37125b7cc220fd9b77e9882268892ca4e79a0627 (patch)
tree: 9b9a1e443c5d3947315eb2ae0a0038116b2ab665 /src/gallium/drivers/r600
parent: 32d3435a78675ff5ebf933d45b9b99fdc4dc7d82 (diff)
2 files changed, 105 insertions, 2 deletions
diff --git a/src/gallium/drivers/r600/sfn/sfn_nir.cpp b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
index 206435528a6..ff7bee26b4f 100644
--- a/src/gallium/drivers/r600/sfn/sfn_nir.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
@@ -386,12 +386,112 @@ bool r600_lower_scratch_addresses(nir_shader *shader)
    return progress;
 }
 
+static nir_ssa_def *
+r600_lower_ubo_to_align16_impl(nir_builder *b, nir_instr *instr, void *_options)
+{
+   b->cursor = nir_before_instr(instr);
+
+   nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
+   assert(op->intrinsic == nir_intrinsic_load_ubo);
+
+   bool const_address = (nir_src_is_const(op->src[1]) && nir_src_is_const(op->src[0]));
+
+   nir_ssa_def *offset = op->src[1].ssa;
+
+   /* This is ugly: With const addressing we can actually set a proper fetch target mask,
+    * but for this we need the component encoded, we don't shift and do de decoding in the
+    * backend. Otherwise we shift by four and resolve the component here
+    * (TODO: encode the start component in the intrinsic when the offset base is non-constant
+    * but a multiple of 16 */
+
+   nir_ssa_def *new_offset = offset;
+   if (!const_address)
+      new_offset = nir_ishr(b, offset,  nir_imm_int(b, 4));
+
+   nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_r600);
+   load->num_components = const_address ? op->num_components : 4;
+   load->src[0] = op->src[0];
+   load->src[1] = nir_src_for_ssa(new_offset);
+   nir_intrinsic_set_align(load, nir_intrinsic_align(op), nir_intrinsic_align_offset(op));
+
+   nir_ssa_dest_init(&load->instr, &load->dest, load->num_components, 32, NULL);
+   nir_builder_instr_insert(b, &load->instr);
+
+   /* when four components are loaded or both the offset and the location
+    * are constant, then the backend can deal with it better */
+   if (op->num_components == 4 || const_address)
+      return &load->dest.ssa;
+
+   /* What comes below is a performance disaster when the offset is not constant
+    * because then we have to assume that any component can be the first one and we
+    * have to pick the result manually. */
+   nir_ssa_def *first_comp = nir_iand(b, nir_ishr(b, offset,  nir_imm_int(b, 2)),
+                                     nir_imm_int(b,3));
+
+   const unsigned swz_000[4] = {0, 0, 0, 0};
+   nir_ssa_def *component_select = nir_ieq(b, nir_imm_ivec4(b, 0, 1, 2, 3),
+                                           nir_swizzle(b, first_comp, swz_000, 4));
+
+   const unsigned szw_0[1] = {0};
+   const unsigned szw_1[1] = {1};
+   const unsigned szw_2[1] = {2};
+
+   if (op->num_components == 1) {
+      const unsigned szw_3[1] = {3};
+      nir_ssa_def *check0 = nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_0, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_3, 1));
+      nir_ssa_def *check1 = nir_bcsel(b, nir_swizzle(b, component_select, szw_1, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_1, 1),
+                                      check0);
+      return nir_bcsel(b, nir_swizzle(b, component_select, szw_2, 1),
+                       nir_swizzle(b, &load->dest.ssa, szw_2, 1),
+                       check1);
+   } else if (op->num_components == 2) {
+      const unsigned szw_01[2] = {0, 1};
+      const unsigned szw_12[2] = {1, 2};
+      const unsigned szw_23[2] = {2, 3};
+
+      nir_ssa_def *check0 = nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_01, 2),
+                                      nir_swizzle(b, &load->dest.ssa, szw_23, 2));
+      return nir_bcsel(b, nir_swizzle(b, component_select, szw_1, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_12, 2),
+                                      check0);
+   } else {
+      const unsigned szw_012[3] = {0, 1, 3};
+      const unsigned szw_123[3] = {1, 2, 3};
+      return nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+                       nir_swizzle(b, &load->dest.ssa, szw_012, 3),
+                       nir_swizzle(b, &load->dest.ssa, szw_123, 3));
+   }
+}
+
+bool r600_lower_ubo_to_align16_filter(const nir_instr *instr, const void *_options)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
+   return op->intrinsic == nir_intrinsic_load_ubo;
+}
+
+
+bool r600_lower_ubo_to_align16(nir_shader *shader)
+{
+   return nir_shader_lower_instructions(shader,
+                                        r600_lower_ubo_to_align16_filter,
+                                        r600_lower_ubo_to_align16_impl,
+                                        nullptr);
+}
+
 }
 
 using r600::r600_nir_lower_int_tg4;
 using r600::r600_nir_lower_pack_unpack_2x16;
 using r600::r600_lower_scratch_addresses;
 using r600::r600_lower_fs_out_to_vector;
+using r600::r600_lower_ubo_to_align16;
 
 int
 r600_glsl_type_size(const struct glsl_type *type, bool is_bindless)
@@ -512,7 +612,10 @@ int r600_shader_from_nir(struct r600_context *rctx,
    const nir_function *func = reinterpret_cast<const nir_function *>(exec_list_get_head_const(&sel->nir->functions));
    bool optimize = func->impl->registers.length() == 0 && !has_saturate(func);
 
-
+   if (optimize) {
+      optimize_once(sel->nir);
+      NIR_PASS_V(sel->nir, r600_lower_ubo_to_align16);
+   }
    /* It seems the output of this optimization is cached somewhere, and
     * when there are registers, then we can no longer copy propagate, so
     * skip the optimization then. (There is probably a better way, but yeah)
diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp b/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
index 96c8e804e2b..0ee6bbd6313 100644
--- a/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
@@ -447,7 +447,7 @@ bool ShaderFromNirProcessor::emit_intrinsic_instruction(nir_intrinsic_instr* ins
    case nir_intrinsic_discard:
    case nir_intrinsic_discard_if:
       return emit_discard_if(instr);
-   case nir_intrinsic_load_ubo:
+   case nir_intrinsic_load_ubo_r600:
       return emit_load_ubo(instr);
    case nir_intrinsic_copy_deref:
    case nir_intrinsic_load_constant:
author	Gert Wollny <[email protected]>	2019-12-27 17:49:27 +0100
committer	Marge Bot <[email protected]>	2020-02-10 19:09:08 +0000
commit	37125b7cc220fd9b77e9882268892ca4e79a0627 (patch)
tree	9b9a1e443c5d3947315eb2ae0a0038116b2ab665 /src/gallium/drivers/r600
parent	32d3435a78675ff5ebf933d45b9b99fdc4dc7d82 (diff)