diff options
author | Iago Toral Quiroga <[email protected]> | 2016-01-13 10:17:10 +0100 |
---|---|---|
committer | Samuel Iglesias Gonsálvez <[email protected]> | 2016-05-16 09:55:33 +0200 |
commit | b86d4780ed203b2a22afba5f95c73b15165a7259 (patch) | |
tree | bbd7c4ec0a19db69b7b8496abf497dfcb968510a /src/mesa | |
parent | 58f1804c4f38b76c20872d6887b7b5e6029e0454 (diff) |
i965/fs: support doubles with UBO loads
UBO loads with constant offset use the UNIFORM_PULL_CONSTANT_LOAD
instruction, which reads 16 bytes (a vec4) of data from memory. For dvec
types this only provides components x and y. Thus, if we are reading
more than 2 components we need to issue a second load at offset+16 to
read the next 16-byte chunk with components z and w.
UBO loads with non-constant offset emit a load for each component
in the vector (and rely on CSE to eliminate redundant loads), so we only
need to consider the size of the data type when computing the offset
of each element in a vector.
v2 (Sam):
- Adapt the code to use component() (Curro).
v3 (Sam):
- Use type_sz(dest.type) in VARYING_PULL_CONSTANT_LOAD() call (Curro).
- Add asserts to ensure std140 vector alignment rules are followed
(Curro).
Signed-off-by: Samuel Iglesias Gonsálvez <[email protected]>
Reviewed-by: Kenneth Graunke <[email protected]>
Reviewed-by: Francisco Jerez <[email protected]>
Diffstat (limited to 'src/mesa')
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 64 |
1 files changed, 57 insertions, 7 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 17eb82ed56f..0ff197f3bde 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -3366,6 +3366,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr nir->info.num_ubos - 1); } + /* Number of 32-bit slots in the type */ + unsigned type_slots = MAX2(1, type_sz(dest.type) / 4); + nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); if (const_offset == NULL) { fs_reg base_offset = retype(get_nir_src(instr->src[1]), @@ -3373,19 +3376,66 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr for (int i = 0; i < instr->num_components; i++) VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index, - base_offset, i * 4); + base_offset, i * type_sz(dest.type)); } else { + /* Even if we are loading doubles, a pull constant load will load + * a 32-bit vec4, so should only reserve vgrf space for that. If we + * need to load a full dvec4 we will have to emit 2 loads. This is + * similar to demote_pull_constants(), except that in that case we + * see individual accesses to each component of the vector and then + * we let CSE deal with duplicate loads. Here we see a vector access + * and we have to split it if necessary. + */ fs_reg packed_consts = vgrf(glsl_type::float_type); packed_consts.type = dest.type; - struct brw_reg const_offset_reg = brw_imm_ud(const_offset->u32[0] & ~15); - bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts, - surf_index, const_offset_reg); + unsigned const_offset_aligned = const_offset->u32[0] & ~15; + + /* A vec4 only contains half of a dvec4, if we need more than 2 + * components of a dvec4 we will have to issue another load for + * components z and w. 
+ */ + int num_components; + if (type_slots == 1) + num_components = instr->num_components; + else + num_components = MIN2(2, instr->num_components); + + /* The computation of num_components doesn't take into account + * misalignment, which should be okay according to std140 vector + * alignment rules. + */ + assert(const_offset->u32[0] % 16 + + type_sz(dest.type) * num_components <= 16); + + int remaining_components = instr->num_components; + while (remaining_components > 0) { + /* Read the vec4 from a 16-byte aligned offset */ + struct brw_reg const_offset_reg = brw_imm_ud(const_offset_aligned); + bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, + retype(packed_consts, BRW_REGISTER_TYPE_F), + surf_index, const_offset_reg); + + const fs_reg consts = byte_offset(packed_consts, (const_offset->u32[0] % 16)); + unsigned dest_offset = instr->num_components - remaining_components; + + /* XXX: This doesn't update the sub-16B offset across iterations of + * the loop, which should work for std140 vector alignment rules. + */ + assert(dest_offset == 0 || const_offset->u32[0] % 16 == 0); - const fs_reg consts = byte_offset(packed_consts, const_offset->u32[0] % 16); + for (int i = 0; i < num_components; i++) + bld.MOV(offset(dest, bld, i + dest_offset), component(consts, i)); - for (unsigned i = 0; i < instr->num_components; i++) - bld.MOV(offset(dest, bld, i), component(consts, i)); + /* If this is a large enough 64-bit load, we will need to emit + * another message + */ + remaining_components -= num_components; + assert(remaining_components == 0 || + (remaining_components <= 2 && type_slots == 2)); + num_components = remaining_components; + const_offset_aligned += 16; + } } break; } |