intel/fs: general 8/16/32/64-bit shuffle_src_to_dst function

This new function takes care of shuffle/unshuffle components of a particular bit-size in components with a different bit-size. If source type size is smaller than destination type size the operation needed is a component shuffle. The opposite case would be an unshuffle. Component units are measured in terms of the smaller type between source and destination. As we are un/shuffling the smaller components from/into a bigger one. The operation allows to skip first_component number of components from the source. Shuffle MOVs are retyped using integer types avoiding problems with denorms and float types if source and destination bitsize is different. This allows to simplify uses of shuffle functions that are dealing with these retypes individually. Now there is a new restriction so source and destination can not overlap anymore when calling this shuffle function. Following patches that migrate to use this new function will take care individually of avoiding source and destination overlaps. v2: (Jason Ekstrand) - Rewrite overlap asserts. - Manage type_sz(src.type) == type_sz(dst.type) case using MOVs from source to dest. This works for 64-bit to 64-bits operation that on Gen7 as it doesn't support Q registers. - Explain that components units are based in the smallest type. v3: - Fix unshuffle overlap assert (Jason Ekstrand) Reviewed-by: Jason Ekstrand <[email protected]>
author: Jose Maria Casanova Crespo <[email protected]> 2018-06-09 11:45:01 +0200
committer: Jose Maria Casanova Crespo <[email protected]> 2018-06-16 22:39:08 +0200
commit: a5665056e55fbdc9fc493fcaa9787406561fb4b1 (patch)
tree: ef11650ad46b8c7f4cc3961bcb5f4a45fe84cd87
parent: d882331f7ab2be567ce6641230e730ec74507222 (diff)
1 files changed, 101 insertions, 0 deletions
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 166da0aa6d7..d91faf135ef 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -5362,6 +5362,107 @@ shuffle_16bit_data_for_32bit_write(const fs_builder &bld,
    }
 }
 
+/*
+ * This helper takes a source register and un/shuffles it into the destination
+ * register.
+ *
+ * If source type size is smaller than destination type size the operation
+ * needed is a component shuffle. The opposite case would be an unshuffle. If
+ * source/destination type size is equal a shuffle is done that would be
+ * equivalent to a simple MOV.
+ *
+ * For example, if source is a 16-bit type and destination is 32-bit. A 3
+ * components .xyz 16-bit vector on SIMD8 would be.
+ *
+ *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
+ *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
+ *
+ * This helper will return the following 2 32-bit components with the 16-bit
+ * values shuffled:
+ *
+ *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
+ *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
+ *
+ * For unshuffle, the example would be the opposite, a 64-bit type source
+ * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8
+ * would be:
+ *
+ *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
+ *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
+ *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
+ *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
+ *
+ * The returned result would be the following 4 32-bit components unshuffled:
+ *
+ *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
+ *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
+ *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
+ *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
+ *
+ * - Source and destination register must not be overlapped.
+ * - components units are measured in terms of the smaller type between
+ *   source and destination because we are un/shuffling the smaller
+ *   components from/into the bigger ones.
+ * - first_component parameter allows skipping source components.
+ */
+void
+shuffle_src_to_dst(const fs_builder &bld,
+                   const fs_reg &dst,
+                   const fs_reg &src,
+                   uint32_t first_component,
+                   uint32_t components)
+{
+   if (type_sz(src.type) == type_sz(dst.type)) {
+      assert(!regions_overlap(dst,
+         type_sz(dst.type) * bld.dispatch_width() * components,
+         offset(src, bld, first_component),
+         type_sz(src.type) * bld.dispatch_width() * components));
+      for (unsigned i = 0; i < components; i++) {
+         bld.MOV(retype(offset(dst, bld, i), src.type),
+                 offset(src, bld, i + first_component));
+      }
+   } else if (type_sz(src.type) < type_sz(dst.type)) {
+      /* Source is shuffled into destination */
+      unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
+      assert(!regions_overlap(dst,
+         type_sz(dst.type) * bld.dispatch_width() *
+         DIV_ROUND_UP(components, size_ratio),
+         offset(src, bld, first_component),
+         type_sz(src.type) * bld.dispatch_width() * components));
+
+      brw_reg_type shuffle_type =
+         brw_reg_type_from_bit_size(8 * type_sz(src.type),
+                                    BRW_REGISTER_TYPE_D);
+      for (unsigned i = 0; i < components; i++) {
+         fs_reg shuffle_component_i =
+            subscript(offset(dst, bld, i / size_ratio),
+                      shuffle_type, i % size_ratio);
+         bld.MOV(shuffle_component_i,
+                 retype(offset(src, bld, i + first_component), shuffle_type));
+      }
+   } else {
+      /* Source is unshuffled into destination */
+      unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
+      assert(!regions_overlap(dst,
+         type_sz(dst.type) * bld.dispatch_width() * components,
+         offset(src, bld, first_component / size_ratio),
+         type_sz(src.type) * bld.dispatch_width() *
+         DIV_ROUND_UP(components + (first_component % size_ratio),
+                      size_ratio)));
+
+      brw_reg_type shuffle_type =
+         brw_reg_type_from_bit_size(8 * type_sz(dst.type),
+                                    BRW_REGISTER_TYPE_D);
+      for (unsigned i = 0; i < components; i++) {
+         fs_reg shuffle_component_i =
+            subscript(offset(src, bld, (first_component + i) / size_ratio),
+                      shuffle_type, (first_component + i) % size_ratio);
+         bld.MOV(retype(offset(dst, bld, i), shuffle_type),
+                 shuffle_component_i);
+      }
+   }
+}
+
 fs_reg
 setup_imm_df(const fs_builder &bld, double v)
 {
author	Jose Maria Casanova Crespo <[email protected]>	2018-06-09 11:45:01 +0200
committer	Jose Maria Casanova Crespo <[email protected]>	2018-06-16 22:39:08 +0200
commit	a5665056e55fbdc9fc493fcaa9787406561fb4b1 (patch)
tree	ef11650ad46b8c7f4cc3961bcb5f4a45fe84cd87
parent	d882331f7ab2be567ce6641230e730ec74507222 (diff)