2 files changed, 76 insertions, 0 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index c4a0004dbed..58b03265c34 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -328,6 +328,11 @@ public:
 
    src_reg setup_imm_df(double v);
 
+   vec4_instruction *shuffle_64bit_data(dst_reg dst, src_reg src,
+                                        bool for_write,
+                                        bblock_t *block = NULL,
+                                        vec4_instruction *ref = NULL);
+
    virtual void emit_nir_code();
    virtual void nir_setup_uniforms();
    virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index fe82ed8f15b..14d0546c5c0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -2224,4 +2224,75 @@ vec4_visitor::nir_emit_undef(nir_ssa_undef_instr *instr)
       dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(instr->def.bit_size, 32)));
 }
 
+/* SIMD4x2 64bit data is stored in register space like this:
+ *
+ * r0.0:DF  x0 y0 z0 w0
+ * r1.0:DF  x1 y1 z1 w1
+ *
+ * When we need to write data such as this to memory using 32-bit write
+ * messages we need to shuffle it in this fashion:
+ *
+ * r0.0:DF  x0 y0 x1 y1 (to be written at base offset)
+ * r0.0:DF  z0 w0 z1 w1 (to be written at base offset + 16)
+ *
+ * We need to do the inverse operation when we read using 32-bit messages,
+ * which we can do by applying the same exact shuffling on the 64-bit data
+ * read, only that because the data for each vertex is positioned differently
+ * we need to apply different channel enables.
+ *
+ * This function takes 64bit data and shuffles it as explained above.
+ *
+ * The @for_write parameter is used to specify if the shuffling is being done
+ * for proper SIMD4x2 64-bit data that needs to be shuffled prior to a 32-bit
+ * write message (for_write = true), or instead we are doing the inverse
+ * operation and we have just read 64-bit data using a 32-bit messages that we
+ * need to shuffle to create valid SIMD4x2 64-bit data (for_write = false).
+ *
+ * If @block and @ref are non-NULL, then the shuffling is done after @ref,
+ * otherwise the instructions are emitted normally at the end. The function
+ * returns the last instruction inserted.
+ *
+ * Notice that @src and @dst cannot be the same register.
+ */
+vec4_instruction *
+vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write,
+                                 bblock_t *block, vec4_instruction *ref)
+{
+   assert(type_sz(src.type) == 8);
+   assert(type_sz(dst.type) == 8);
+   assert(!regions_overlap(dst, 2 * REG_SIZE, src, 2 * REG_SIZE));
+   assert(!ref == !block);
+
+   const vec4_builder bld = !ref ? vec4_builder(this).at_end() :
+                                   vec4_builder(this).at(block, ref->next);
+
+   /* Resolve swizzle in src */
+   vec4_instruction *inst;
+   if (src.swizzle != BRW_SWIZZLE_XYZW) {
+      dst_reg data = dst_reg(this, glsl_type::dvec4_type);
+      inst = bld.MOV(data, src);
+      src = src_reg(data);
+   }
+
+   /* dst+0.XY = src+0.XY */
+   inst = bld.group(4, 0).MOV(writemask(dst, WRITEMASK_XY), src);
+
+   /* dst+0.ZW = src+1.XY */
+   inst = bld.group(4, for_write ? 1 : 0)
+             .MOV(writemask(dst, WRITEMASK_ZW),
+                  swizzle(byte_offset(src, REG_SIZE), BRW_SWIZZLE_XYXY));
+
+   /* dst+1.XY = src+0.ZW */
+   inst = bld.group(4, for_write ? 0 : 1)
+            .MOV(writemask(byte_offset(dst, REG_SIZE), WRITEMASK_XY),
+                 swizzle(src, BRW_SWIZZLE_ZWZW));
+
+   /* dst+1.ZW = src+1.ZW */
+   inst = bld.group(4, 1)
+             .MOV(writemask(byte_offset(dst, REG_SIZE), WRITEMASK_ZW),
+                 byte_offset(src, REG_SIZE));
+
+   return inst;
+}
+
 }