summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIago Toral Quiroga <[email protected]>2015-08-13 15:36:05 -0700
committerSamuel Iglesias Gonsálvez <[email protected]>2017-01-03 11:26:50 +0100
commit558f27953101c438747c3e9d3c3f98ce21e79007 (patch)
tree87eb24c967dbd74cea33e126bee73faaa4c3b296
parent2d6eee3144ce16b39909522be466bdb3871f4c1b (diff)
i965/vec4: add double/float conversion pseudo-opcodes
These need to be emitted as align1 MOVs, since they need to have a stride of 2 on the float register (whether src or dest) so that data from another thread doesn't cross the middle of a SIMD8 register. v2 (Iago): - The float-to-double needs to align 32-bit data to 64-bit before doing the conversion. This was doable in align16 when we tried to use an execsize of 4, but with an execsize of 8 we would need another align1 opcode to do that (since we need data to cross the middle of a SIMD register). Just making the opcode handle this internally seems more practical than adding another opcode just for this purpose and having the caller know about this before converting. - The double-to-float conversion produces 32-bit elements aligned to 64-bit so we make the opcode re-pack the result to 32-bit and fit in one register, as expected by SIMD4x2 operation. This still requires that callers reserve two registers for the float data destination because we need to produce 64-bit aligned data first, and repack it later on the same destination register, but it saves the need for a re-pack opcode only to achieve this, making the operation complete in a single opcode. Hopefully that is worth the weirdness of the double register allocation... Signed-off-by: Connor Abbott <[email protected]> Signed-off-by: Iago Toral Quiroga <[email protected]> Reviewed-by: Ian Romanick <[email protected]> Reviewed-by: Matt Turner <[email protected]>
-rw-r--r--src/mesa/drivers/dri/i965/brw_defines.h2
-rw-r--r--src/mesa/drivers/dri/i965/brw_shader.cpp4
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4.cpp8
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4_generator.cpp44
4 files changed, 58 insertions, 0 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index a07d307764b..91d9d5225b9 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1098,6 +1098,8 @@ enum opcode {
VEC4_OPCODE_MOV_BYTES,
VEC4_OPCODE_PACK_BYTES,
VEC4_OPCODE_UNPACK_UNIFORM,
+ VEC4_OPCODE_DOUBLE_TO_FLOAT,
+ VEC4_OPCODE_FLOAT_TO_DOUBLE,
FS_OPCODE_DDX_COARSE,
FS_OPCODE_DDX_FINE,
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 9487a3e1ee8..ec321ae1910 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -322,6 +322,10 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
return "pack_bytes";
case VEC4_OPCODE_UNPACK_UNIFORM:
return "unpack_uniform";
+ case VEC4_OPCODE_DOUBLE_TO_FLOAT:
+ return "double_to_float";
+ case VEC4_OPCODE_FLOAT_TO_DOUBLE:
+ return "float_to_double";
case FS_OPCODE_DDX_COARSE:
return "ddx_coarse";
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index bf36cacb0b7..3f3fd6bbcf3 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -253,6 +253,8 @@ vec4_instruction::can_do_writemask(const struct gen_device_info *devinfo)
{
switch (opcode) {
case SHADER_OPCODE_GEN4_SCRATCH_READ:
+ case VEC4_OPCODE_DOUBLE_TO_FLOAT:
+ case VEC4_OPCODE_FLOAT_TO_DOUBLE:
case VS_OPCODE_PULL_CONSTANT_LOAD:
case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
@@ -505,6 +507,12 @@ vec4_visitor::opt_reduce_swizzle()
case BRW_OPCODE_DP2:
swizzle = brw_swizzle_for_size(2);
break;
+
+ case VEC4_OPCODE_FLOAT_TO_DOUBLE:
+ case VEC4_OPCODE_DOUBLE_TO_FLOAT:
+ swizzle = brw_swizzle_for_size(4);
+ break;
+
default:
swizzle = brw_swizzle_for_mask(inst->dst.writemask);
break;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index 496766304c2..7864be1f733 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1909,6 +1909,50 @@ generate_code(struct brw_codegen *p,
break;
}
+ case VEC4_OPCODE_DOUBLE_TO_FLOAT: {
+ assert(src[0].type == BRW_REGISTER_TYPE_DF);
+ assert(dst.type == BRW_REGISTER_TYPE_F);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ dst.hstride = BRW_HORIZONTAL_STRIDE_2;
+ dst.width = BRW_WIDTH_4;
+ src[0].vstride = BRW_VERTICAL_STRIDE_4;
+ src[0].width = BRW_WIDTH_4;
+ brw_MOV(p, dst, src[0]);
+
+ struct brw_reg dst_as_src = dst;
+ dst.hstride = BRW_HORIZONTAL_STRIDE_1;
+ dst.width = BRW_WIDTH_8;
+ brw_MOV(p, dst, dst_as_src);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ break;
+ }
+
+ case VEC4_OPCODE_FLOAT_TO_DOUBLE: {
+ assert(src[0].type == BRW_REGISTER_TYPE_F);
+ assert(dst.type == BRW_REGISTER_TYPE_DF);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ struct brw_reg tmp = retype(dst, src[0].type);
+ tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
+ tmp.width = BRW_WIDTH_4;
+ src[0].vstride = BRW_VERTICAL_STRIDE_4;
+ src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
+ src[0].width = BRW_WIDTH_4;
+ brw_MOV(p, tmp, src[0]);
+
+ tmp.vstride = BRW_VERTICAL_STRIDE_8;
+ tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
+ tmp.width = BRW_WIDTH_4;
+ brw_MOV(p, dst, tmp);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ break;
+ }
+
case VEC4_OPCODE_PACK_BYTES: {
/* Is effectively:
*