i965/fs: Set up gen7 UBO loads as sends from GRFs.

This gives the instruction scheduler a chance to schedule between the loads, whereas before it was restricted by the dependencies between the MRFs used to set them up. For one shader in gles3conform, compilation goes from being stuck in register allocation for as long as anybody bothered to leave it running down to 23 seconds, thanks to the LIFO scheduling. Acked-by: Kenneth Graunke <[email protected]>
author: Eric Anholt <[email protected]> 2012-12-05 00:06:30 -0800
committer: Eric Anholt <[email protected]> 2012-12-14 15:18:05 -0800
commit: 461a29783a28e579a9a5a236e5f47ffb6d18a328 (patch)
tree: 989e37ae57e2a330c76070ab5d76d0fc8aeed0cc /src
parent: 456dbcc3377ee23dbeffa4da02a4d80a8519bb62 (diff)
5 files changed, 114 insertions, 7 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 40571a4d54d..ab206d1920f 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -676,10 +676,12 @@ enum opcode {
    FS_OPCODE_SPILL,
    FS_OPCODE_UNSPILL,
    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+   FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7,
    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
    FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
    FS_OPCODE_DISCARD_JUMP,
+   FS_OPCODE_SET_GLOBAL_OFFSET,
 
    VS_OPCODE_URB_WRITE,
    VS_OPCODE_SCRATCH_READ,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 9a18410ac5f..83128117328 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -330,7 +330,9 @@ fs_inst::is_math()
 bool
 fs_inst::is_send_from_grf()
 {
-   return opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
+   /* Both gen7 pull constant loads send their payload from a GRF. */
+   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
+           opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7);
 }
 
 bool
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index b75314cd665..87257123f27 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -529,6 +529,10 @@ private:
    void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
                                             struct brw_reg index,
                                             struct brw_reg offset);
+   void generate_uniform_pull_constant_load_gen7(fs_inst *inst,
+                                                 struct brw_reg dst,
+                                                 struct brw_reg surf_index,
+                                                 struct brw_reg offset);
    void generate_varying_pull_constant_load(fs_inst *inst, struct brw_reg dst,
                                             struct brw_reg index);
    void generate_varying_pull_constant_load_gen7(fs_inst *inst,
@@ -536,6 +540,10 @@ private:
                                                  struct brw_reg index,
                                                  struct brw_reg offset);
    void generate_mov_dispatch_to_flags(fs_inst *inst);
+   void generate_set_global_offset(fs_inst *inst,
+                                   struct brw_reg dst,
+                                   struct brw_reg src,
+                                   struct brw_reg offset);
    void generate_discard_jump(fs_inst *inst);
 
    void patch_discard_jumps_to_fb_writes();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index 9a891414e62..63f09fe7941 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -666,6 +666,44 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
 }
 
 void
+fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
+                                                       struct brw_reg dst,
+                                                       struct brw_reg index,
+                                                       struct brw_reg offset)
+{
+   /* Gen7+ only: the header lives in the GRF 'offset' and is sent from
+    * there directly, so no MRF is involved (hence mlen == 0, no base_mrf).
+    */
+   assert(inst->mlen == 0);
+   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
+
+   assert(index.file == BRW_IMMEDIATE_VALUE &&
+	  index.type == BRW_REGISTER_TYPE_UD);
+   uint32_t surf_index = index.dw1.ud;
+
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_pop_insn_state(p);
+
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, offset);
+
+   uint32_t msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
+   uint32_t msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ;
+   bool header_present = true;
+   brw_set_dp_read_message(p, send,
+                           surf_index,
+                           msg_control,
+                           msg_type,
+                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
+                           1,
+                           header_present,
+                           1);
+}
+
+void
 fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
                                                   struct brw_reg dst,
                                                   struct brw_reg index)
@@ -852,6 +890,35 @@ brw_reg_from_fs_reg(fs_reg *reg)
    return brw_reg;
 }
 
+/**
+ * Sets one dword of a vgrf for gen7+ message setup.
+ *
+ * NOTE(review): the MOV below targets brw_vec1_reg(dst.file, dst.nr, 2),
+ * presumably dword 2 counting from zero (the third dword) -- confirm.
+ * With gen7 messages set up in VGRFs we must patch that dword explicitly,
+ * where in the MRF world we'd have just used brw_message_reg().  It is not
+ * baked into the send's code generation so that it can still be scheduled.
+ */
+void
+fs_generator::generate_set_global_offset(fs_inst *inst,
+                                         struct brw_reg dst,
+                                         struct brw_reg src,
+                                         struct brw_reg value)
+{
+   /* src must be the same register as dst: modelling this as a
+    * read-modify-write tells optimization passes that the rest of the
+    * register survives, rather than being completely overwritten.
+    */
+   assert(src.file == dst.file && src.nr == dst.nr);
+   assert(value.file == BRW_IMMEDIATE_VALUE);
+
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 2), value.type), value);
+   brw_pop_insn_state(p);
+}
+
 void
 fs_generator::generate_code(exec_list *instructions)
 {
@@ -1127,6 +1194,10 @@ fs_generator::generate_code(exec_list *instructions)
 	 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
 	 break;
 
+      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+	 generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+	 break;
+
       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 	 generate_varying_pull_constant_load(inst, dst, src[0]);
 	 break;
@@ -1151,6 +1222,10 @@ fs_generator::generate_code(exec_list *instructions)
          brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
          break;
 
+      case FS_OPCODE_SET_GLOBAL_OFFSET:
+         generate_set_global_offset(inst, dst, src[0], src[1]);
+         break;
+
       default:
 	 if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
 	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index ccf905ebc62..6a39f98509e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -581,12 +581,32 @@ fs_visitor::visit(ir_expression *ir)
       if (const_offset) {
          fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
          packed_consts.type = result.type;
-         fs_inst *pull = emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                                      packed_consts,
-                                      surf_index,
-                                      fs_reg(const_offset->value.u[0])));
-         pull->base_mrf = 14;
-         pull->mlen = 1;
+
+         if (intel->gen >= 7) {
+            fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] / 16);
+            fs_reg payload = fs_reg(this, glsl_type::uint_type);
+            struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
+                                       BRW_REGISTER_TYPE_UD);
+            fs_inst *setup = emit(MOV(payload, fs_reg(g0)));
+            setup->force_writemask_all = true;
+            /* We don't need the second half of this vgrf to be filled with g1
+             * in the 16-wide case, but if we use force_uncompressed then live
+             * variable analysis won't consider this a def!
+             */
+
+            emit(FS_OPCODE_SET_GLOBAL_OFFSET, payload,
+                 payload, const_offset_reg);
+            emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7, packed_consts,
+                 surf_index, payload);
+         } else {
+            fs_reg const_offset_reg = fs_reg(const_offset->value.u[0]);
+            fs_inst *pull = emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                                         packed_consts,
+                                         surf_index,
+                                         const_offset_reg));
+            pull->base_mrf = 14;
+            pull->mlen = 1;
+         }
 
          packed_consts.smear = const_offset->value.u[0] % 16 / 4;
          for (int i = 0; i < ir->type->vector_elements; i++) {
author	Eric Anholt <[email protected]>	2012-12-05 00:06:30 -0800
committer	Eric Anholt <[email protected]>	2012-12-14 15:18:05 -0800
commit	461a29783a28e579a9a5a236e5f47ffb6d18a328 (patch)
tree	989e37ae57e2a330c76070ab5d76d0fc8aeed0cc /src
parent	456dbcc3377ee23dbeffa4da02a4d80a8519bb62 (diff)