Diffstat (limited to 'src/intel/compiler')
-rw-r--r--  src/intel/compiler/brw_eu.h                |   5
-rw-r--r--  src/intel/compiler/brw_eu_defines.h        |   4
-rw-r--r--  src/intel/compiler/brw_eu_emit.c           |  53
-rw-r--r--  src/intel/compiler/brw_fs_generator.cpp    |  20
-rw-r--r--  src/intel/compiler/brw_fs_nir.cpp          | 126
-rw-r--r--  src/intel/compiler/brw_vec4_generator.cpp  |  12
-rw-r--r--  src/intel/compiler/brw_vec4_nir.cpp        |   7
7 files changed, 118 insertions, 109 deletions
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index 96c22ab429a..7ae17dbdd37 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1148,12 +1148,13 @@ brw_untyped_surface_write(struct brw_codegen *p,
                           unsigned num_channels,
                           bool header_present);
 
-unsigned
+void
 brw_memory_fence(struct brw_codegen *p,
                  struct brw_reg dst,
                  struct brw_reg src,
                  enum opcode send_op,
-                 bool stall,
+                 enum brw_message_target sfid,
+                 bool commit_enable,
                  unsigned bti);
 
 void
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 146d3b3f067..742e12c2830 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -454,8 +454,8 @@ enum opcode {
    * Memory fence messages.
    *
    * Source 0: Must be register g0, used as header.
-   * Source 1: Immediate bool to indicate whether or not we need to stall
-   *           until memory transactions prior to the fence are completed.
+   * Source 1: Immediate bool to indicate whether control is returned to the
+   *           thread only after the fence has been honored.
    * Source 2: Immediate byte indicating which memory to fence.  Zero means
    *           global memory; GEN7_BTI_SLM means SLM (for Gen11+ only).
    *
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 8786903dada..12042131999 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -3145,68 +3145,29 @@ brw_set_memory_fence_message(struct brw_codegen *p,
    brw_inst_set_binding_table_index(devinfo, insn, bti);
 }
 
-unsigned
+void
 brw_memory_fence(struct brw_codegen *p,
                  struct brw_reg dst,
                  struct brw_reg src,
                  enum opcode send_op,
-                 bool stall,
+                 enum brw_message_target sfid,
+                 bool commit_enable,
                  unsigned bti)
 {
    const struct gen_device_info *devinfo = p->devinfo;
-   const bool commit_enable = stall ||
-      devinfo->gen >= 10 || /* HSD ES # 1404612949 */
-      (devinfo->gen == 7 && !devinfo->is_haswell);
-   struct brw_inst *insn;
-   unsigned fences = 0;
-
-   brw_push_insn_state(p);
-   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-   brw_set_default_exec_size(p, BRW_EXECUTE_1);
 
    dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
    src = retype(vec1(src), BRW_REGISTER_TYPE_UD);
 
    /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
-   insn = next_insn(p, send_op);
+   struct brw_inst *insn = next_insn(p, send_op);
+   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
+   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
    brw_set_dest(p, insn, dst);
    brw_set_src0(p, insn, src);
-   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
-                                commit_enable, bti);
-   fences++;
-
-   if (devinfo->gen == 7 && !devinfo->is_haswell) {
-      /* IVB does typed surface access through the render cache, so we need to
-       * flush it too.  Use a different register so both flushes can be
-       * pipelined by the hardware.
-       */
-      insn = next_insn(p, send_op);
-      brw_set_dest(p, insn, offset(dst, 1));
-      brw_set_src0(p, insn, src);
-      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
-                                   commit_enable, bti);
-      fences++;
-
-      /* Now write the response of the second message into the response of the
-       * first to trigger a pipeline stall -- This way future render and data
-       * cache messages will be properly ordered with respect to past data and
-       * render cache messages.
-       */
-      brw_MOV(p, dst, offset(dst, 1));
-   }
-
-   if (stall) {
-      brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_DST,
-                                            brw_get_default_swsb(p).sbid));
-
-      brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst);
-   }
-
-   brw_pop_insn_state(p);
-
-   return fences;
+   brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
 }
 
 void
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index b055110ddf8..5825e0770d4 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -2217,13 +2217,19 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
          generate_shader_time_add(inst, src[0], src[1], src[2]);
         break;
 
+      case SHADER_OPCODE_INTERLOCK:
       case SHADER_OPCODE_MEMORY_FENCE: {
          assert(src[1].file == BRW_IMMEDIATE_VALUE);
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         const unsigned sends =
-            brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud,
-                             src[2].ud);
-         send_count += sends;
+
+         const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ?
+            BRW_OPCODE_SENDC : BRW_OPCODE_SEND;
+
+         brw_memory_fence(p, dst, src[0], send_op,
+                          brw_message_target(inst->sfid),
+                          /* commit_enable */ src[1].ud,
+                          /* bti */ src[2].ud);
+         send_count++;
          break;
       }
 
@@ -2257,12 +2263,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
          break;
 
-      case SHADER_OPCODE_INTERLOCK:
-         assert(devinfo->gen >= 9);
-         /* The interlock is basically a memory fence issued via sendc */
-         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false, /* bti */ 0);
-         break;
-
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
          const struct brw_reg mask =
            brw_stage_has_packed_dispatch(devinfo, stage,
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index f7d94e618b4..6f415d5de82 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4232,19 +4232,52 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_memory_barrier_shared:
    case nir_intrinsic_memory_barrier_buffer:
    case nir_intrinsic_memory_barrier_image:
-   case nir_intrinsic_memory_barrier: {
+   case nir_intrinsic_memory_barrier:
+   case nir_intrinsic_begin_invocation_interlock:
+   case nir_intrinsic_end_invocation_interlock: {
       bool l3_fence, slm_fence;
-      if (instr->intrinsic == nir_intrinsic_scoped_memory_barrier) {
+      const enum opcode opcode =
+         instr->intrinsic == nir_intrinsic_begin_invocation_interlock ?
+         SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE;
+
+      switch (instr->intrinsic) {
+      case nir_intrinsic_scoped_memory_barrier: {
          nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
          l3_fence = modes & (nir_var_shader_out |
                              nir_var_mem_ssbo |
                              nir_var_mem_global);
          slm_fence = modes & nir_var_mem_shared;
-      } else {
+         break;
+      }
+
+      case nir_intrinsic_begin_invocation_interlock:
+      case nir_intrinsic_end_invocation_interlock:
+         /* For beginInvocationInterlockARB(), we will generate a memory fence
+          * but with a different opcode so that generator can pick SENDC
+          * instead of SEND.
+          *
+          * For endInvocationInterlockARB(), we need to insert a memory fence which
+          * stalls in the shader until the memory transactions prior to that
+          * fence are complete.  This ensures that the shader does not end before
+          * any writes from its critical section have landed.  Otherwise, you can
+          * end up with a case where the next invocation on that pixel properly
+          * stalls for previous FS invocation on its pixel to complete but
+          * doesn't actually wait for the dataport memory transactions from that
+          * thread to land before submitting its own.
+          *
+          * Handling them here will allow the logic for IVB render cache (see
+          * below) to be reused.
+          */
+         l3_fence = true;
+         slm_fence = false;
+         break;
+
+      default:
          l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
          slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
                      instr->intrinsic == nir_intrinsic_memory_barrier ||
                      instr->intrinsic == nir_intrinsic_memory_barrier_shared;
+         break;
       }
 
       if (stage != MESA_SHADER_COMPUTE)
@@ -4266,6 +4299,12 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          l3_fence = true;
       }
 
+      /* IVB does typed surface access through the render cache, so we need
+       * to flush it too.
+       */
+      const bool needs_render_fence =
+         devinfo->gen == 7 && !devinfo->is_haswell;
+
       /* Be conservative in Gen11+ and always stall in a fence.  Since there
        * are two different fences, and shader might want to synchronize
        * between them.
@@ -4276,23 +4315,57 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        * TODO: When emitting more than one fence, it might help emit all
        * the fences first and then generate the stall moves.
        */
-      const bool stall = devinfo->gen >= 11;
+      const bool stall = devinfo->gen >= 11 || needs_render_fence ||
+         instr->intrinsic == nir_intrinsic_end_invocation_interlock;
+
+      const bool commit_enable = stall ||
+         devinfo->gen >= 10; /* HSD ES # 1404612949 */
 
       const fs_builder ubld = bld.group(8, 0);
-      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
 
       if (l3_fence) {
-         ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
-                   brw_vec8_grf(0, 0), brw_imm_ud(stall),
-                   /* bti */ brw_imm_ud(0))
-            ->size_written = 2 * REG_SIZE;
+         fs_inst *fence =
+            ubld.emit(opcode,
+                      ubld.vgrf(BRW_REGISTER_TYPE_UD),
+                      brw_vec8_grf(0, 0),
+                      brw_imm_ud(commit_enable),
+                      brw_imm_ud(/* bti */ 0));
+         fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+         if (needs_render_fence) {
+            fs_inst *render_fence =
+               ubld.emit(opcode,
+                         ubld.vgrf(BRW_REGISTER_TYPE_UD),
+                         brw_vec8_grf(0, 0),
+                         brw_imm_ud(commit_enable),
+                         brw_imm_ud(/* bti */ 0));
+            render_fence->sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+
+            ubld.exec_all().group(1, 0).emit(
+               FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+               fence->dst, render_fence->dst);
+         } else if (stall) {
+            ubld.exec_all().group(1, 0).emit(
+               FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+               fence->dst);
+         }
       }
 
       if (slm_fence) {
-         ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
-                   brw_vec8_grf(0, 0), brw_imm_ud(stall),
-                   brw_imm_ud(GEN7_BTI_SLM))
-            ->size_written = 2 * REG_SIZE;
+         assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
+         fs_inst *fence =
+            ubld.emit(opcode,
+                      ubld.vgrf(BRW_REGISTER_TYPE_UD),
+                      brw_vec8_grf(0, 0),
+                      brw_imm_ud(commit_enable),
+                      brw_imm_ud(GEN7_BTI_SLM));
+         fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+         if (stall) {
+            ubld.exec_all().group(1, 0).emit(
+               FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+               fence->dst);
+         }
       }
 
       if (!l3_fence && !slm_fence)
@@ -5210,33 +5283,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_begin_invocation_interlock: {
-      const fs_builder ubld = bld.group(8, 0);
-      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-
-      ubld.emit(SHADER_OPCODE_INTERLOCK, tmp, brw_vec8_grf(0, 0))
-         ->size_written = 2 * REG_SIZE;
-      break;
-   }
-
-   case nir_intrinsic_end_invocation_interlock: {
-      /* For endInvocationInterlock(), we need to insert a memory fence which
-       * stalls in the shader until the memory transactions prior to that
-       * fence are complete.  This ensures that the shader does not end before
-       * any writes from its critical section have landed.  Otherwise, you can
-       * end up with a case where the next invocation on that pixel properly
-       * stalls for previous FS invocation on its pixel to complete but
-       * doesn't actually wait for the dataport memory transactions from that
-       * thread to land before submitting its own.
-       */
-      const fs_builder ubld = bld.group(8, 0);
-      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
-                brw_vec8_grf(0, 0), brw_imm_ud(1), brw_imm_ud(0))
-         ->size_written = 2 * REG_SIZE;
-      break;
-   }
-
    default:
       unreachable("unknown intrinsic");
    }
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
index 7c038c97196..be6697cc16c 100644
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -1911,13 +1911,13 @@ generate_code(struct brw_codegen *p,
          send_count++;
          break;
 
-      case SHADER_OPCODE_MEMORY_FENCE: {
-         const unsigned sends =
-            brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false,
-                             /* bti */ 0);
-         send_count += sends;
+      case SHADER_OPCODE_MEMORY_FENCE:
+         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND,
+                          brw_message_target(inst->sfid),
+                          /* commit_enable */ false,
+                          /* bti */ 0);
+         send_count++;
          break;
-      }
 
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
          const struct brw_reg mask =
diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp
index 1baf9e67177..76446adcf54 100644
--- a/src/intel/compiler/brw_vec4_nir.cpp
+++ b/src/intel/compiler/brw_vec4_nir.cpp
@@ -704,9 +704,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    case nir_intrinsic_scoped_memory_barrier: {
       const vec4_builder bld =
         vec4_builder(this).at_end().annotate(current_annotation, base_ir);
-      const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, brw_vec8_grf(0, 0))
-         ->size_written = 2 * REG_SIZE;
+      const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      vec4_instruction *fence =
+         bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, brw_vec8_grf(0, 0));
+      fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
       break;
    }
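
Editor's note: the fence-selection policy above spans several hunks, so here is a standalone C++ sketch that distills it: which caches get fenced per intrinsic, when the extra IVB render-cache fence is added, and how stall and commit_enable are derived. This is illustrative only, not Mesa code; DeviceInfo, Intrinsic, FenceConfig, and pick_fences are hypothetical names, and the compute-stage and scoped-barrier adjustments from the surrounding code are omitted for brevity.

#include <cassert>
#include <cstdio>

// Hypothetical stand-ins for gen_device_info and the NIR intrinsics.
struct DeviceInfo {
   int gen;
   bool is_haswell;
};

enum class Intrinsic {
   MemoryBarrier,
   MemoryBarrierShared,
   GroupMemoryBarrier,
   BeginInvocationInterlock,
   EndInvocationInterlock,
};

struct FenceConfig {
   bool l3_fence;       // fence global memory via the data cache (bti 0)
   bool slm_fence;      // fence shared local memory (GEN7_BTI_SLM)
   bool render_fence;   // extra IVB render-cache fence
   bool stall;          // emit a scheduling fence on the fence result(s)
   bool commit_enable;  // request a commit writeback from the dataport
};

// Mirrors the decision logic in the brw_fs_nir.cpp hunks above.
FenceConfig pick_fences(const DeviceInfo &dev, Intrinsic intr) {
   FenceConfig c = {};

   if (intr == Intrinsic::BeginInvocationInterlock ||
       intr == Intrinsic::EndInvocationInterlock) {
      // Interlocks are lowered to an L3 fence; begin uses SENDC via
      // SHADER_OPCODE_INTERLOCK, and end must additionally stall (below).
      c.l3_fence = true;
   } else {
      c.l3_fence = intr != Intrinsic::MemoryBarrierShared;
      c.slm_fence = intr == Intrinsic::GroupMemoryBarrier ||
                    intr == Intrinsic::MemoryBarrier ||
                    intr == Intrinsic::MemoryBarrierShared;
   }

   // IVB does typed surface access through the render cache, so that
   // cache must be flushed as well.
   c.render_fence = dev.gen == 7 && !dev.is_haswell;

   // Gen11+ is conservative and always stalls; the IVB double fence
   // stalls to order the two flushes; endInvocationInterlock stalls so
   // the thread cannot end before its writes land.
   c.stall = dev.gen >= 11 || c.render_fence ||
             intr == Intrinsic::EndInvocationInterlock;

   // A commit writeback is needed whenever we stall on the fence, and
   // unconditionally on Gen10+ (HSD ES # 1404612949).
   c.commit_enable = c.stall || dev.gen >= 10;
   return c;
}

int main() {
   const DeviceInfo ivb = {/* gen */ 7, /* is_haswell */ false};
   const FenceConfig c = pick_fences(ivb, Intrinsic::MemoryBarrier);
   assert(c.l3_fence && c.slm_fence && c.render_fence && c.stall);
   assert(c.commit_enable);
   printf("IVB memory barrier: data + render cache fences, stalled\n");
   return 0;
}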
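
For the l3_fence path itself, the following toy emitter sketches the instruction sequence the brw_fs_nir.cpp hunk produces on IVB under the new scheme: two fences writing separate registers so the hardware can pipeline them, followed by a scheduling fence that reads both destinations so later codegen can materialize the stall. Inst and ToyEmitter are hypothetical stand-ins for Mesa's fs_inst and fs_builder, not the real API.

#include <cstdio>
#include <string>
#include <vector>

struct Inst {
   std::string opcode;
   std::string sfid;  // targeted shared function (dataport cache)
   int dst;           // toy register written back by the fence, -1 if none
};

struct ToyEmitter {
   std::vector<Inst> program;
   int next_reg = 16;

   // Each fence writes a fresh register so the two IVB fences can be
   // pipelined by the hardware instead of serializing on a shared dst.
   int emit_fence(const char *opcode, const char *sfid) {
      const int dst = next_reg++;
      program.push_back({opcode, sfid, dst});
      return dst;
   }

   // The scheduling fence sources the fence destinations, so the stall
   // that codegen later materializes waits on both commit writebacks.
   void emit_scheduling_fence(const std::vector<int> &deps) {
      std::string name = "SCHEDULING_FENCE on";
      for (int d : deps)
         name += " r" + std::to_string(d);
      program.push_back({name, "-", -1});
   }
};

int main() {
   ToyEmitter e;
   const bool needs_render_fence = true;  // IVB: gen == 7 && !is_haswell

   const int data = e.emit_fence("MEMORY_FENCE", "DATAPORT_DATA_CACHE");
   if (needs_render_fence) {
      const int render = e.emit_fence("MEMORY_FENCE", "DATAPORT_RENDER_CACHE");
      e.emit_scheduling_fence({data, render});
   }

   for (const Inst &i : e.program) {
      if (i.dst >= 0)
         printf("%-28s sfid=%s dst=r%d\n", i.opcode.c_str(), i.sfid.c_str(), i.dst);
      else
         printf("%s\n", i.opcode.c_str());
   }
   return 0;
}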