Diffstat (limited to 'src/intel/compiler')
-rw-r--r--  src/intel/compiler/brw_eu.h                  5
-rw-r--r--  src/intel/compiler/brw_eu_defines.h          4
-rw-r--r--  src/intel/compiler/brw_eu_emit.c            53
-rw-r--r--  src/intel/compiler/brw_fs_generator.cpp     20
-rw-r--r--  src/intel/compiler/brw_fs_nir.cpp          126
-rw-r--r--  src/intel/compiler/brw_vec4_generator.cpp   12
-rw-r--r--  src/intel/compiler/brw_vec4_nir.cpp          7
7 files changed, 118 insertions(+), 109 deletions(-)
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index 96c22ab429a..7ae17dbdd37 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1148,12 +1148,13 @@ brw_untyped_surface_write(struct brw_codegen *p,
unsigned num_channels,
bool header_present);
-unsigned
+void
brw_memory_fence(struct brw_codegen *p,
struct brw_reg dst,
struct brw_reg src,
enum opcode send_op,
- bool stall,
+ enum brw_message_target sfid,
+ bool commit_enable,
unsigned bti);
void
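
The new prototype pushes all policy out to the caller: the SFID selects which data port the fence targets, and the explicit commit_enable replaces the old stall flag. As a rough sketch of a call site under the new signature (register setup elided; the values shown are the ones the later hunks actually pass for a global fence):

   brw_memory_fence(p, dst, src, BRW_OPCODE_SEND,
                    GEN7_SFID_DATAPORT_DATA_CACHE,
                    /* commit_enable */ true,
                    /* bti */ 0);
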
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 146d3b3f067..742e12c2830 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -454,8 +454,8 @@ enum opcode {
* Memory fence messages.
*
* Source 0: Must be register g0, used as header.
- * Source 1: Immediate bool to indicate whether or not we need to stall
- * until memory transactions prior to the fence are completed.
+ * Source 1: Immediate bool to indicate whether control is returned to the
+ * thread only after the fence has been honored.
* Source 2: Immediate byte indicating which memory to fence. Zero means
* global memory; GEN7_BTI_SLM means SLM (for Gen11+ only).
*
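
As a sketch of how the three documented sources line up when the fence is built at the IR level (this mirrors the fs_nir emission later in this patch; commit_enable is assumed to be in scope):

   fs_inst *fence =
      ubld.emit(SHADER_OPCODE_MEMORY_FENCE,
                ubld.vgrf(BRW_REGISTER_TYPE_UD), /* dst: dep. tracking only */
                brw_vec8_grf(0, 0),              /* Source 0: g0 header */
                brw_imm_ud(commit_enable),       /* Source 1: commit enable */
                brw_imm_ud(0));                  /* Source 2: 0 = global */
   fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
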
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 8786903dada..12042131999 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -3145,68 +3145,29 @@ brw_set_memory_fence_message(struct brw_codegen *p,
brw_inst_set_binding_table_index(devinfo, insn, bti);
}
-unsigned
+void
brw_memory_fence(struct brw_codegen *p,
struct brw_reg dst,
struct brw_reg src,
enum opcode send_op,
- bool stall,
+ enum brw_message_target sfid,
+ bool commit_enable,
unsigned bti)
{
const struct gen_device_info *devinfo = p->devinfo;
- const bool commit_enable = stall ||
- devinfo->gen >= 10 || /* HSD ES # 1404612949 */
- (devinfo->gen == 7 && !devinfo->is_haswell);
- struct brw_inst *insn;
- unsigned fences = 0;
-
- brw_push_insn_state(p);
- brw_set_default_mask_control(p, BRW_MASK_DISABLE);
- brw_set_default_exec_size(p, BRW_EXECUTE_1);
dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
src = retype(vec1(src), BRW_REGISTER_TYPE_UD);
/* Set dst as destination for dependency tracking; the MEMORY_FENCE
* message doesn't write anything back.
*/
- insn = next_insn(p, send_op);
+ struct brw_inst *insn = next_insn(p, send_op);
+ brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
+ brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
brw_set_dest(p, insn, dst);
brw_set_src0(p, insn, src);
- brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
- commit_enable, bti);
- fences++;
-
- if (devinfo->gen == 7 && !devinfo->is_haswell) {
- /* IVB does typed surface access through the render cache, so we need to
- * flush it too. Use a different register so both flushes can be
- * pipelined by the hardware.
- */
- insn = next_insn(p, send_op);
- brw_set_dest(p, insn, offset(dst, 1));
- brw_set_src0(p, insn, src);
- brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
- commit_enable, bti);
- fences++;
-
- /* Now write the response of the second message into the response of the
- * first to trigger a pipeline stall -- This way future render and data
- * cache messages will be properly ordered with respect to past data and
- * render cache messages.
- */
- brw_MOV(p, dst, offset(dst, 1));
- }
-
- if (stall) {
- brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_DST,
- brw_get_default_swsb(p).sbid));
-
- brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst);
- }
-
- brw_pop_insn_state(p);
-
- return fences;
+ brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
}
void
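
The helper now emits exactly one SEND and no longer hides the IVB double flush or the trailing stall. A hypothetical caller that still wanted both caches flushed back to back would simply issue two fences with distinct SFIDs (the real tree does this at the IR level in brw_fs_nir.cpp instead, so the scheduler can order them):

   /* Sketch only; dst/src setup as in the old function body. */
   brw_memory_fence(p, dst, src, BRW_OPCODE_SEND,
                    GEN7_SFID_DATAPORT_DATA_CACHE, commit_enable, /* bti */ 0);
   brw_memory_fence(p, offset(dst, 1), src, BRW_OPCODE_SEND,
                    GEN6_SFID_DATAPORT_RENDER_CACHE, commit_enable, /* bti */ 0);
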
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index b055110ddf8..5825e0770d4 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -2217,13 +2217,19 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
generate_shader_time_add(inst, src[0], src[1], src[2]);
break;
+ case SHADER_OPCODE_INTERLOCK:
case SHADER_OPCODE_MEMORY_FENCE: {
assert(src[1].file == BRW_IMMEDIATE_VALUE);
assert(src[2].file == BRW_IMMEDIATE_VALUE);
- const unsigned sends =
- brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud,
- src[2].ud);
- send_count += sends;
+
+ const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ?
+ BRW_OPCODE_SENDC : BRW_OPCODE_SEND;
+
+ brw_memory_fence(p, dst, src[0], send_op,
+ brw_message_target(inst->sfid),
+ /* commit_enable */ src[1].ud,
+ /* bti */ src[2].ud);
+ send_count++;
break;
}
@@ -2257,12 +2263,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
break;
- case SHADER_OPCODE_INTERLOCK:
- assert(devinfo->gen >= 9);
- /* The interlock is basically a memory fence issued via sendc */
- brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false, /* bti */ 0);
- break;
-
case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
const struct brw_reg mask =
brw_stage_has_packed_dispatch(devinfo, stage,
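
At codegen time the only difference between the two opcodes is the send instruction used: per the comment removed above, an interlock is basically a memory fence issued via sendc. A self-contained model of the selection (enum values are stand-ins for the real brw definitions):

   #include <stdio.h>

   enum ir_opcode { SHADER_OPCODE_MEMORY_FENCE, SHADER_OPCODE_INTERLOCK };
   enum hw_opcode { BRW_OPCODE_SEND, BRW_OPCODE_SENDC };

   /* An interlock is a memory fence issued via sendc. */
   static enum hw_opcode pick_send_op(enum ir_opcode op)
   {
      return op == SHADER_OPCODE_INTERLOCK ? BRW_OPCODE_SENDC
                                           : BRW_OPCODE_SEND;
   }

   int main(void)
   {
      printf("%s\n",
             pick_send_op(SHADER_OPCODE_INTERLOCK) == BRW_OPCODE_SENDC
                ? "sendc" : "send");
      return 0;
   }
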
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index f7d94e618b4..6f415d5de82 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4232,19 +4232,52 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
case nir_intrinsic_memory_barrier_shared:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
- case nir_intrinsic_memory_barrier: {
+ case nir_intrinsic_memory_barrier:
+ case nir_intrinsic_begin_invocation_interlock:
+ case nir_intrinsic_end_invocation_interlock: {
bool l3_fence, slm_fence;
- if (instr->intrinsic == nir_intrinsic_scoped_memory_barrier) {
+ const enum opcode opcode =
+ instr->intrinsic == nir_intrinsic_begin_invocation_interlock ?
+ SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE;
+
+ switch (instr->intrinsic) {
+ case nir_intrinsic_scoped_memory_barrier: {
nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
l3_fence = modes & (nir_var_shader_out |
nir_var_mem_ssbo |
nir_var_mem_global);
slm_fence = modes & nir_var_mem_shared;
- } else {
+ break;
+ }
+
+ case nir_intrinsic_begin_invocation_interlock:
+ case nir_intrinsic_end_invocation_interlock:
+ /* For beginInvocationInterlockARB(), we will generate a memory fence
+ * but with a different opcode so that the generator can pick SENDC
+ * instead of SEND.
+ *
+ * For endInvocationInterlockARB(), we need to insert a memory fence which
+ * stalls in the shader until the memory transactions prior to that
+ * fence are complete. This ensures that the shader does not end before
+ * any writes from its critical section have landed. Otherwise, you can
+ * end up with a case where the next invocation on that pixel properly
+ * stalls for the previous FS invocation on its pixel to complete but
+ * doesn't actually wait for the dataport memory transactions from that
+ * thread to land before submitting its own.
+ *
+ * Handling them here allows the IVB render-cache logic (see below) to
+ * be reused.
+ */
+ l3_fence = true;
+ slm_fence = false;
+ break;
+
+ default:
l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
instr->intrinsic == nir_intrinsic_memory_barrier ||
instr->intrinsic == nir_intrinsic_memory_barrier_shared;
+ break;
}
if (stage != MESA_SHADER_COMPUTE)
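
Both interlock intrinsics now funnel into the fence path: begin becomes SHADER_OPCODE_INTERLOCK with an L3 fence, and end becomes a plain memory fence that is forced to stall further down. A condensed, self-contained model of the classification (names are stand-ins for the NIR intrinsic enums, and the barrier cases are simplified):

   #include <stdbool.h>

   enum intr {
      MEMORY_BARRIER,             /* fences L3 and SLM */
      MEMORY_BARRIER_BUFFER,      /* L3 only */
      MEMORY_BARRIER_SHARED,      /* SLM only */
      BEGIN_INVOCATION_INTERLOCK, /* L3 fence emitted as SENDC */
      END_INVOCATION_INTERLOCK,   /* L3 fence, stalled */
   };

   struct fence_cfg { bool l3, slm, is_interlock; };

   static struct fence_cfg classify(enum intr i)
   {
      switch (i) {
      case BEGIN_INVOCATION_INTERLOCK:
         return (struct fence_cfg){ .l3 = true, .is_interlock = true };
      case END_INVOCATION_INTERLOCK:
         return (struct fence_cfg){ .l3 = true };
      case MEMORY_BARRIER_SHARED:
         return (struct fence_cfg){ .slm = true };
      case MEMORY_BARRIER_BUFFER:
         return (struct fence_cfg){ .l3 = true };
      default:
         return (struct fence_cfg){ .l3 = true, .slm = true };
      }
   }
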
@@ -4266,6 +4299,12 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
l3_fence = true;
}
+ /* IVB does typed surface access through the render cache, so we need
+ * to flush it too.
+ */
+ const bool needs_render_fence =
+ devinfo->gen == 7 && !devinfo->is_haswell;
+
/* Be conservative on Gen11+ and always stall in a fence, since there
* are two different fences and the shader might want to synchronize
* between them.
@@ -4276,23 +4315,57 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
* TODO: When emitting more than one fence, it might help to emit all
* the fences first and then generate the stall moves.
*/
- const bool stall = devinfo->gen >= 11;
+ const bool stall = devinfo->gen >= 11 || needs_render_fence ||
+ instr->intrinsic == nir_intrinsic_end_invocation_interlock;
+
+ const bool commit_enable = stall ||
+ devinfo->gen >= 10; /* HSD ES # 1404612949 */
const fs_builder ubld = bld.group(8, 0);
- const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
if (l3_fence) {
- ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
- brw_vec8_grf(0, 0), brw_imm_ud(stall),
- /* bti */ brw_imm_ud(0))
- ->size_written = 2 * REG_SIZE;
+ fs_inst *fence =
+ ubld.emit(opcode,
+ ubld.vgrf(BRW_REGISTER_TYPE_UD),
+ brw_vec8_grf(0, 0),
+ brw_imm_ud(commit_enable),
+ brw_imm_ud(/* bti */ 0));
+ fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+ if (needs_render_fence) {
+ fs_inst *render_fence =
+ ubld.emit(opcode,
+ ubld.vgrf(BRW_REGISTER_TYPE_UD),
+ brw_vec8_grf(0, 0),
+ brw_imm_ud(commit_enable),
+ brw_imm_ud(/* bti */ 0));
+ render_fence->sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+
+ ubld.exec_all().group(1, 0).emit(
+ FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+ fence->dst, render_fence->dst);
+ } else if (stall) {
+ ubld.exec_all().group(1, 0).emit(
+ FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+ fence->dst);
+ }
}
if (slm_fence) {
- ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
- brw_vec8_grf(0, 0), brw_imm_ud(stall),
- brw_imm_ud(GEN7_BTI_SLM))
- ->size_written = 2 * REG_SIZE;
+ assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
+ fs_inst *fence =
+ ubld.emit(opcode,
+ ubld.vgrf(BRW_REGISTER_TYPE_UD),
+ brw_vec8_grf(0, 0),
+ brw_imm_ud(commit_enable),
+ brw_imm_ud(GEN7_BTI_SLM));
+ fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+ if (stall) {
+ ubld.exec_all().group(1, 0).emit(
+ FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+ fence->dst);
+ }
}
if (!l3_fence && !slm_fence)
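
The stall is now expressed in the IR rather than hidden in the generator: each fence writes a dummy VGRF, and a SIMD1, exec_all FS_OPCODE_SCHEDULING_FENCE that reads those VGRFs gives the scheduler the ordering the old explicit stall MOV provided (on IVB it also orders the render-cache flush against the data-cache one). A self-contained model of the flag computation above (struct and field names are stand-ins for gen_device_info):

   #include <stdbool.h>

   struct dev { int gen; bool is_haswell; };

   static void fence_flags(const struct dev *d, bool end_interlock,
                           bool *needs_render_fence, bool *stall,
                           bool *commit_enable)
   {
      /* IVB does typed surface access through the render cache. */
      *needs_render_fence = d->gen == 7 && !d->is_haswell;

      /* Gen11+ always stalls; endInvocationInterlockARB() must stall so
       * the thread doesn't retire before its dataport writes land. */
      *stall = d->gen >= 11 || *needs_render_fence || end_interlock;

      /* HSD ES # 1404612949: Gen10+ needs commit enable regardless. */
      *commit_enable = *stall || d->gen >= 10;
   }
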
@@ -5210,33 +5283,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
break;
}
- case nir_intrinsic_begin_invocation_interlock: {
- const fs_builder ubld = bld.group(8, 0);
- const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-
- ubld.emit(SHADER_OPCODE_INTERLOCK, tmp, brw_vec8_grf(0, 0))
- ->size_written = 2 * REG_SIZE;
- break;
- }
-
- case nir_intrinsic_end_invocation_interlock: {
- /* For endInvocationInterlock(), we need to insert a memory fence which
- * stalls in the shader until the memory transactions prior to that
- * fence are complete. This ensures that the shader does not end before
- * any writes from its critical section have landed. Otherwise, you can
- * end up with a case where the next invocation on that pixel properly
- * stalls for previous FS invocation on its pixel to complete but
- * doesn't actually wait for the dataport memory transactions from that
- * thread to land before submitting its own.
- */
- const fs_builder ubld = bld.group(8, 0);
- const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
- ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
- brw_vec8_grf(0, 0), brw_imm_ud(1), brw_imm_ud(0))
- ->size_written = 2 * REG_SIZE;
- break;
- }
-
default:
unreachable("unknown intrinsic");
}
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
index 7c038c97196..be6697cc16c 100644
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -1911,13 +1911,13 @@ generate_code(struct brw_codegen *p,
send_count++;
break;
- case SHADER_OPCODE_MEMORY_FENCE: {
- const unsigned sends =
- brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false,
- /* bti */ 0);
- send_count += sends;
+ case SHADER_OPCODE_MEMORY_FENCE:
+ brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND,
+ brw_message_target(inst->sfid),
+ /* commit_enable */ false,
+ /* bti */ 0);
+ send_count++;
break;
- }
case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
const struct brw_reg mask =
diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp
index 1baf9e67177..76446adcf54 100644
--- a/src/intel/compiler/brw_vec4_nir.cpp
+++ b/src/intel/compiler/brw_vec4_nir.cpp
@@ -704,9 +704,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
case nir_intrinsic_scoped_memory_barrier: {
const vec4_builder bld =
vec4_builder(this).at_end().annotate(current_annotation, base_ir);
- const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
- bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, brw_vec8_grf(0, 0))
- ->size_written = 2 * REG_SIZE;
+ const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ vec4_instruction *fence =
+ bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, brw_vec8_grf(0, 0));
+ fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
break;
}
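
The vec4 path mirrors the scalar one: the SFID is recorded on the IR instruction at emit time, and the generator forwards it to brw_memory_fence with commit_enable hardwired to false (matching the old stall=false behavior). A sketch of the two ends of that handoff, shortened from the hunks above:

   /* IR side (vec4_visitor::nir_emit_intrinsic): */
   vec4_instruction *fence =
      bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, brw_vec8_grf(0, 0));
   fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;

   /* Generator side (generate_code): */
   brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND,
                    brw_message_target(inst->sfid),
                    /* commit_enable */ false, /* bti */ 0);
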