summary refs log tree commit diff stats
path: root/src/intel
diff options
context:
space:
mode:
Diffstat (limited to 'src/intel')
-rw-r--r-- src/intel/compiler/brw_eu.h 3
-rw-r--r-- src/intel/compiler/brw_eu_defines.h 11
-rw-r--r-- src/intel/compiler/brw_eu_emit.c 13
-rw-r--r-- src/intel/compiler/brw_fs_generator.cpp 5
-rw-r--r-- src/intel/compiler/brw_fs_nir.cpp 44
-rw-r--r-- src/intel/compiler/brw_vec4_generator.cpp 2
6 files changed, 66 insertions, 12 deletions
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index dd504cc25fb..c4dba558ecd 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1117,7 +1117,8 @@ brw_memory_fence(struct brw_codegen *p,
struct brw_reg dst,
struct brw_reg src,
enum opcode send_op,
- bool stall);
+ bool stall,
+ unsigned bti);
void
brw_pixel_interpolator_query(struct brw_codegen *p,
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index e8ca7ff8b98..1d4c0b83c87 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -452,6 +452,17 @@ enum opcode {
SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
+ /**
+ * Memory fence messages.
+ *
+ * Source 0: Must be register g0, used as header.
+ * Source 1: Immediate bool to indicate whether or not we need to stall
+ * until memory transactions prior to the fence are completed.
+ * Source 2: Immediate byte indicating which memory to fence. Zero means
+ * global memory; GEN7_BTI_SLM means SLM (for Gen11+ only).
+ *
+ * Vec4 backend only uses Source 0.
+ */
SHADER_OPCODE_MEMORY_FENCE,
SHADER_OPCODE_GEN4_SCRATCH_READ,
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 8e7263ce447..60761e83c62 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -3012,7 +3012,8 @@ static void
brw_set_memory_fence_message(struct brw_codegen *p,
struct brw_inst *insn,
enum brw_message_target sfid,
- bool commit_enable)
+ bool commit_enable,
+ unsigned bti)
{
const struct gen_device_info *devinfo = p->devinfo;
@@ -3034,6 +3035,9 @@ brw_set_memory_fence_message(struct brw_codegen *p,
if (commit_enable)
brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
+
+ assert(devinfo->gen >= 11 || bti == 0);
+ brw_inst_set_binding_table_index(devinfo, insn, bti);
}
void
@@ -3041,7 +3045,8 @@ brw_memory_fence(struct brw_codegen *p,
struct brw_reg dst,
struct brw_reg src,
enum opcode send_op,
- bool stall)
+ bool stall,
+ unsigned bti)
{
const struct gen_device_info *devinfo = p->devinfo;
const bool commit_enable = stall ||
@@ -3062,7 +3067,7 @@ brw_memory_fence(struct brw_codegen *p,
brw_set_dest(p, insn, dst);
brw_set_src0(p, insn, src);
brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
- commit_enable);
+ commit_enable, bti);
if (devinfo->gen == 7 && !devinfo->is_haswell) {
/* IVB does typed surface access through the render cache, so we need to
@@ -3073,7 +3078,7 @@ brw_memory_fence(struct brw_codegen *p,
brw_set_dest(p, insn, offset(dst, 1));
brw_set_src0(p, insn, src);
brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
- commit_enable);
+ commit_enable, bti);
/* Now write the response of the second message into the response of the
* first to trigger a pipeline stall -- This way future render and data
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index be6a00e8476..88de5189064 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -2069,13 +2069,14 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
case SHADER_OPCODE_MEMORY_FENCE:
assert(src[1].file == BRW_IMMEDIATE_VALUE);
- brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud);
+ assert(src[2].file == BRW_IMMEDIATE_VALUE);
+ brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud, src[2].ud);
break;
case SHADER_OPCODE_INTERLOCK:
assert(devinfo->gen >= 9);
/* The interlock is basically a memory fence issued via sendc */
- brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false);
+ brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false, /* bti */ 0);
break;
case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 00ce6af23c7..aeebaaeb62c 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4416,11 +4416,47 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
case nir_intrinsic_memory_barrier: {
+ bool l3_fence, slm_fence;
+ if (devinfo->gen >= 11) {
+ l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
+ slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
+ instr->intrinsic == nir_intrinsic_memory_barrier ||
+ instr->intrinsic == nir_intrinsic_memory_barrier_shared;
+ } else {
+ /* Prior to gen11, we only have one kind of fence. */
+ l3_fence = true;
+ slm_fence = false;
+ }
+
+ /* Be conservative in Gen11+ and always stall in a fence, since there
+ * are two different fences and a shader might want to synchronize
+ * between them.
+ *
+ * TODO: Improve NIR so that scope and visibility information for the
+ * barriers is available here to make a better decision.
+ *
+ * TODO: When emitting more than one fence, it might help to emit all
+ * the fences first and then generate the stall moves.
+ */
+ const bool stall = devinfo->gen >= 11;
+
const fs_builder ubld = bld.group(8, 0);
const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
- ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
- brw_vec8_grf(0, 0), brw_imm_ud(0))
- ->size_written = 2 * REG_SIZE;
+
+ if (l3_fence) {
+ ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
+ brw_vec8_grf(0, 0), brw_imm_ud(stall),
+ /* bti */ brw_imm_ud(0))
+ ->size_written = 2 * REG_SIZE;
+ }
+
+ if (slm_fence) {
+ ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
+ brw_vec8_grf(0, 0), brw_imm_ud(stall),
+ brw_imm_ud(GEN7_BTI_SLM))
+ ->size_written = 2 * REG_SIZE;
+ }
+
break;
}
@@ -5238,7 +5274,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
const fs_builder ubld = bld.group(8, 0);
const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
- brw_vec8_grf(0, 0), brw_imm_ud(1))
+ brw_vec8_grf(0, 0), brw_imm_ud(1), brw_imm_ud(0))
->size_written = 2 * REG_SIZE;
break;
}
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
index 8f9e4f16677..d85e3c43241 100644
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -1886,7 +1886,7 @@ generate_code(struct brw_codegen *p,
break;
case SHADER_OPCODE_MEMORY_FENCE:
- brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false);
+ brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false, /* bti */ 0);
break;
case SHADER_OPCODE_FIND_LIVE_CHANNEL: {