Diffstat (limited to 'src/intel/compiler')
-rw-r--r--  src/intel/compiler/brw_eu.h                |   5
-rw-r--r--  src/intel/compiler/brw_eu_defines.h        |   4
-rw-r--r--  src/intel/compiler/brw_eu_emit.c           |  53
-rw-r--r--  src/intel/compiler/brw_fs_generator.cpp    |  20
-rw-r--r--  src/intel/compiler/brw_fs_nir.cpp          | 126
-rw-r--r--  src/intel/compiler/brw_vec4_generator.cpp  |  12
-rw-r--r--  src/intel/compiler/brw_vec4_nir.cpp        |   7
7 files changed, 118 insertions, 109 deletions
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index 96c22ab429a..7ae17dbdd37 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1148,12 +1148,13 @@ brw_untyped_surface_write(struct brw_codegen *p,
                           unsigned num_channels,
                           bool header_present);
 
-unsigned
+void
 brw_memory_fence(struct brw_codegen *p,
                  struct brw_reg dst,
                  struct brw_reg src,
                  enum opcode send_op,
-                 bool stall,
+                 enum brw_message_target sfid,
+                 bool commit_enable,
                  unsigned bti);
 
 void
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 146d3b3f067..742e12c2830 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -454,8 +454,8 @@ enum opcode {
    * Memory fence messages.
    *
    * Source 0: Must be register g0, used as header.
-   * Source 1: Immediate bool to indicate whether or not we need to stall
-   *           until memory transactions prior to the fence are completed.
+   * Source 1: Immediate bool to indicate whether control is returned to the
+   *           thread only after the fence has been honored.
    * Source 2: Immediate byte indicating which memory to fence.  Zero means
    *           global memory; GEN7_BTI_SLM means SLM (for Gen11+ only).
    *
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 8786903dada..12042131999 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -3145,68 +3145,29 @@ brw_set_memory_fence_message(struct brw_codegen *p,
    brw_inst_set_binding_table_index(devinfo, insn, bti);
 }
 
-unsigned
+void
 brw_memory_fence(struct brw_codegen *p,
                  struct brw_reg dst,
                  struct brw_reg src,
                  enum opcode send_op,
-                 bool stall,
+                 enum brw_message_target sfid,
+                 bool commit_enable,
                  unsigned bti)
 {
    const struct gen_device_info *devinfo = p->devinfo;
-   const bool commit_enable = stall ||
-      devinfo->gen >= 10 || /* HSD ES # 1404612949 */
-      (devinfo->gen == 7 && !devinfo->is_haswell);
-   struct brw_inst *insn;
-   unsigned fences = 0;
-
-   brw_push_insn_state(p);
-   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-   brw_set_default_exec_size(p, BRW_EXECUTE_1);
 
    dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
    src = retype(vec1(src), BRW_REGISTER_TYPE_UD);
 
    /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
-   insn = next_insn(p, send_op);
+   struct brw_inst *insn = next_insn(p, send_op);
+   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
+   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
    brw_set_dest(p, insn, dst);
    brw_set_src0(p, insn, src);
-   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
-                                commit_enable, bti);
-   fences++;
-
-   if (devinfo->gen == 7 && !devinfo->is_haswell) {
-      /* IVB does typed surface access through the render cache, so we need to
-       * flush it too.  Use a different register so both flushes can be
-       * pipelined by the hardware.
-       */
-      insn = next_insn(p, send_op);
-      brw_set_dest(p, insn, offset(dst, 1));
-      brw_set_src0(p, insn, src);
-      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
-                                   commit_enable, bti);
-      fences++;
-
-      /* Now write the response of the second message into the response of the
-       * first to trigger a pipeline stall -- This way future render and data
-       * cache messages will be properly ordered with respect to past data and
-       * render cache messages.
-       */
-      brw_MOV(p, dst, offset(dst, 1));
-   }
-
-   if (stall) {
-      brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_DST,
-                                            brw_get_default_swsb(p).sbid));
-
-      brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst);
-   }
-
-   brw_pop_insn_state(p);
-
-   return fences;
+   brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
 }
 
 void
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index b055110ddf8..5825e0770d4 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -2217,13 +2217,19 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
          generate_shader_time_add(inst, src[0], src[1], src[2]);
         break;
 
+      case SHADER_OPCODE_INTERLOCK:
       case SHADER_OPCODE_MEMORY_FENCE: {
          assert(src[1].file == BRW_IMMEDIATE_VALUE);
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         const unsigned sends =
-            brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud,
-                             src[2].ud);
-         send_count += sends;
+
+         const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ?
+            BRW_OPCODE_SENDC : BRW_OPCODE_SEND;
+
+         brw_memory_fence(p, dst, src[0], send_op,
+                          brw_message_target(inst->sfid),
+                          /* commit_enable */ src[1].ud,
+                          /* bti */ src[2].ud);
+         send_count++;
          break;
       }
 
@@ -2257,12 +2263,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
          break;
 
-      case SHADER_OPCODE_INTERLOCK:
-         assert(devinfo->gen >= 9);
-         /* The interlock is basically a memory fence issued via sendc */
-         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false, /* bti */ 0);
-         break;
-
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
          const struct brw_reg mask =
            brw_stage_has_packed_dispatch(devinfo, stage,
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index f7d94e618b4..6f415d5de82 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4232,19 +4232,52 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_memory_barrier_shared:
    case nir_intrinsic_memory_barrier_buffer:
    case nir_intrinsic_memory_barrier_image:
-   case nir_intrinsic_memory_barrier: {
+   case nir_intrinsic_memory_barrier:
+   case nir_intrinsic_begin_invocation_interlock:
+   case nir_intrinsic_end_invocation_interlock: {
       bool l3_fence, slm_fence;
-      if (instr->intrinsic == nir_intrinsic_scoped_memory_barrier) {
+      const enum opcode opcode =
+         instr->intrinsic == nir_intrinsic_begin_invocation_interlock ?
+         SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE;
+
+      switch (instr->intrinsic) {
+      case nir_intrinsic_scoped_memory_barrier: {
          nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
          l3_fence = modes & (nir_var_shader_out |
                              nir_var_mem_ssbo |
                              nir_var_mem_global);
          slm_fence = modes & nir_var_mem_shared;
-      } else {
+         break;
+      }
+
+      case nir_intrinsic_begin_invocation_interlock:
+      case nir_intrinsic_end_invocation_interlock:
+         /* For beginInvocationInterlockARB(), we will generate a memory fence
+          * but with a different opcode so that generator can pick SENDC
+          * instead of SEND.
+          *
+          * For endInvocationInterlockARB(), we need to insert a memory fence which
+          * stalls in the shader until the memory transactions prior to that
+          * fence are complete.  This ensures that the shader does not end before
+          * any writes from its critical section have landed.  Otherwise, you can
+          * end up with a case where the next invocation on that pixel properly
+          * stalls for previous FS invocation on its pixel to complete but
+          * doesn't actually wait for the dataport memory transactions from that
+          * thread to land before submitting its own.
+          *
+          * Handling them here will allow the logic for IVB render cache (see
+          * below) to be reused.
+          */
+         l3_fence = true;
+         slm_fence = false;
+         break;
+
+      default:
          l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
          slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
                      instr->intrinsic == nir_intrinsic_memory_barrier ||
                      instr->intrinsic == nir_intrinsic_memory_barrier_shared;
+         break;
       }
 
       if (stage != MESA_SHADER_COMPUTE)
@@ -4266,6 +4299,12 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          l3_fence = true;
       }
 
+      /* IVB does typed surface access through the render cache, so we need
+       * to flush it too.
+       */
+      const bool needs_render_fence =
+         devinfo->gen == 7 && !devinfo->is_haswell;
+
       /* Be conservative in Gen11+ and always stall in a fence.  Since there
        * are two different fences, and shader might want to synchronize
        * between them.
@@ -4276,23 +4315,57 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        * TODO: When emitting more than one fence, it might help emit all
        * the fences first and then generate the stall moves.
        */
-      const bool stall = devinfo->gen >= 11;
+      const bool stall = devinfo->gen >= 11 || needs_render_fence ||
+         instr->intrinsic == nir_intrinsic_end_invocation_interlock;
+
+      const bool commit_enable = stall ||
+         devinfo->gen >= 10; /* HSD ES # 1404612949 */
 
       const fs_builder ubld = bld.group(8, 0);
-      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
 
       if (l3_fence) {
-         ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
-                   brw_vec8_grf(0, 0), brw_imm_ud(stall),
-                   /* bti */ brw_imm_ud(0))
-            ->size_written = 2 * REG_SIZE;
+         fs_inst *fence =
+            ubld.emit(opcode,
+                      ubld.vgrf(BRW_REGISTER_TYPE_UD),
+                      brw_vec8_grf(0, 0),
+                      brw_imm_ud(commit_enable),
+                      brw_imm_ud(/* bti */ 0));
+         fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+         if (needs_render_fence) {
+            fs_inst *render_fence =
+               ubld.emit(opcode,
+                         ubld.vgrf(BRW_REGISTER_TYPE_UD),
+                         brw_vec8_grf(0, 0),
+                         brw_imm_ud(commit_enable),
+                         brw_imm_ud(/* bti */ 0));
+            render_fence->sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+
+            ubld.exec_all().group(1, 0).emit(
+               FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+               fence->dst, render_fence->dst);
+         } else if (stall) {
+            ubld.exec_all().group(1, 0).emit(
+               FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+               fence->dst);
+         }
       }
 
       if (slm_fence) {
-         ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
-                   brw_vec8_grf(0, 0), brw_imm_ud(stall),
-                   brw_imm_ud(GEN7_BTI_SLM))
-            ->size_written = 2 * REG_SIZE;
+         assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
+         fs_inst *fence =
+            ubld.emit(opcode,
+                      ubld.vgrf(BRW_REGISTER_TYPE_UD),
+                      brw_vec8_grf(0, 0),
+                      brw_imm_ud(commit_enable),
+                      brw_imm_ud(GEN7_BTI_SLM));
+         fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+         if (stall) {
+            ubld.exec_all().group(1, 0).emit(
+               FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+               fence->dst);
+         }
       }
 
       if (!l3_fence && !slm_fence)
@@ -5210,33 +5283,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_begin_invocation_interlock: {
-      const fs_builder ubld = bld.group(8, 0);
-      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-
-      ubld.emit(SHADER_OPCODE_INTERLOCK, tmp, brw_vec8_grf(0, 0))
-         ->size_written = 2 * REG_SIZE;
-      break;
-   }
-
-   case nir_intrinsic_end_invocation_interlock: {
-      /* For endInvocationInterlock(), we need to insert a memory fence which
-       * stalls in the shader until the memory transactions prior to that
-       * fence are complete.  This ensures that the shader does not end before
-       * any writes from its critical section have landed.  Otherwise, you can
-       * end up with a case where the next invocation on that pixel properly
-       * stalls for previous FS invocation on its pixel to complete but
-       * doesn't actually wait for the dataport memory transactions from that
-       * thread to land before submitting its own.
-       */
-      const fs_builder ubld = bld.group(8, 0);
-      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
-                brw_vec8_grf(0, 0), brw_imm_ud(1), brw_imm_ud(0))
-         ->size_written = 2 * REG_SIZE;
-      break;
-   }
-
    default:
       unreachable("unknown intrinsic");
    }
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
index 7c038c97196..be6697cc16c 100644
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -1911,13 +1911,13 @@ generate_code(struct brw_codegen *p,
          send_count++;
          break;
 
-      case SHADER_OPCODE_MEMORY_FENCE: {
-         const unsigned sends =
-            brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false,
-                             /* bti */ 0);
-         send_count += sends;
+      case SHADER_OPCODE_MEMORY_FENCE:
+         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND,
+                          brw_message_target(inst->sfid),
+                          /* commit_enable */ false,
+                          /* bti */ 0);
+         send_count++;
          break;
-      }
 
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
          const struct brw_reg mask =
diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp
index 1baf9e67177..76446adcf54 100644
--- a/src/intel/compiler/brw_vec4_nir.cpp
+++ b/src/intel/compiler/brw_vec4_nir.cpp
@@ -704,9 +704,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    case nir_intrinsic_scoped_memory_barrier: {
       const vec4_builder bld =
         vec4_builder(this).at_end().annotate(current_annotation, base_ir);
-      const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, brw_vec8_grf(0, 0))
-         ->size_written = 2 * REG_SIZE;
+      const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      vec4_instruction *fence =
+         bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, brw_vec8_grf(0, 0));
+      fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
       break;
    }
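
Editor's note: the fence-selection policy above spans several hunks, so here is a standalone C++ sketch that distills it: which caches get fenced per intrinsic, when the extra IVB render-cache fence is added, and how stall and commit_enable are derived. This is illustrative only, not Mesa code; DeviceInfo, Intrinsic, FenceConfig, and pick_fences are hypothetical names, and the compute-stage and scoped-barrier adjustments from the surrounding code are omitted for brevity.

#include <cassert>
#include <cstdio>

// Hypothetical stand-ins for gen_device_info and the NIR intrinsics.
struct DeviceInfo {
   int gen;
   bool is_haswell;
};

enum class Intrinsic {
   MemoryBarrier,
   MemoryBarrierShared,
   GroupMemoryBarrier,
   BeginInvocationInterlock,
   EndInvocationInterlock,
};

struct FenceConfig {
   bool l3_fence;       // fence global memory via the data cache (bti 0)
   bool slm_fence;      // fence shared local memory (GEN7_BTI_SLM)
   bool render_fence;   // extra IVB render-cache fence
   bool stall;          // emit a scheduling fence on the fence result(s)
   bool commit_enable;  // request a commit writeback from the dataport
};

// Mirrors the decision logic in the brw_fs_nir.cpp hunks above.
FenceConfig pick_fences(const DeviceInfo &dev, Intrinsic intr) {
   FenceConfig c = {};

   if (intr == Intrinsic::BeginInvocationInterlock ||
       intr == Intrinsic::EndInvocationInterlock) {
      // Interlocks are lowered to an L3 fence; begin uses SENDC via
      // SHADER_OPCODE_INTERLOCK, and end must additionally stall (below).
      c.l3_fence = true;
   } else {
      c.l3_fence = intr != Intrinsic::MemoryBarrierShared;
      c.slm_fence = intr == Intrinsic::GroupMemoryBarrier ||
                    intr == Intrinsic::MemoryBarrier ||
                    intr == Intrinsic::MemoryBarrierShared;
   }

   // IVB does typed surface access through the render cache, so that
   // cache must be flushed as well.
   c.render_fence = dev.gen == 7 && !dev.is_haswell;

   // Gen11+ is conservative and always stalls; the IVB double fence
   // stalls to order the two flushes; endInvocationInterlock stalls so
   // the thread cannot end before its writes land.
   c.stall = dev.gen >= 11 || c.render_fence ||
             intr == Intrinsic::EndInvocationInterlock;

   // A commit writeback is needed whenever we stall on the fence, and
   // unconditionally on Gen10+ (HSD ES # 1404612949).
   c.commit_enable = c.stall || dev.gen >= 10;
   return c;
}

int main() {
   const DeviceInfo ivb = {/* gen */ 7, /* is_haswell */ false};
   const FenceConfig c = pick_fences(ivb, Intrinsic::MemoryBarrier);
   assert(c.l3_fence && c.slm_fence && c.render_fence && c.stall);
   assert(c.commit_enable);
   printf("IVB memory barrier: data + render cache fences, stalled\n");
   return 0;
}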
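
For the l3_fence path itself, the following toy emitter sketches the instruction sequence the brw_fs_nir.cpp hunk produces on IVB under the new scheme: two fences writing separate registers so the hardware can pipeline them, followed by a scheduling fence that reads both destinations so later codegen can materialize the stall. Inst and ToyEmitter are hypothetical stand-ins for Mesa's fs_inst and fs_builder, not the real API.

#include <cstdio>
#include <string>
#include <vector>

struct Inst {
   std::string opcode;
   std::string sfid;  // targeted shared function (dataport cache)
   int dst;           // toy register written back by the fence, -1 if none
};

struct ToyEmitter {
   std::vector<Inst> program;
   int next_reg = 16;

   // Each fence writes a fresh register so the two IVB fences can be
   // pipelined by the hardware instead of serializing on a shared dst.
   int emit_fence(const char *opcode, const char *sfid) {
      const int dst = next_reg++;
      program.push_back({opcode, sfid, dst});
      return dst;
   }

   // The scheduling fence sources the fence destinations, so the stall
   // that codegen later materializes waits on both commit writebacks.
   void emit_scheduling_fence(const std::vector<int> &deps) {
      std::string name = "SCHEDULING_FENCE on";
      for (int d : deps)
         name += " r" + std::to_string(d);
      program.push_back({name, "-", -1});
   }
};

int main() {
   ToyEmitter e;
   const bool needs_render_fence = true;  // IVB: gen == 7 && !is_haswell

   const int data = e.emit_fence("MEMORY_FENCE", "DATAPORT_DATA_CACHE");
   if (needs_render_fence) {
      const int render = e.emit_fence("MEMORY_FENCE", "DATAPORT_RENDER_CACHE");
      e.emit_scheduling_fence({data, render});
   }

   for (const Inst &i : e.program) {
      if (i.dst >= 0)
         printf("%-28s sfid=%s dst=r%d\n", i.opcode.c_str(), i.sfid.c_str(), i.dst);
      else
         printf("%s\n", i.opcode.c_str());
   }
   return 0;
}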