Diffstat (limited to 'src/mesa/drivers/dri')
59 files changed, 2199 insertions, 842 deletions
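One of the more intricate hunks below (brw_fs_nir.cpp, the nir_intrinsic_store_ssbo case) replaces the old per-channel SSBO writes with one untyped write message per run of consecutive enabled writemask channels, where each run is found with ffs() on the mask and on its inverted, down-shifted form. The following is a minimal standalone C sketch of just that grouping loop — it is not driver code, the surface/offset plumbing is omitted, and the example writemask value is made up for illustration; it only prints the (component, length) runs the loop would emit writes for.

```c
/* Standalone illustration of the writemask-grouping loop used by the
 * SSBO store lowering in the brw_fs_nir.cpp hunk below.  Not driver code.
 */
#include <stdio.h>
#include <strings.h>   /* ffs(), POSIX */

int main(void)
{
   unsigned writemask = 0xb;   /* hypothetical example: channels x, y, w (binary 1011) */

   while (writemask) {
      /* First enabled channel (ffs is 1-based, hence the -1). */
      unsigned first_component = ffs(writemask) - 1;
      /* Length of the contiguous run of enabled channels starting there:
       * shift the run down to bit 0, invert, and find the first zero bit. */
      unsigned length = ffs(~(writemask >> first_component)) - 1;

      printf("one write of %u component(s) starting at component %u\n",
             length, first_component);

      /* Clear the bits just handled; 15 (0xf) keeps only the four channel
       * bits, mirroring `writemask &= (15 << (first_component + length))`
       * in the patch. */
      writemask &= (15 << (first_component + length));
   }
   return 0;
}
```

For the 0xb example this prints a 2-component write at component 0 and a 1-component write at component 3, i.e. two messages instead of the three per-channel writes the previous code would have issued; collapsing full runs (such as a plain xyzw store) into a single send is the point of the change.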
diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am index 04b3f9cc8ce..9d003e48bd8 100644 --- a/src/mesa/drivers/dri/i965/Makefile.am +++ b/src/mesa/drivers/dri/i965/Makefile.am @@ -59,6 +59,7 @@ TESTS = \ test_fs_saturate_propagation \ test_eu_compact \ test_vf_float_conversions \ + test_vec4_cmod_propagation \ test_vec4_copy_propagation \ test_vec4_register_coalesce @@ -94,6 +95,12 @@ test_vec4_copy_propagation_LDADD = \ $(top_builddir)/src/gtest/libgtest.la \ $(TEST_LIBS) +test_vec4_cmod_propagation_SOURCES = \ + test_vec4_cmod_propagation.cpp +test_vec4_cmod_propagation_LDADD = \ + $(top_builddir)/src/gtest/libgtest.la \ + $(TEST_LIBS) + test_eu_compact_SOURCES = \ test_eu_compact.c nodist_EXTRA_test_eu_compact_SOURCES = dummy.cpp diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index ccd540dabca..ed2654ef329 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -58,6 +58,7 @@ i965_compiler_FILES = \ brw_util.c \ brw_util.h \ brw_vec4_builder.h \ + brw_vec4_cmod_propagation.cpp \ brw_vec4_copy_propagation.cpp \ brw_vec4.cpp \ brw_vec4_cse.cpp \ diff --git a/src/mesa/drivers/dri/i965/brw_cfg.cpp b/src/mesa/drivers/dri/i965/brw_cfg.cpp index 10bcd4bafd4..5d46615bc7b 100644 --- a/src/mesa/drivers/dri/i965/brw_cfg.cpp +++ b/src/mesa/drivers/dri/i965/brw_cfg.cpp @@ -528,7 +528,9 @@ cfg_t::dump_domtree() { printf("digraph DominanceTree {\n"); foreach_block(block, this) { - printf("\t%d -> %d\n", block->idom->num, block->num); + if (block->idom) { + printf("\t%d -> %d\n", block->idom->num, block->num); + } } printf("}\n"); } diff --git a/src/mesa/drivers/dri/i965/brw_cfg.h b/src/mesa/drivers/dri/i965/brw_cfg.h index a06b0aa1cd0..69e39e8964d 100644 --- a/src/mesa/drivers/dri/i965/brw_cfg.h +++ b/src/mesa/drivers/dri/i965/brw_cfg.h @@ -90,6 +90,8 @@ struct bblock_t { struct exec_list parents; struct exec_list children; int num; + + unsigned cycle_count; }; static inline struct backend_instruction * @@ -285,6 +287,8 @@ struct cfg_t { int num_blocks; bool idom_dirty; + + unsigned cycle_count; }; /* Note that this is implemented with a double for loop -- break will diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index d9967143d8a..e5133ef5a3d 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -338,6 +338,7 @@ struct brw_wm_prog_data { } binding_table; uint8_t computed_depth_mode; + bool computed_stencil; bool early_fragment_tests; bool no_8; @@ -443,9 +444,7 @@ struct brw_vue_map { * directly correspond to a gl_varying_slot, the value comes from * brw_varying_slot. * - * For slots that are not in use, the value is BRW_VARYING_SLOT_COUNT (this - * simplifies code that uses the value stored in slot_to_varying to - * create a bit mask). + * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD. */ signed char slot_to_varying[BRW_VARYING_SLOT_COUNT]; @@ -467,8 +466,8 @@ static inline GLuint brw_vue_slot_to_offset(GLuint slot) * Convert a vertex output (brw_varying_slot) into a byte offset within the * VUE. 
*/ -static inline GLuint brw_varying_to_offset(struct brw_vue_map *vue_map, - GLuint varying) +static inline +GLuint brw_varying_to_offset(const struct brw_vue_map *vue_map, GLuint varying) { return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]); } diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 4f503ae4869..c83f47bdff7 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -501,8 +501,6 @@ struct brw_cache_item { }; -typedef void (*cache_aux_free_func)(const void *aux); - struct brw_cache { struct brw_context *brw; @@ -512,9 +510,6 @@ struct brw_cache { uint32_t next_offset; bool bo_used_by_gpu; - - /** Optional functions for freeing other pointers attached to a prog_data. */ - cache_aux_free_func aux_free[BRW_MAX_CACHE]; }; @@ -1177,7 +1172,7 @@ struct brw_context int num_atoms[BRW_NUM_PIPELINES]; const struct brw_tracked_state render_atoms[60]; - const struct brw_tracked_state compute_atoms[8]; + const struct brw_tracked_state compute_atoms[9]; /* If (INTEL_DEBUG & DEBUG_BATCH) */ struct { @@ -1463,7 +1458,7 @@ void brw_upload_ubo_surfaces(struct brw_context *brw, struct brw_stage_prog_data *prog_data, bool dword_pitch); void brw_upload_abo_surfaces(struct brw_context *brw, - struct gl_shader_program *prog, + struct gl_shader *shader, struct brw_stage_state *stage_state, struct brw_stage_prog_data *prog_data); void brw_upload_image_surfaces(struct brw_context *brw, @@ -1680,6 +1675,7 @@ struct opcode_desc { extern const struct opcode_desc opcode_descs[128]; extern const char * const conditional_modifier[16]; +extern const char *const pred_ctrl_align16[16]; void brw_emit_depthbuffer(struct brw_context *brw); diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 169d092f90e..754da9fc3da 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -913,20 +913,15 @@ enum opcode { /** * Same as FS_OPCODE_FB_WRITE but expects its arguments separately as - * individual sources instead of as a single payload blob: - * - * Source 0: [required] Color 0. - * Source 1: [optional] Color 1 (for dual source blend messages). - * Source 2: [optional] Src0 Alpha. - * Source 3: [optional] Source Depth (gl_FragDepth) - * Source 4: [optional (gen4-5)] Destination Depth passthrough from thread - * Source 5: [optional] Sample Mask (gl_SampleMask). - * Source 6: [required] Number of color components (as a UD immediate). + * individual sources instead of as a single payload blob. The + * position/ordering of the arguments are defined by the enum + * fb_write_logical_srcs. */ FS_OPCODE_FB_WRITE_LOGICAL, FS_OPCODE_BLORP_FB_WRITE, FS_OPCODE_REP_FB_WRITE, + FS_OPCODE_PACK_STENCIL_REF, SHADER_OPCODE_RCP, SHADER_OPCODE_RSQ, SHADER_OPCODE_SQRT, @@ -1332,6 +1327,17 @@ enum brw_urb_write_flags { BRW_URB_WRITE_ALLOCATE | BRW_URB_WRITE_COMPLETE, }; +enum fb_write_logical_srcs { + FB_WRITE_LOGICAL_SRC_COLOR0, /* REQUIRED */ + FB_WRITE_LOGICAL_SRC_COLOR1, /* for dual source blend messages */ + FB_WRITE_LOGICAL_SRC_SRC0_ALPHA, + FB_WRITE_LOGICAL_SRC_SRC_DEPTH, /* gl_FragDepth */ + FB_WRITE_LOGICAL_SRC_DST_DEPTH, /* GEN4-5: passthrough from thread */ + FB_WRITE_LOGICAL_SRC_SRC_STENCIL, /* gl_FragStencilRefARB */ + FB_WRITE_LOGICAL_SRC_OMASK, /* Sample Mask (gl_SampleMask) */ + FB_WRITE_LOGICAL_SRC_COMPONENTS, /* REQUIRED */ +}; + #ifdef __cplusplus /** * Allow brw_urb_write_flags enums to be ORed together. 
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c index 65172490da3..6372fb5c55f 100644 --- a/src/mesa/drivers/dri/i965/brw_device_info.c +++ b/src/mesa/drivers/dri/i965/brw_device_info.c @@ -311,7 +311,7 @@ static const struct brw_device_info brw_device_info_chv = { .max_gs_threads = 336, \ .max_hs_threads = 336, \ .max_ds_threads = 336, \ - .max_wm_threads = 64 * 6, \ + .max_wm_threads = 64 * 9, \ .max_cs_threads = 56, \ .urb = { \ .size = 384, \ @@ -335,6 +335,10 @@ static const struct brw_device_info brw_device_info_skl_gt3 = { GEN9_FEATURES, .gt = 3, }; +static const struct brw_device_info brw_device_info_skl_gt4 = { + GEN9_FEATURES, .gt = 4, +}; + static const struct brw_device_info brw_device_info_bxt = { GEN9_FEATURES, .is_broxton = 1, @@ -359,7 +363,7 @@ static const struct brw_device_info brw_device_info_bxt = { }; const struct brw_device_info * -brw_get_device_info(int devid, int revision) +brw_get_device_info(int devid) { const struct brw_device_info *devinfo; switch (devid) { diff --git a/src/mesa/drivers/dri/i965/brw_device_info.h b/src/mesa/drivers/dri/i965/brw_device_info.h index 7bab5716b43..6f4a250e874 100644 --- a/src/mesa/drivers/dri/i965/brw_device_info.h +++ b/src/mesa/drivers/dri/i965/brw_device_info.h @@ -86,5 +86,5 @@ struct brw_device_info /** @} */ }; -const struct brw_device_info *brw_get_device_info(int devid, int revision); +const struct brw_device_info *brw_get_device_info(int devid); const char *brw_get_device_name(int devid); diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c index db23a187a93..df747107188 100644 --- a/src/mesa/drivers/dri/i965/brw_disasm.c +++ b/src/mesa/drivers/dri/i965/brw_disasm.c @@ -252,7 +252,7 @@ static const char *const pred_inv[2] = { [1] = "-" }; -static const char *const pred_ctrl_align16[16] = { +const char *const pred_ctrl_align16[16] = { [1] = "", [2] = ".x", [3] = ".y", @@ -726,7 +726,7 @@ reg(FILE *file, unsigned _reg_file, unsigned _reg_nr) switch (_reg_nr & 0xf0) { case BRW_ARF_NULL: string(file, "null"); - return -1; + break; case BRW_ARF_ADDRESS: format(file, "a%d", _reg_nr & 0x0f); break; @@ -908,7 +908,6 @@ src_ia1(FILE *file, unsigned _addr_subreg_nr, unsigned _negate, unsigned __abs, - unsigned _addr_mode, unsigned _horiz_stride, unsigned _width, unsigned _vert_stride) { int err = 0; @@ -1143,7 +1142,6 @@ src0(FILE *file, const struct brw_device_info *devinfo, brw_inst *inst) brw_inst_src0_ia_subreg_nr(devinfo, inst), brw_inst_src0_negate(devinfo, inst), brw_inst_src0_abs(devinfo, inst), - brw_inst_src0_address_mode(devinfo, inst), brw_inst_src0_hstride(devinfo, inst), brw_inst_src0_width(devinfo, inst), brw_inst_src0_vstride(devinfo, inst)); @@ -1200,7 +1198,6 @@ src1(FILE *file, const struct brw_device_info *devinfo, brw_inst *inst) brw_inst_src1_ia_subreg_nr(devinfo, inst), brw_inst_src1_negate(devinfo, inst), brw_inst_src1_abs(devinfo, inst), - brw_inst_src1_address_mode(devinfo, inst), brw_inst_src1_hstride(devinfo, inst), brw_inst_src1_width(devinfo, inst), brw_inst_src1_vstride(devinfo, inst)); diff --git a/src/mesa/drivers/dri/i965/brw_eu.c b/src/mesa/drivers/dri/i965/brw_eu.c index 1f4a3516fa2..40ec87d38f0 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.c +++ b/src/mesa/drivers/dri/i965/brw_eu.c @@ -261,7 +261,7 @@ void brw_disassemble(const struct brw_device_info *devinfo, void *assembly, int start, int end, FILE *out) { - bool dump_hex = false; + bool dump_hex = (INTEL_DEBUG & DEBUG_HEX) != 0; for (int offset = 
start; offset < end;) { brw_inst *insn = assembly + offset; diff --git a/src/mesa/drivers/dri/i965/brw_eu_compact.c b/src/mesa/drivers/dri/i965/brw_eu_compact.c index f787ea3d4f8..07ace6bfbcb 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_compact.c +++ b/src/mesa/drivers/dri/i965/brw_eu_compact.c @@ -1407,6 +1407,9 @@ void brw_compact_instructions(struct brw_codegen *p, int start_offset, int num_annotations, struct annotation *annotation) { + if (unlikely(INTEL_DEBUG & DEBUG_NO_COMPACTION)) + return; + const struct brw_device_info *devinfo = p->devinfo; void *store = p->store + start_offset / 16; /* For an instruction at byte offset 16*i before compaction, this is the diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index bf2fee9ed48..a6fbb542919 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -410,7 +410,7 @@ brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.dw1.bits.indirect_offset); } else { - brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.dw1.bits.indirect_offset); + brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.dw1.bits.indirect_offset); } } @@ -2511,12 +2511,20 @@ brw_send_indirect_message(struct brw_codegen *p, struct brw_reg desc) { const struct brw_device_info *devinfo = p->devinfo; - struct brw_inst *send, *setup; + struct brw_inst *send; + int setup; assert(desc.type == BRW_REGISTER_TYPE_UD); + /* We hold on to the setup instruction (the SEND in the direct case, the OR + * in the indirect case) by its index in the instruction store. The + * pointer returned by next_insn() may become invalid if emitting the SEND + * in the indirect case reallocs the store. + */ + if (desc.file == BRW_IMMEDIATE_VALUE) { - setup = send = next_insn(p, BRW_OPCODE_SEND); + setup = p->nr_insn; + send = next_insn(p, BRW_OPCODE_SEND); brw_set_src1(p, send, desc); } else { @@ -2531,7 +2539,8 @@ brw_send_indirect_message(struct brw_codegen *p, * caller can specify additional descriptor bits with the usual * brw_set_*_message() helper functions. */ - setup = brw_OR(p, addr, desc, brw_imm_ud(0)); + setup = p->nr_insn; + brw_OR(p, addr, desc, brw_imm_ud(0)); brw_pop_insn_state(p); @@ -2543,7 +2552,7 @@ brw_send_indirect_message(struct brw_codegen *p, brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); brw_inst_set_sfid(devinfo, send, sfid); - return setup; + return &p->store[setup]; } static struct brw_inst * @@ -2906,11 +2915,10 @@ brw_untyped_surface_read(struct brw_codegen *p, const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ? 
HSW_SFID_DATAPORT_DATA_CACHE_1 : GEN7_SFID_DATAPORT_DATA_CACHE); - const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1); struct brw_inst *insn = brw_send_indirect_surface_message( p, sfid, dst, payload, surface, msg_length, brw_surface_payload_size(p, num_channels, true, true), - align1); + false); brw_set_dp_untyped_surface_read_message( p, insn, num_channels); diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 8320cd77299..e218a85a363 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -88,8 +88,6 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, case IMM: case UNIFORM: unreachable("Invalid destination register file"); - default: - unreachable("Invalid register file"); } this->writes_accumulator = false; @@ -538,18 +536,6 @@ fs_visitor::get_timestamp(const fs_builder &bld) */ bld.group(4, 0).exec_all().MOV(dst, ts); - /* The caller wants the low 32 bits of the timestamp. Since it's running - * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds, - * which is plenty of time for our purposes. It is identical across the - * EUs, but since it's tracking GPU core speed it will increment at a - * varying rate as render P-states change. - * - * The caller could also check if render P-states have changed (or anything - * else that might disrupt timing) by setting smear to 2 and checking if - * that field is != 0. - */ - dst.set_smear(0); - return dst; } @@ -557,6 +543,14 @@ void fs_visitor::emit_shader_time_begin() { shader_start_time = get_timestamp(bld.annotate("shader time start")); + + /* We want only the low 32 bits of the timestamp. Since it's running + * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds, + * which is plenty of time for our purposes. It is identical across the + * EUs, but since it's tracking GPU core speed it will increment at a + * varying rate as render P-states change. + */ + shader_start_time.set_smear(0); } void @@ -570,6 +564,15 @@ fs_visitor::emit_shader_time_end() fs_reg shader_end_time = get_timestamp(ibld); + /* We only use the low 32 bits of the timestamp - see + * emit_shader_time_begin()). + * + * We could also check if render P-states have changed (or anything + * else that might disrupt timing) by setting smear to 2 and checking if + * that field is != 0. + */ + shader_end_time.set_smear(0); + /* Check that there weren't any timestamp reset events (assuming these * were the only two timestamp reads that happened). */ @@ -700,10 +703,10 @@ fs_inst::components_read(unsigned i) const return 2; case FS_OPCODE_FB_WRITE_LOGICAL: - assert(src[6].file == IMM); + assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM); /* First/second FB write color. 
*/ if (i < 2) - return src[6].fixed_hw_reg.dw1.ud; + return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].fixed_hw_reg.dw1.ud; else return 1; @@ -841,9 +844,8 @@ fs_inst::regs_read(int arg) const REG_SIZE); case MRF: unreachable("MRF registers are not allowed as sources"); - default: - unreachable("Invalid register file"); } + return 0; } bool @@ -1283,9 +1285,9 @@ fs_visitor::emit_sampleid_setup() fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type)); if (key->compute_sample_id) { - fs_reg t1 = vgrf(glsl_type::int_type); - fs_reg t2 = vgrf(glsl_type::int_type); - t2.type = BRW_REGISTER_TYPE_UW; + fs_reg t1(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_D); + t1.set_smear(0); + fs_reg t2(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_W); /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with * 8x multisampling, subspan 0 will represent sample N (where N @@ -1306,13 +1308,13 @@ fs_visitor::emit_sampleid_setup() * are sample 1 of subspan 0; the third group is sample 0 of * subspan 1, and finally sample 1 of subspan 1. */ - abld.exec_all() - .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)), + abld.exec_all().group(1, 0) + .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), fs_reg(0xc0)); - abld.exec_all().SHR(t1, t1, fs_reg(5)); + abld.exec_all().group(1, 0).SHR(t1, t1, fs_reg(5)); /* This works for both SIMD8 and SIMD16 */ - abld.exec_all() + abld.exec_all().group(4, 0) .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)); /* This special instruction takes care of setting vstride=1, @@ -1443,6 +1445,9 @@ fs_visitor::calculate_urb_setup() } } } else { + bool include_vue_header = + nir->info.inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT); + /* We have enough input varyings that the SF/SBE pipeline stage can't * arbitrarily rearrange them to suit our whim; we have to put them * in an order that matches the output of the previous pipeline stage @@ -1452,15 +1457,14 @@ fs_visitor::calculate_urb_setup() brw_compute_vue_map(devinfo, &prev_stage_vue_map, key->input_slots_valid, nir->info.separate_shader); - int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET; + int first_slot = + include_vue_header ? 0 : 2 * BRW_SF_URB_ENTRY_READ_OFFSET; + assert(prev_stage_vue_map.num_slots <= first_slot + 32); for (int slot = first_slot; slot < prev_stage_vue_map.num_slots; slot++) { int varying = prev_stage_vue_map.slot_to_varying[slot]; - /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is - * unused. 
- */ - if (varying != BRW_VARYING_SLOT_COUNT && + if (varying != BRW_VARYING_SLOT_PAD && (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK & BITFIELD64_BIT(varying))) { prog_data->urb_setup[varying] = slot - first_slot; @@ -2615,7 +2619,7 @@ fs_visitor::eliminate_find_live_channel() case SHADER_OPCODE_FIND_LIVE_CHANNEL: if (depth == 0) { inst->opcode = BRW_OPCODE_MOV; - inst->src[0] = fs_reg(0); + inst->src[0] = fs_reg(0u); inst->sources = 1; inst->force_writemask_all = true; progress = true; @@ -2643,8 +2647,9 @@ fs_visitor::emit_repclear_shader() fs_inst *mov; if (uniforms == 1) { - mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)), - fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)); + mov = bld.exec_all().group(4, 0) + .MOV(brw_message_reg(color_mrf), + fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)); } else { struct brw_reg reg = brw_reg(BRW_GENERAL_REGISTER_FILE, @@ -2653,8 +2658,8 @@ fs_visitor::emit_repclear_shader() BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); - mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)), - fs_reg(reg)); + mov = bld.exec_all().group(4, 0) + .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg)); } fs_inst *write; @@ -3366,15 +3371,17 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, const brw_wm_prog_key *key, const fs_visitor::thread_payload &payload) { - assert(inst->src[6].file == IMM); + assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM); const brw_device_info *devinfo = bld.shader->devinfo; - const fs_reg &color0 = inst->src[0]; - const fs_reg &color1 = inst->src[1]; - const fs_reg &src0_alpha = inst->src[2]; - const fs_reg &src_depth = inst->src[3]; - const fs_reg &dst_depth = inst->src[4]; - fs_reg sample_mask = inst->src[5]; - const unsigned components = inst->src[6].fixed_hw_reg.dw1.ud; + const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0]; + const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1]; + const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA]; + const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH]; + const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH]; + const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL]; + fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK]; + const unsigned components = + inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].fixed_hw_reg.dw1.ud; /* We can potentially have a message length of up to 15, so we have to set * base_mrf to either 0 or 1 in order to fit in m0..m15. @@ -3464,6 +3471,17 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, length++; } + if (src_stencil.file != BAD_FILE) { + assert(devinfo->gen >= 9); + assert(bld.dispatch_width() != 16); + + sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.exec_all().annotate("FB write OS") + .emit(FS_OPCODE_PACK_STENCIL_REF, sources[length], + retype(src_stencil, BRW_REGISTER_TYPE_UB)); + length++; + } + fs_inst *load; if (devinfo->gen >= 7) { /* Send from the GRF */ @@ -4073,7 +4091,7 @@ fs_visitor::lower_logical_sends() case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: lower_surface_logical_send(ibld, inst, SHADER_OPCODE_UNTYPED_SURFACE_READ, - fs_reg(0xffff)); + fs_reg()); break; case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: @@ -4202,10 +4220,12 @@ get_lowered_simd_width(const struct brw_device_info *devinfo, /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them * here. 
*/ - assert(devinfo->gen != 6 || inst->src[3].file == BAD_FILE || + assert(devinfo->gen != 6 || + inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE || inst->exec_size == 8); /* Dual-source FB writes are unsupported in SIMD16 mode. */ - return (inst->src[1].file != BAD_FILE ? 8 : inst->exec_size); + return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ? + 8 : inst->exec_size); case SHADER_OPCODE_TXD_LOGICAL: /* TXD is unsupported in SIMD16 mode. */ @@ -4499,9 +4519,8 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) if (inst->dst.fixed_hw_reg.subnr) fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr); break; - default: - fprintf(file, "???"); - break; + case IMM: + unreachable("not reached"); } fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type)); @@ -4594,9 +4613,6 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) if (inst->src[i].fixed_hw_reg.abs) fprintf(file, "|"); break; - default: - fprintf(file, "???"); - break; } if (inst->src[i].abs) fprintf(file, "|"); @@ -4977,8 +4993,7 @@ fs_visitor::allocate_registers() if (failed) return; - if (!allocated_without_spills) - schedule_instructions(SCHEDULE_POST); + schedule_instructions(SCHEDULE_POST); if (last_scratch > 0) prog_data->total_scratch = brw_get_scratch_size(last_scratch); @@ -5236,6 +5251,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, prog_data->uses_omask = shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK); prog_data->computed_depth_mode = computed_depth_mode(shader); + prog_data->computed_stencil = + shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 50e98becf03..8058b344b7a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -145,6 +145,8 @@ public: void assign_vs_urb_setup(); bool assign_regs(bool allow_spilling); void assign_regs_trivial(); + void calculate_payload_ranges(int payload_node_count, + int *payload_last_use_ip); void setup_payload_interference(struct ra_graph *g, int payload_reg_count, int first_payload_node); int choose_spill_reg(struct ra_graph *g); @@ -337,6 +339,7 @@ public: int *push_constant_loc; fs_reg frag_depth; + fs_reg frag_stencil; fs_reg sample_mask; fs_reg outputs[VARYING_SLOT_MAX]; unsigned output_components[VARYING_SLOT_MAX]; @@ -427,6 +430,8 @@ private: void generate_urb_read(fs_inst *inst, struct brw_reg dst, struct brw_reg payload); void generate_urb_write(fs_inst *inst, struct brw_reg payload); void generate_cs_terminate(fs_inst *inst, struct brw_reg payload); + void generate_stencil_ref_packing(fs_inst *inst, struct brw_reg dst, + struct brw_reg src); void generate_barrier(fs_inst *inst, struct brw_reg src); void generate_blorp_fb_write(fs_inst *inst); void generate_linterp(fs_inst *inst, struct brw_reg dst, diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h index df10a9de293..f121f3463d3 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_builder.h +++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h @@ -390,14 +390,21 @@ namespace brw { src_reg emit_uniformize(const src_reg &src) const { + /* FIXME: We use a vector chan_index and dst to allow constant and + * copy propagration to move result all the way into the consuming + * instruction (typically a surface index or sampler index for a + * send). 
This uses 1 or 3 extra hw registers in 16 or 32 wide + * dispatch. Once we teach const/copy propagation about scalars we + * should go back to scalar destinations here. + */ const fs_builder ubld = exec_all(); - const dst_reg chan_index = component(vgrf(BRW_REGISTER_TYPE_UD), 0); - const dst_reg dst = component(vgrf(src.type), 0); + const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD); + const dst_reg dst = vgrf(src.type); ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index); - ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index); + ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0)); - return src_reg(dst); + return src_reg(component(dst, 0)); } /** diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index 5589716239a..26204827156 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -416,9 +416,10 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) inst->src[arg].subreg_offset = offset % 32; } break; - default: - unreachable("Invalid register file"); - break; + + case MRF: + case IMM: + unreachable("not reached"); } if (has_source_modifiers) { @@ -612,6 +613,21 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) } break; + case SHADER_OPCODE_UNTYPED_ATOMIC: + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + case SHADER_OPCODE_TYPED_ATOMIC: + case SHADER_OPCODE_TYPED_SURFACE_READ: + case SHADER_OPCODE_TYPED_SURFACE_WRITE: + /* We only propagate into the surface argument of the + * instruction. Everything else goes through LOAD_PAYLOAD. + */ + if (i == 1) { + inst->src[i] = val; + progress = true; + } + break; + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: case SHADER_OPCODE_BROADCAST: inst->src[i] = val; diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp index c7628dcc2f4..3a28c8d591d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp @@ -93,7 +93,8 @@ is_expression(const fs_visitor *v, const fs_inst *const inst) case SHADER_OPCODE_LOAD_PAYLOAD: return !inst->is_copy_payload(v->alloc); default: - return inst->is_send_from_grf() && !inst->has_side_effects(); + return inst->is_send_from_grf() && !inst->has_side_effects() && + !inst->is_volatile(); } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index bb7e792044f..e207a77fdc1 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -42,9 +42,13 @@ static uint32_t brw_file_from_reg(fs_reg *reg) return BRW_MESSAGE_REGISTER_FILE; case IMM: return BRW_IMMEDIATE_VALUE; - default: + case BAD_FILE: + case HW_REG: + case ATTR: + case UNIFORM: unreachable("not reached"); } + return 0; } static struct brw_reg @@ -116,7 +120,8 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen) /* Probably unused. 
*/ brw_reg = brw_null_reg(); break; - default: + case ATTR: + case UNIFORM: unreachable("not reached"); } if (reg->abs) @@ -317,6 +322,14 @@ fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload) brw_imm_ud(inst->target)); } + /* Set computes stencil to render target */ + if (prog_data->computed_stencil) { + brw_OR(p, + vec1(retype(payload, BRW_REGISTER_TYPE_UD)), + vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)), + brw_imm_ud(0x1 << 14)); + } + implied_header = brw_null_reg(); } else { implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); @@ -437,6 +450,47 @@ fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload) } void +fs_generator::generate_stencil_ref_packing(fs_inst *inst, + struct brw_reg dst, + struct brw_reg src) +{ + assert(dispatch_width == 8); + assert(devinfo->gen >= 9); + + /* Stencil value updates are provided in 8 slots of 1 byte per slot. + * Presumably, in order to save memory bandwidth, the stencil reference + * values written from the FS need to be packed into 2 dwords (this makes + * sense because the stencil values are limited to 1 byte each and a SIMD8 + * send, so stencil slots 0-3 in dw0, and 4-7 in dw1.) + * + * The spec is confusing here because in the payload definition of MDP_RTW_S8 + * (Message Data Payload for Render Target Writes with Stencil 8b) the + * stencil value seems to be dw4.0-dw4.7. However, if you look at the type of + * dw4 it is type MDPR_STENCIL (Message Data Payload Register) which is the + * packed values specified above and diagrammed below: + * + * 31 0 + * -------------------------------- + * DW | | + * 2-7 | IGNORED | + * | | + * -------------------------------- + * DW1 | STC | STC | STC | STC | + * | slot7 | slot6 | slot5 | slot4| + * -------------------------------- + * DW0 | STC | STC | STC | STC | + * | slot3 | slot2 | slot1 | slot0| + * -------------------------------- + */ + + src.vstride = BRW_VERTICAL_STRIDE_4; + src.width = BRW_WIDTH_1; + src.hstride = BRW_HORIZONTAL_STRIDE_0; + assert(src.type == BRW_REGISTER_TYPE_UB); + brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UB), src); +} + +void fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src) { brw_barrier(p, src); @@ -1455,18 +1509,18 @@ fs_generator::generate_set_sample_id(fs_inst *inst, assert(src0.type == BRW_REGISTER_TYPE_D || src0.type == BRW_REGISTER_TYPE_UD); - brw_push_insn_state(p); - brw_set_default_exec_size(p, BRW_EXECUTE_8); - brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - struct brw_reg reg = retype(stride(src1, 1, 4, 0), BRW_REGISTER_TYPE_UW); - if (dispatch_width == 8) { + struct brw_reg reg = stride(src1, 1, 4, 0); + if (devinfo->gen >= 8 || dispatch_width == 8) { brw_ADD(p, dst, src0, reg); } else if (dispatch_width == 16) { + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); brw_ADD(p, firsthalf(dst), firsthalf(src0), reg); + brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2)); + brw_pop_insn_state(p); } - brw_pop_insn_state(p); } void @@ -2182,6 +2236,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) generate_barrier(inst, src[0]); break; + case FS_OPCODE_PACK_STENCIL_REF: + generate_stencil_ref_packing(inst, dst, src[0]); + break; + default: unreachable("Unsupported opcode"); @@ -2216,9 +2274,9 @@ fs_generator::generate_code(const cfg_t *cfg, int 
dispatch_width) if (unlikely(debug_flag)) { fprintf(stderr, "Native code for %s\n" - "SIMD%d shader: %d instructions. %d loops. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d" + "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d" " bytes (%.0f%%)\n", - shader_name, dispatch_width, before_size / 16, loop_count, + shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count, spill_count, fill_count, promoted_constants, before_size, after_size, 100.0f * (before_size - after_size) / before_size); @@ -2228,12 +2286,13 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) } compiler->shader_debug_log(log_data, - "%s SIMD%d shader: %d inst, %d loops, " + "%s SIMD%d shader: %d inst, %d loops, %u cycles, " "%d:%d spills:fills, Promoted %u constants, " "compacted %d to %d bytes.\n", stage_abbrev, dispatch_width, before_size / 16, - loop_count, spill_count, fill_count, - promoted_constants, before_size, after_size); + loop_count, cfg->cycle_count, spill_count, + fill_count, promoted_constants, before_size, + after_size); return start_offset; } diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 7b5a0482519..486741bea31 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -71,6 +71,14 @@ fs_visitor::nir_setup_inputs() var->data.origin_upper_left); emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(), input, reg), 0xF); + } else if (var->data.location == VARYING_SLOT_LAYER) { + struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_LAYER, 1), 3); + reg.type = BRW_REGISTER_TYPE_D; + bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D), reg); + } else if (var->data.location == VARYING_SLOT_VIEWPORT) { + struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_VIEWPORT, 2), 3); + reg.type = BRW_REGISTER_TYPE_D; + bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D), reg); } else { emit_general_interpolation(input, var->name, var->type, (glsl_interp_qualifier) var->data.interpolation, @@ -114,6 +122,8 @@ fs_visitor::nir_setup_outputs() } } else if (var->data.location == FRAG_RESULT_DEPTH) { this->frag_depth = reg; + } else if (var->data.location == FRAG_RESULT_STENCIL) { + this->frag_stencil = reg; } else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) { this->sample_mask = reg; } else { @@ -896,12 +906,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then * subtract the result from 31 to convert the MSB count into an LSB count. */ - bld.CMP(bld.null_reg_d(), result, fs_reg(-1), BRW_CONDITIONAL_NZ); - fs_reg neg_result(result); - neg_result.negate = true; - inst = bld.ADD(result, neg_result, fs_reg(31)); + + inst = bld.ADD(result, result, fs_reg(31)); inst->predicate = BRW_PREDICATE_NORMAL; + inst->src[0].negate = true; break; } @@ -1322,6 +1331,15 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } + case nir_intrinsic_shader_clock: { + /* We cannot do anything if there is an event, so ignore it for now */ + fs_reg shader_clock = get_timestamp(bld); + const fs_reg srcs[] = { shader_clock.set_smear(0), shader_clock.set_smear(1) }; + + bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); + break; + } + case nir_intrinsic_image_size: { /* Get the referenced image variable and type. 
*/ const nir_variable *var = instr->variables[0]->var; @@ -1509,7 +1527,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr surf_index = vgrf(glsl_type::uint_type); bld.ADD(surf_index, get_nir_src(instr->src[0]), fs_reg(stage_prog_data->binding_table.ssbo_start)); - surf_index = bld.emit_uniformize(surf_index); /* Assume this may touch any UBO. It would be nice to provide * a tighter bound, but the array information is already lowered away. @@ -1520,34 +1537,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr } /* Get the offset to read from */ - fs_reg offset_reg = vgrf(glsl_type::uint_type); - unsigned const_offset_bytes = 0; + fs_reg offset_reg; if (has_indirect) { - bld.MOV(offset_reg, get_nir_src(instr->src[1])); + offset_reg = get_nir_src(instr->src[1]); } else { - const_offset_bytes = instr->const_index[0]; - bld.MOV(offset_reg, fs_reg(const_offset_bytes)); + offset_reg = fs_reg(instr->const_index[0]); } /* Read the vector */ - for (int i = 0; i < instr->num_components; i++) { - fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, - 1 /* dims */, 1 /* size */, - BRW_PREDICATE_NONE); - read_result.type = dest.type; - bld.MOV(dest, read_result); - dest = offset(dest, bld, 1); - - /* Vector components are stored contiguous in memory */ - if (i < instr->num_components) { - if (!has_indirect) { - const_offset_bytes += 4; - bld.MOV(offset_reg, fs_reg(const_offset_bytes)); - } else { - bld.ADD(offset_reg, offset_reg, brw_imm_ud(4)); - } - } - } + fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, + 1 /* dims */, + instr->num_components, + BRW_PREDICATE_NONE); + read_result.type = dest.type; + for (int i = 0; i < instr->num_components; i++) + bld.MOV(offset(dest, bld, i), offset(read_result, bld, i)); break; } @@ -1765,52 +1769,46 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr surf_index = vgrf(glsl_type::uint_type); bld.ADD(surf_index, get_nir_src(instr->src[1]), fs_reg(stage_prog_data->binding_table.ssbo_start)); - surf_index = bld.emit_uniformize(surf_index); brw_mark_surface_used(prog_data, stage_prog_data->binding_table.ssbo_start + nir->info.num_ssbos - 1); } - /* Offset */ - fs_reg offset_reg = vgrf(glsl_type::uint_type); - unsigned const_offset_bytes = 0; - if (has_indirect) { - bld.MOV(offset_reg, get_nir_src(instr->src[2])); - } else { - const_offset_bytes = instr->const_index[0]; - bld.MOV(offset_reg, fs_reg(const_offset_bytes)); - } - /* Value */ fs_reg val_reg = get_nir_src(instr->src[0]); /* Writemask */ unsigned writemask = instr->const_index[1]; - /* Write each component present in the writemask */ - unsigned skipped_channels = 0; - for (int i = 0; i < instr->num_components; i++) { - int component_mask = 1 << i; - if (writemask & component_mask) { - if (skipped_channels) { - if (!has_indirect) { - const_offset_bytes += 4 * skipped_channels; - bld.MOV(offset_reg, fs_reg(const_offset_bytes)); - } else { - bld.ADD(offset_reg, offset_reg, - brw_imm_ud(4 * skipped_channels)); - } - skipped_channels = 0; - } + /* Combine groups of consecutive enabled channels in one write + * message. We use ffs to find the first enabled channel and then ffs on + * the bit-inverse, down-shifted writemask to determine the length of + * the block of enabled bits. 
+ */ + while (writemask) { + unsigned first_component = ffs(writemask) - 1; + unsigned length = ffs(~(writemask >> first_component)) - 1; + fs_reg offset_reg; - emit_untyped_write(bld, surf_index, offset_reg, - offset(val_reg, bld, i), - 1 /* dims */, 1 /* size */, - BRW_PREDICATE_NONE); + if (!has_indirect) { + offset_reg = fs_reg(instr->const_index[0] + 4 * first_component); + } else { + offset_reg = vgrf(glsl_type::uint_type); + bld.ADD(offset_reg, + retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD), + fs_reg(4 * first_component)); } - skipped_channels++; + emit_untyped_write(bld, surf_index, offset_reg, + offset(val_reg, bld, first_component), + 1 /* dims */, length, + BRW_PREDICATE_NONE); + + /* Clear the bits in the writemask that we just wrote, then try + * again to see if more channels are left. + */ + writemask &= (15 << (first_component + length)); } break; } diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index 36388fad98d..9251d9552a5 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -330,32 +330,12 @@ count_to_loop_end(const bblock_t *block) unreachable("not reached"); } -/** - * Sets up interference between thread payload registers and the virtual GRFs - * to be allocated for program temporaries. - * - * We want to be able to reallocate the payload for our virtual GRFs, notably - * because the setup coefficients for a full set of 16 FS inputs takes up 8 of - * our 128 registers. - * - * The layout of the payload registers is: - * - * 0..payload.num_regs-1: fixed function setup (including bary coordinates). - * payload.num_regs..payload.num_regs+curb_read_lengh-1: uniform data - * payload.num_regs+curb_read_lengh..first_non_payload_grf-1: setup coefficients. - * - * And we have payload_node_count nodes covering these registers in order - * (note that in SIMD16, a node is two registers). - */ -void -fs_visitor::setup_payload_interference(struct ra_graph *g, - int payload_node_count, - int first_payload_node) +void fs_visitor::calculate_payload_ranges(int payload_node_count, + int *payload_last_use_ip) { int loop_depth = 0; int loop_end_ip = 0; - int payload_last_use_ip[payload_node_count]; for (int i = 0; i < payload_node_count; i++) payload_last_use_ip[i] = -1; @@ -426,6 +406,33 @@ fs_visitor::setup_payload_interference(struct ra_graph *g, ip++; } +} + + +/** + * Sets up interference between thread payload registers and the virtual GRFs + * to be allocated for program temporaries. + * + * We want to be able to reallocate the payload for our virtual GRFs, notably + * because the setup coefficients for a full set of 16 FS inputs takes up 8 of + * our 128 registers. + * + * The layout of the payload registers is: + * + * 0..payload.num_regs-1: fixed function setup (including bary coordinates). + * payload.num_regs..payload.num_regs+curb_read_lengh-1: uniform data + * payload.num_regs+curb_read_lengh..first_non_payload_grf-1: setup coefficients. + * + * And we have payload_node_count nodes covering these registers in order + * (note that in SIMD16, a node is two registers). 
+ */ +void +fs_visitor::setup_payload_interference(struct ra_graph *g, + int payload_node_count, + int first_payload_node) +{ + int payload_last_use_ip[payload_node_count]; + calculate_payload_ranges(payload_node_count, payload_last_use_ip); for (int i = 0; i < payload_node_count; i++) { if (payload_last_use_ip[i] == -1) diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 7cc4f3c927a..5c57944ca39 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -697,7 +697,7 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld, const fs_reg dst_depth = (payload.dest_depth_reg ? fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0)) : fs_reg()); - fs_reg src_depth; + fs_reg src_depth, src_stencil; if (source_depth_to_render_target) { if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) @@ -706,10 +706,14 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld, src_depth = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)); } + if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) + src_stencil = frag_stencil; + const fs_reg sources[] = { - color0, color1, src0_alpha, src_depth, dst_depth, sample_mask, - fs_reg(components) + color0, color1, src0_alpha, src_depth, dst_depth, src_stencil, + sample_mask, fs_reg(components) }; + assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS); fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(), sources, ARRAY_SIZE(sources)); @@ -740,6 +744,16 @@ fs_visitor::emit_fb_writes() no16("Missing support for simd16 depth writes on gen6\n"); } + if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) { + /* From the 'Render Target Write message' section of the docs: + * "Output Stencil is not supported with SIMD16 Render Target Write + * Messages." 
+ * + * FINISHME: split 16 into 2 8s + */ + no16("FINISHME: support 2 simd8 writes for gl_FragStencilRefARB\n"); + } + if (do_dual_src) { const fs_builder abld = bld.annotate("FB dual-source write"); diff --git a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c index 00125c0f405..76ed237d88a 100644 --- a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c @@ -105,8 +105,8 @@ brw_upload_gs_abo_surfaces(struct brw_context *brw) if (prog) { /* BRW_NEW_GS_PROG_DATA */ - brw_upload_abo_surfaces(brw, prog, &brw->gs.base, - &brw->gs.prog_data->base.base); + brw_upload_abo_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_GEOMETRY], + &brw->gs.base, &brw->gs.prog_data->base.base); } } diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h index 7726e4b78a0..4417555f18e 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_fs.h +++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h @@ -97,7 +97,9 @@ byte_offset(fs_reg reg, unsigned delta) case MRF: reg.reg += delta / 32; break; - default: + case IMM: + case HW_REG: + case UNIFORM: assert(delta == 0); } reg.subreg_offset += delta % 32; @@ -119,7 +121,7 @@ horiz_offset(fs_reg reg, unsigned delta) case MRF: case ATTR: return byte_offset(reg, delta * reg.stride * type_sz(reg.type)); - default: + case HW_REG: assert(delta == 0); } return reg; @@ -163,7 +165,6 @@ half(fs_reg reg, unsigned idx) case ATTR: case HW_REG: - default: unreachable("Cannot take half of this register type"); } return reg; diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h index 1b57b65db27..29642c6d2a4 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h @@ -161,9 +161,6 @@ public: const src_reg &src1 = src_reg(), const src_reg &src2 = src_reg()); - struct brw_reg get_dst(unsigned gen); - struct brw_reg get_src(const struct brw_vue_prog_data *prog_data, int i); - dst_reg dst; src_reg src[3]; @@ -186,6 +183,27 @@ public: return predicate || opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2; } + bool reads_flag(unsigned c) + { + if (opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2) + return true; + + switch (predicate) { + case BRW_PREDICATE_NONE: + return false; + case BRW_PREDICATE_ALIGN16_REPLICATE_X: + return c == 0; + case BRW_PREDICATE_ALIGN16_REPLICATE_Y: + return c == 1; + case BRW_PREDICATE_ALIGN16_REPLICATE_Z: + return c == 2; + case BRW_PREDICATE_ALIGN16_REPLICATE_W: + return c == 3; + default: + return true; + } + } + bool writes_flag() { return (conditional_mod && (opcode != BRW_OPCODE_SEL && diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index 9a33188cb5c..8c1a34ee17a 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -205,6 +205,9 @@ brw_create_nir(struct brw_context *brw, if (shader_prog) { nir_lower_samplers(nir, shader_prog); nir_validate_shader(nir); + + nir_lower_atomics(nir, shader_prog); + nir_validate_shader(nir); } brw_postprocess_nir(nir, brw->intelScreen->devinfo, is_scalar); @@ -278,9 +281,6 @@ brw_postprocess_nir(nir_shader *nir, nir_lower_system_values(nir); nir_validate_shader(nir); - nir_lower_atomics(nir); - nir_validate_shader(nir); - nir_optimize(nir, is_scalar); if (devinfo->gen >= 6) { diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h index 87e7e011541..083c46a3726 100644 --- a/src/mesa/drivers/dri/i965/brw_reg.h +++ 
b/src/mesa/drivers/dri/i965/brw_reg.h @@ -205,7 +205,7 @@ enum PACKED brw_reg_type { /** @} */ /** Immediates only: @{ */ - BRW_REGISTER_TYPE_UV, + BRW_REGISTER_TYPE_UV, /* Gen6+ */ BRW_REGISTER_TYPE_V, BRW_REGISTER_TYPE_VF, /** @} */ diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index b710c60148c..88c45f74333 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -26,6 +26,7 @@ */ #include "brw_fs.h" +#include "brw_fs_live_variables.h" #include "brw_vec4.h" #include "brw_cfg.h" #include "brw_shader.h" @@ -400,22 +401,49 @@ schedule_node::set_latency_gen7(bool is_haswell) class instruction_scheduler { public: instruction_scheduler(backend_shader *s, int grf_count, + int hw_reg_count, int block_count, instruction_scheduler_mode mode) { this->bs = s; this->mem_ctx = ralloc_context(NULL); this->grf_count = grf_count; + this->hw_reg_count = hw_reg_count; this->instructions.make_empty(); this->instructions_to_schedule = 0; this->post_reg_alloc = (mode == SCHEDULE_POST); this->mode = mode; this->time = 0; if (!post_reg_alloc) { - this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count); - this->grf_active = rzalloc_array(mem_ctx, bool, grf_count); + this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count); + + this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count); + for (int i = 0; i < block_count; i++) + this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD, + BITSET_WORDS(grf_count)); + + this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count); + for (int i = 0; i < block_count; i++) + this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD, + BITSET_WORDS(grf_count)); + + this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count); + for (int i = 0; i < block_count; i++) + this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD, + BITSET_WORDS(hw_reg_count)); + + this->written = rzalloc_array(mem_ctx, bool, grf_count); + + this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count); + + this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count); } else { - this->remaining_grf_uses = NULL; - this->grf_active = NULL; + this->reg_pressure_in = NULL; + this->livein = NULL; + this->liveout = NULL; + this->hw_liveout = NULL; + this->written = NULL; + this->reads_remaining = NULL; + this->hw_reads_remaining = NULL; } } @@ -442,7 +470,8 @@ public: */ virtual int issue_time(backend_instruction *inst) = 0; - virtual void count_remaining_grf_uses(backend_instruction *inst) = 0; + virtual void count_reads_remaining(backend_instruction *inst) = 0; + virtual void setup_liveness(cfg_t *cfg) = 0; virtual void update_register_pressure(backend_instruction *inst) = 0; virtual int get_register_pressure_benefit(backend_instruction *inst) = 0; @@ -453,33 +482,63 @@ public: bool post_reg_alloc; int instructions_to_schedule; int grf_count; + int hw_reg_count; int time; + int reg_pressure; + int block_idx; exec_list instructions; backend_shader *bs; instruction_scheduler_mode mode; - /** - * Number of instructions left to schedule that reference each vgrf. - * - * Used so that we can prefer scheduling instructions that will end the - * live intervals of multiple variables, to reduce register pressure. + /* + * The register pressure at the beginning of each basic block. */ - int *remaining_grf_uses; - /** - * Tracks whether each VGRF has had an instruction scheduled that uses it. 
- * - * This is used to estimate whether scheduling a new instruction will - * increase register pressure. + int *reg_pressure_in; + + /* + * The virtual GRF's whose range overlaps the beginning of each basic block. + */ + + BITSET_WORD **livein; + + /* + * The virtual GRF's whose range overlaps the end of each basic block. + */ + + BITSET_WORD **liveout; + + /* + * The hardware GRF's whose range overlaps the end of each basic block. + */ + + BITSET_WORD **hw_liveout; + + /* + * Whether we've scheduled a write for this virtual GRF yet. + */ + + bool *written; + + /* + * How many reads we haven't scheduled for this virtual GRF yet. + */ + + int *reads_remaining; + + /* + * How many reads we haven't scheduled for this hardware GRF yet. */ - bool *grf_active; + + int *hw_reads_remaining; }; class fs_instruction_scheduler : public instruction_scheduler { public: - fs_instruction_scheduler(fs_visitor *v, int grf_count, + fs_instruction_scheduler(fs_visitor *v, int grf_count, int hw_reg_count, + int block_count, instruction_scheduler_mode mode); void calculate_deps(); bool is_compressed(fs_inst *inst); @@ -487,35 +546,109 @@ public: int issue_time(backend_instruction *inst); fs_visitor *v; - void count_remaining_grf_uses(backend_instruction *inst); + void count_reads_remaining(backend_instruction *inst); + void setup_liveness(cfg_t *cfg); void update_register_pressure(backend_instruction *inst); int get_register_pressure_benefit(backend_instruction *inst); }; fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v, - int grf_count, + int grf_count, int hw_reg_count, + int block_count, instruction_scheduler_mode mode) - : instruction_scheduler(v, grf_count, mode), + : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode), v(v) { } +static bool +is_src_duplicate(fs_inst *inst, int src) +{ + for (int i = 0; i < src; i++) + if (inst->src[i].equals(inst->src[src])) + return true; + + return false; +} + void -fs_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be) +fs_instruction_scheduler::count_reads_remaining(backend_instruction *be) { fs_inst *inst = (fs_inst *)be; - if (!remaining_grf_uses) + if (!reads_remaining) return; - if (inst->dst.file == GRF) - remaining_grf_uses[inst->dst.reg]++; - for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file != GRF) + if (is_src_duplicate(inst, i)) continue; - remaining_grf_uses[inst->src[i].reg]++; + if (inst->src[i].file == GRF) { + reads_remaining[inst->src[i].reg]++; + } else if (inst->src[i].file == HW_REG && + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { + if (inst->src[i].fixed_hw_reg.nr >= hw_reg_count) + continue; + + for (int j = 0; j < inst->regs_read(i); j++) + hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + j]++; + } + } +} + +void +fs_instruction_scheduler::setup_liveness(cfg_t *cfg) +{ + /* First, compute liveness on a per-GRF level using the in/out sets from + * liveness calculation. 
+ */ + for (int block = 0; block < cfg->num_blocks; block++) { + for (int i = 0; i < v->live_intervals->num_vars; i++) { + if (BITSET_TEST(v->live_intervals->block_data[block].livein, i)) { + int vgrf = v->live_intervals->vgrf_from_var[i]; + if (!BITSET_TEST(livein[block], vgrf)) { + reg_pressure_in[block] += v->alloc.sizes[vgrf]; + BITSET_SET(livein[block], vgrf); + } + } + + if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i)) + BITSET_SET(liveout[block], v->live_intervals->vgrf_from_var[i]); + } + } + + /* Now, extend the live in/live out sets for when a range crosses a block + * boundary, which matches what our register allocator/interference code + * does to account for force_writemask_all and incompatible exec_mask's. + */ + for (int block = 0; block < cfg->num_blocks - 1; block++) { + for (int i = 0; i < grf_count; i++) { + if (v->virtual_grf_start[i] <= cfg->blocks[block]->end_ip && + v->virtual_grf_end[i] >= cfg->blocks[block + 1]->start_ip) { + if (!BITSET_TEST(livein[block + 1], i)) { + reg_pressure_in[block + 1] += v->alloc.sizes[i]; + BITSET_SET(livein[block + 1], i); + } + + BITSET_SET(liveout[block], i); + } + } + } + + int payload_last_use_ip[hw_reg_count]; + v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip); + + for (int i = 0; i < hw_reg_count; i++) { + if (payload_last_use_ip[i] == -1) + continue; + + for (int block = 0; block < cfg->num_blocks; block++) { + if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i]) + reg_pressure_in[block]++; + + if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i]) + BITSET_SET(hw_liveout[block], i); + } } } @@ -524,18 +657,24 @@ fs_instruction_scheduler::update_register_pressure(backend_instruction *be) { fs_inst *inst = (fs_inst *)be; - if (!remaining_grf_uses) + if (!reads_remaining) return; if (inst->dst.file == GRF) { - remaining_grf_uses[inst->dst.reg]--; - grf_active[inst->dst.reg] = true; + written[inst->dst.reg] = true; } for (int i = 0; i < inst->sources; i++) { + if (is_src_duplicate(inst, i)) + continue; + if (inst->src[i].file == GRF) { - remaining_grf_uses[inst->src[i].reg]--; - grf_active[inst->src[i].reg] = true; + reads_remaining[inst->src[i].reg]--; + } else if (inst->src[i].file == HW_REG && + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE && + inst->src[i].fixed_hw_reg.nr < hw_reg_count) { + for (int off = 0; off < inst->regs_read(i); off++) + hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + off]--; } } } @@ -547,20 +686,31 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be) int benefit = 0; if (inst->dst.file == GRF) { - if (remaining_grf_uses[inst->dst.reg] == 1) - benefit += v->alloc.sizes[inst->dst.reg]; - if (!grf_active[inst->dst.reg]) + if (!BITSET_TEST(livein[block_idx], inst->dst.reg) && + !written[inst->dst.reg]) benefit -= v->alloc.sizes[inst->dst.reg]; } for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file != GRF) + if (is_src_duplicate(inst, i)) continue; - if (remaining_grf_uses[inst->src[i].reg] == 1) + if (inst->src[i].file == GRF && + !BITSET_TEST(liveout[block_idx], inst->src[i].reg) && + reads_remaining[inst->src[i].reg] == 1) benefit += v->alloc.sizes[inst->src[i].reg]; - if (!grf_active[inst->src[i].reg]) - benefit -= v->alloc.sizes[inst->src[i].reg]; + + if (inst->src[i].file == HW_REG && + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE && + inst->src[i].fixed_hw_reg.nr < hw_reg_count) { + for (int off = 0; off < inst->regs_read(i); off++) { + int reg = inst->src[i].fixed_hw_reg.nr + off; + if 
(!BITSET_TEST(hw_liveout[block_idx], reg) && + hw_reads_remaining[reg] == 1) { + benefit++; + } + } + } } return benefit; @@ -575,20 +725,26 @@ public: int issue_time(backend_instruction *inst); vec4_visitor *v; - void count_remaining_grf_uses(backend_instruction *inst); + void count_reads_remaining(backend_instruction *inst); + void setup_liveness(cfg_t *cfg); void update_register_pressure(backend_instruction *inst); int get_register_pressure_benefit(backend_instruction *inst); }; vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v, int grf_count) - : instruction_scheduler(v, grf_count, SCHEDULE_POST), + : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST), v(v) { } void -vec4_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be) +vec4_instruction_scheduler::count_reads_remaining(backend_instruction *be) +{ +} + +void +vec4_instruction_scheduler::setup_liveness(cfg_t *cfg) { } @@ -822,7 +978,7 @@ fs_instruction_scheduler::calculate_deps() inst->src[i].file != IMM && inst->src[i].file != UNIFORM && (inst->src[i].file != HW_REG || - inst->src[i].fixed_hw_reg.file != IMM)) { + inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) { assert(inst->src[i].file != MRF); add_barrier_deps(n); } @@ -927,10 +1083,10 @@ fs_instruction_scheduler::calculate_deps() if (inst->src[i].file == GRF) { if (post_reg_alloc) { for (int r = 0; r < inst->regs_read(i); r++) - add_dep(n, last_grf_write[inst->src[i].reg + r]); + add_dep(n, last_grf_write[inst->src[i].reg + r], 0); } else { for (int r = 0; r < inst->regs_read(i); r++) { - add_dep(n, last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r]); + add_dep(n, last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r], 0); } } } else if (inst->src[i].file == HW_REG && @@ -941,17 +1097,17 @@ fs_instruction_scheduler::calculate_deps() if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0) size = 1; for (int r = 0; r < size; r++) - add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]); + add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r], 0); } else { - add_dep(n, last_fixed_grf_write); + add_dep(n, last_fixed_grf_write, 0); } } else if (inst->src[i].is_accumulator()) { - add_dep(n, last_accumulator_write); + add_dep(n, last_accumulator_write, 0); } else if (inst->src[i].file != BAD_FILE && inst->src[i].file != IMM && inst->src[i].file != UNIFORM && (inst->src[i].file != HW_REG || - inst->src[i].fixed_hw_reg.file != IMM)) { + inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) { assert(inst->src[i].file != MRF); add_barrier_deps(n); } @@ -1080,7 +1236,7 @@ vec4_instruction_scheduler::calculate_deps() inst->src[i].file != IMM && inst->src[i].file != UNIFORM && (inst->src[i].file != HW_REG || - inst->src[i].fixed_hw_reg.file != IMM)) { + inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) { /* No reads from MRF, and ATTR is already translated away */ assert(inst->src[i].file != MRF && inst->src[i].file != ATTR); @@ -1177,7 +1333,7 @@ vec4_instruction_scheduler::calculate_deps() inst->src[i].file != IMM && inst->src[i].file != UNIFORM && (inst->src[i].file != HW_REG || - inst->src[i].fixed_hw_reg.file != IMM)) { + inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) { assert(inst->src[i].file != MRF && inst->src[i].file != ATTR); add_barrier_deps(n); @@ -1387,6 +1543,9 @@ instruction_scheduler::schedule_instructions(bblock_t *block) const struct brw_device_info *devinfo = bs->devinfo; backend_instruction *inst = block->end(); time = 0; + if (!post_reg_alloc) + 
reg_pressure = reg_pressure_in[block->num]; + block_idx = block->num; /* Remove non-DAG heads from the list. */ foreach_in_list_safe(schedule_node, n, &instructions) { @@ -1403,23 +1562,30 @@ instruction_scheduler::schedule_instructions(bblock_t *block) chosen->remove(); inst->insert_before(block, chosen->inst); instructions_to_schedule--; - update_register_pressure(chosen->inst); - /* Update the clock for how soon an instruction could start after the - * chosen one. - */ - time += issue_time(chosen->inst); + if (!post_reg_alloc) { + reg_pressure -= get_register_pressure_benefit(chosen->inst); + update_register_pressure(chosen->inst); + } /* If we expected a delay for scheduling, then bump the clock to reflect - * that as well. In reality, the hardware will switch to another - * hyperthread and may not return to dispatching our thread for a while - * even after we're unblocked. + * that. In reality, the hardware will switch to another hyperthread + * and may not return to dispatching our thread for a while even after + * we're unblocked. After this, we have the time when the chosen + * instruction will start executing. */ time = MAX2(time, chosen->unblocked_time); + /* Update the clock for how soon an instruction could start after the + * chosen one. + */ + time += issue_time(chosen->inst); + if (debug) { fprintf(stderr, "clock %4d, scheduled: ", time); bs->dump_instruction(chosen->inst); + if (!post_reg_alloc) + fprintf(stderr, "(register pressure %d)\n", reg_pressure); } /* Now that we've scheduled a new instruction, some of its @@ -1466,30 +1632,53 @@ instruction_scheduler::schedule_instructions(bblock_t *block) if (block->end()->opcode == BRW_OPCODE_NOP) block->end()->remove(block); assert(instructions_to_schedule == 0); + + block->cycle_count = time; +} + +static unsigned get_cycle_count(cfg_t *cfg) +{ + unsigned count = 0, multiplier = 1; + foreach_block(block, cfg) { + if (block->start()->opcode == BRW_OPCODE_DO) + multiplier *= 10; /* assume that loops execute ~10 times */ + + count += block->cycle_count * multiplier; + + if (block->end()->opcode == BRW_OPCODE_WHILE) + multiplier /= 10; + } + + return count; } void instruction_scheduler::run(cfg_t *cfg) { - if (debug) { + if (debug && !post_reg_alloc) { fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n", post_reg_alloc); - bs->dump_instructions(); + bs->dump_instructions(); } - /* Populate the remaining GRF uses array to improve the pre-regalloc - * scheduling. 
- */ - if (remaining_grf_uses) { - foreach_block_and_inst(block, backend_instruction, inst, cfg) { - count_remaining_grf_uses(inst); - } - } + if (!post_reg_alloc) + setup_liveness(cfg); foreach_block(block, cfg) { if (block->end_ip - block->start_ip <= 1) continue; + if (reads_remaining) { + memset(reads_remaining, 0, + grf_count * sizeof(*reads_remaining)); + memset(hw_reads_remaining, 0, + hw_reg_count * sizeof(*hw_reads_remaining)); + memset(written, 0, grf_count * sizeof(*written)); + + foreach_inst_in_block(fs_inst, inst, block) + count_reads_remaining(inst); + } + add_insts_from_block(block); calculate_deps(); @@ -1501,23 +1690,29 @@ instruction_scheduler::run(cfg_t *cfg) schedule_instructions(block); } - if (debug) { + if (debug && !post_reg_alloc) { fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n", post_reg_alloc); bs->dump_instructions(); } + + cfg->cycle_count = get_cycle_count(cfg); } void fs_visitor::schedule_instructions(instruction_scheduler_mode mode) { + if (mode != SCHEDULE_POST) + calculate_live_intervals(); + int grf_count; if (mode == SCHEDULE_POST) grf_count = grf_used; else grf_count = alloc.count; - fs_instruction_scheduler sched(this, grf_count, mode); + fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf, + cfg->num_blocks, mode); sched.run(cfg); if (unlikely(debug_enabled) && mode == SCHEDULE_POST) { diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index e48f559afa7..063cb84a958 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -298,6 +298,8 @@ brw_instruction_name(enum opcode op) return "fb_write"; case FS_OPCODE_FB_WRITE_LOGICAL: return "fb_write_logical"; + case FS_OPCODE_PACK_STENCIL_REF: + return "pack_stencil_ref"; case FS_OPCODE_BLORP_FB_WRITE: return "blorp_fb_write"; case FS_OPCODE_REP_FB_WRITE: @@ -988,6 +990,20 @@ backend_instruction::has_side_effects() const } } +bool +backend_instruction::is_volatile() const +{ + switch (opcode) { + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_READ: + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + return true; + default: + return false; + } +} + #ifndef NDEBUG static bool inst_is_in_block(const bblock_t *block, const backend_instruction *inst) @@ -1178,9 +1194,9 @@ brw_assign_common_binding_table_offsets(gl_shader_stage stage, stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0; } - if (shader_prog && shader_prog->NumAtomicBuffers) { + if (shader && shader->NumAtomicBuffers) { stage_prog_data->binding_table.abo_start = next_binding_table_offset; - next_binding_table_offset += shader_prog->NumAtomicBuffers; + next_binding_table_offset += shader->NumAtomicBuffers; } else { stage_prog_data->binding_table.abo_start = 0xd0d0d0d0; } diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h index 8899b30c1ae..f4647cca4f9 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.h +++ b/src/mesa/drivers/dri/i965/brw_shader.h @@ -115,6 +115,12 @@ struct backend_instruction : public exec_node { * optimize these out unless you know what you are doing. */ bool has_side_effects() const; + + /** + * True if the instruction might be affected by side effects of other + * instructions. 
+ */ + bool is_volatile() const; #else struct backend_instruction { struct exec_node link; diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h index dc2b9415673..2aa1248fea6 100644 --- a/src/mesa/drivers/dri/i965/brw_state.h +++ b/src/mesa/drivers/dri/i965/brw_state.h @@ -49,6 +49,7 @@ extern const struct brw_tracked_state brw_clip_unit; extern const struct brw_tracked_state brw_vs_pull_constants; extern const struct brw_tracked_state brw_gs_pull_constants; extern const struct brw_tracked_state brw_wm_pull_constants; +extern const struct brw_tracked_state brw_cs_pull_constants; extern const struct brw_tracked_state brw_constant_buffer; extern const struct brw_tracked_state brw_curbe_offsets; extern const struct brw_tracked_state brw_invariant_state; @@ -220,7 +221,7 @@ bool brw_search_cache(struct brw_cache *cache, enum brw_cache_id cache_id, const void *key, GLuint key_size, - uint32_t *inout_offset, void *out_aux); + uint32_t *inout_offset, void *inout_aux); void brw_state_cache_check_size( struct brw_context *brw ); void brw_init_caches( struct brw_context *brw ); @@ -345,7 +346,8 @@ calculate_attr_overrides(const struct brw_context *brw, uint16_t *attr_overrides, uint32_t *point_sprite_enables, uint32_t *flat_enables, - uint32_t *urb_entry_read_length); + uint32_t *urb_entry_read_length, + uint32_t *urb_entry_read_offset); /* gen6_surface_state.c */ void gen6_init_vtable_surface_functions(struct brw_context *brw); diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c index 2fbcd146750..f7c0a2037d9 100644 --- a/src/mesa/drivers/dri/i965/brw_state_cache.c +++ b/src/mesa/drivers/dri/i965/brw_state_cache.c @@ -137,7 +137,7 @@ bool brw_search_cache(struct brw_cache *cache, enum brw_cache_id cache_id, const void *key, GLuint key_size, - uint32_t *inout_offset, void *out_aux) + uint32_t *inout_offset, void *inout_aux) { struct brw_context *brw = cache->brw; struct brw_cache_item *item; @@ -155,11 +155,12 @@ brw_search_cache(struct brw_cache *cache, if (item == NULL) return false; - *(void **)out_aux = ((char *)item->key + item->key_size); + void *aux = ((char *) item->key) + item->key_size; - if (item->offset != *inout_offset) { + if (item->offset != *inout_offset || aux != *((void **) inout_aux)) { brw->ctx.NewDriverState |= (1 << cache_id); *inout_offset = item->offset; + *((void **) inout_aux) = aux; } return true; @@ -349,11 +350,6 @@ brw_init_caches(struct brw_context *brw) 4096, 64); if (brw->has_llc) drm_intel_gem_bo_map_unsynchronized(cache->bo); - - cache->aux_free[BRW_CACHE_VS_PROG] = brw_stage_prog_data_free; - cache->aux_free[BRW_CACHE_GS_PROG] = brw_stage_prog_data_free; - cache->aux_free[BRW_CACHE_FS_PROG] = brw_stage_prog_data_free; - cache->aux_free[BRW_CACHE_CS_PROG] = brw_stage_prog_data_free; } static void @@ -367,9 +363,12 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache) for (i = 0; i < cache->size; i++) { for (c = cache->items[i]; c; c = next) { next = c->next; - if (cache->aux_free[c->cache_id]) { + if (c->cache_id == BRW_CACHE_VS_PROG || + c->cache_id == BRW_CACHE_GS_PROG || + c->cache_id == BRW_CACHE_FS_PROG || + c->cache_id == BRW_CACHE_CS_PROG) { const void *item_aux = c->key + c->key_size; - cache->aux_free[c->cache_id](item_aux); + brw_stage_prog_data_free(item_aux); } free((void *)c->key); free(c); diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index 79b8301954e..0344b8a7fb0 100644 --- 
a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -259,6 +259,7 @@ static const struct brw_tracked_state *gen7_compute_atoms[] = &brw_state_base_address, &brw_cs_image_surfaces, &gen7_cs_push_constants, + &brw_cs_pull_constants, &brw_cs_ubo_surfaces, &brw_cs_abo_surfaces, &brw_texture_surfaces, @@ -353,6 +354,7 @@ static const struct brw_tracked_state *gen8_compute_atoms[] = &gen8_state_base_address, &brw_cs_image_surfaces, &gen7_cs_push_constants, + &brw_cs_pull_constants, &brw_cs_ubo_surfaces, &brw_cs_abo_surfaces, &brw_texture_surfaces, diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 3e7078d0b32..01eb1580953 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -1370,9 +1370,10 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) vec4_instruction *inst = (vec4_instruction *)be_inst; if (inst->predicate) { - fprintf(file, "(%cf0.%d) ", + fprintf(file, "(%cf0.%d%s) ", inst->predicate_inverse ? '-' : '+', - inst->flag_subreg); + inst->flag_subreg, + pred_ctrl_align16[inst->predicate]); } fprintf(file, "%s", brw_instruction_name(inst->opcode)); @@ -1426,9 +1427,10 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) case BAD_FILE: fprintf(file, "(null)"); break; - default: - fprintf(file, "???"); - break; + case IMM: + case ATTR: + case UNIFORM: + unreachable("not reached"); } if (inst->dst.writemask != WRITEMASK_XYZW) { fprintf(file, "."); @@ -1520,9 +1522,8 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) case BAD_FILE: fprintf(file, "(null)"); break; - default: - fprintf(file, "???"); - break; + case MRF: + unreachable("not reached"); } /* Don't print .0; and only VGRFs have reg_offsets and sizes */ @@ -1787,13 +1788,100 @@ vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value) emit(MOV(offset, src_reg(index * SHADER_TIME_STRIDE))); time.type = BRW_REGISTER_TYPE_UD; - emit(MOV(time, src_reg(value))); + emit(MOV(time, value)); vec4_instruction *inst = emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst)); inst->mlen = 2; } +void +vec4_visitor::convert_to_hw_regs() +{ + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + for (int i = 0; i < 3; i++) { + struct src_reg &src = inst->src[i]; + struct brw_reg reg; + switch (src.file) { + case GRF: + reg = brw_vec8_grf(src.reg + src.reg_offset, 0); + reg.type = src.type; + reg.dw1.bits.swizzle = src.swizzle; + reg.abs = src.abs; + reg.negate = src.negate; + break; + + case IMM: + reg = brw_imm_reg(src.type); + reg.dw1.ud = src.fixed_hw_reg.dw1.ud; + break; + + case UNIFORM: + reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg + + (src.reg + src.reg_offset) / 2, + ((src.reg + src.reg_offset) % 2) * 4), + 0, 4, 1); + reg.type = src.type; + reg.dw1.bits.swizzle = src.swizzle; + reg.abs = src.abs; + reg.negate = src.negate; + + /* This should have been moved to pull constants. */ + assert(!src.reladdr); + break; + + case HW_REG: + assert(src.type == src.fixed_hw_reg.type); + continue; + + case BAD_FILE: + /* Probably unused. 
*/ + reg = brw_null_reg(); + break; + + case MRF: + case ATTR: + unreachable("not reached"); + } + src.fixed_hw_reg = reg; + } + + dst_reg &dst = inst->dst; + struct brw_reg reg; + + switch (inst->dst.file) { + case GRF: + reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0); + reg.type = dst.type; + reg.dw1.bits.writemask = dst.writemask; + break; + + case MRF: + assert(((dst.reg + dst.reg_offset) & ~(1 << 7)) < BRW_MAX_MRF(devinfo->gen)); + reg = brw_message_reg(dst.reg + dst.reg_offset); + reg.type = dst.type; + reg.dw1.bits.writemask = dst.writemask; + break; + + case HW_REG: + assert(dst.type == dst.fixed_hw_reg.type); + reg = dst.fixed_hw_reg; + break; + + case BAD_FILE: + reg = brw_null_reg(); + break; + + case IMM: + case ATTR: + case UNIFORM: + unreachable("not reached"); + } + + dst.fixed_hw_reg = reg; + } +} + bool vec4_visitor::run() { @@ -1862,6 +1950,7 @@ vec4_visitor::run() OPT(dead_code_eliminate); OPT(dead_control_flow_eliminate, this); OPT(opt_copy_propagation); + OPT(opt_cmod_propagation); OPT(opt_cse); OPT(opt_algebraic); OPT(opt_register_coalesce); @@ -1914,6 +2003,8 @@ vec4_visitor::run() opt_set_dependency_control(); + convert_to_hw_regs(); + if (last_scratch > 0) { prog_data->base.total_scratch = brw_get_scratch_size(last_scratch * REG_SIZE); @@ -2020,9 +2111,9 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, return NULL; } - vec4_generator g(compiler, log_data, &prog_data->base, - mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS"); - assembly = g.generate_assembly(v.cfg, final_assembly_size, shader); + assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, + shader, &prog_data->base, v.cfg, + final_assembly_size); } return assembly; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index d861b2e85df..ec8abf49cd8 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -52,6 +52,15 @@ extern "C" { extern "C" { #endif +const unsigned * +brw_vec4_generate_assembly(const struct brw_compiler *compiler, + void *log_data, + void *mem_ctx, + const nir_shader *nir, + struct brw_vue_prog_data *prog_data, + const struct cfg_t *cfg, + unsigned *out_assembly_size); + #ifdef __cplusplus } /* extern "C" */ @@ -149,6 +158,7 @@ public: int var_range_start(unsigned v, unsigned n) const; int var_range_end(unsigned v, unsigned n) const; bool virtual_grf_interferes(int a, int b); + bool opt_cmod_propagation(); bool opt_copy_propagation(bool do_constant_prop = true); bool opt_cse_local(bblock_t *block); bool opt_cse(); @@ -158,6 +168,7 @@ public: bool is_dep_ctrl_unsafe(const vec4_instruction *inst); void opt_set_dependency_control(); void opt_schedule_instructions(); + void convert_to_hw_regs(); vec4_instruction *emit(vec4_instruction *inst); @@ -381,117 +392,6 @@ private: unsigned last_scratch; /**< measured in 32-byte (register size) units */ }; - -/** - * The vertex shader code generator. - * - * Translates VS IR to actual i965 assembly code. 
- */ -class vec4_generator -{ -public: - vec4_generator(const struct brw_compiler *compiler, void *log_data, - struct brw_vue_prog_data *prog_data, - void *mem_ctx, - bool debug_flag, - const char *stage_name, - const char *stage_abbrev); - ~vec4_generator(); - - const unsigned *generate_assembly(const cfg_t *cfg, unsigned *asm_size, - const nir_shader *nir); - -private: - void generate_code(const cfg_t *cfg, const nir_shader *nir); - - void generate_math1_gen4(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src); - void generate_math2_gen4(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1); - void generate_math_gen6(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1); - - void generate_tex(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src, - struct brw_reg sampler_index); - - void generate_vs_urb_write(vec4_instruction *inst); - void generate_gs_urb_write(vec4_instruction *inst); - void generate_gs_urb_write_allocate(vec4_instruction *inst); - void generate_gs_thread_end(vec4_instruction *inst); - void generate_gs_set_write_offset(struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1); - void generate_gs_set_vertex_count(struct brw_reg dst, - struct brw_reg src); - void generate_gs_svb_write(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1); - void generate_gs_svb_set_destination_index(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src); - void generate_gs_set_dword_2(struct brw_reg dst, struct brw_reg src); - void generate_gs_prepare_channel_masks(struct brw_reg dst); - void generate_gs_set_channel_masks(struct brw_reg dst, struct brw_reg src); - void generate_gs_get_instance_id(struct brw_reg dst); - void generate_gs_ff_sync_set_primitives(struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1, - struct brw_reg src2); - void generate_gs_ff_sync(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1); - void generate_gs_set_primitive_id(struct brw_reg dst); - void generate_oword_dual_block_offsets(struct brw_reg m1, - struct brw_reg index); - void generate_scratch_write(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src, - struct brw_reg index); - void generate_scratch_read(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg index); - void generate_pull_constant_load(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg index, - struct brw_reg offset); - void generate_pull_constant_load_gen7(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg surf_index, - struct brw_reg offset); - void generate_set_simd4x2_header_gen9(vec4_instruction *inst, - struct brw_reg dst); - - void generate_get_buffer_size(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src, - struct brw_reg index); - - void generate_unpack_flags(struct brw_reg dst); - - const struct brw_compiler *compiler; - void *log_data; /* Passed to compiler->*_log functions */ - - const struct brw_device_info *devinfo; - - struct brw_codegen *p; - - struct brw_vue_prog_data *prog_data; - - void *mem_ctx; - const char *stage_name; - const char *stage_abbrev; - const bool debug_flag; -}; - } /* namespace brw */ #endif /* __cplusplus */ diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp new file mode 100644 index 00000000000..329f24269ce --- /dev/null +++ 
b/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp @@ -0,0 +1,157 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +/** @file brw_vec4_cmod_propagation.cpp + * + * Really similar to brw_fs_cmod_propagation but adapted to vec4 needs. Check + * brw_fs_cmod_propagation for further details on the rationale behind this + * optimization. + */ + +#include "brw_vec4.h" +#include "brw_cfg.h" + +namespace brw { + +static bool +opt_cmod_propagation_local(bblock_t *block) +{ + bool progress = false; + int ip = block->end_ip + 1; + + foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) { + ip--; + + if ((inst->opcode != BRW_OPCODE_AND && + inst->opcode != BRW_OPCODE_CMP && + inst->opcode != BRW_OPCODE_MOV) || + inst->predicate != BRW_PREDICATE_NONE || + !inst->dst.is_null() || + inst->src[0].file != GRF || + inst->src[0].abs) + continue; + + if (inst->opcode == BRW_OPCODE_AND && + !(inst->src[1].is_one() && + inst->conditional_mod == BRW_CONDITIONAL_NZ && + !inst->src[0].negate)) + continue; + + if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) + continue; + + if (inst->opcode == BRW_OPCODE_MOV && + inst->conditional_mod != BRW_CONDITIONAL_NZ) + continue; + + bool read_flag = false; + foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, inst) { + if (inst->src[0].in_range(scan_inst->dst, + scan_inst->regs_written)) { + if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) || + scan_inst->dst.reg_offset != inst->src[0].reg_offset || + (scan_inst->dst.writemask != WRITEMASK_X && + scan_inst->dst.writemask != WRITEMASK_XYZW) || + (scan_inst->dst.writemask == WRITEMASK_XYZW && + inst->src[0].swizzle != BRW_SWIZZLE_XYZW) || + (inst->dst.writemask & ~scan_inst->dst.writemask) != 0) { + break; + } + + /* CMP's result is the same regardless of dest type. */ + if (inst->conditional_mod == BRW_CONDITIONAL_NZ && + scan_inst->opcode == BRW_OPCODE_CMP && + (inst->dst.type == BRW_REGISTER_TYPE_D || + inst->dst.type == BRW_REGISTER_TYPE_UD)) { + inst->remove(block); + progress = true; + break; + } + + /* If the AND wasn't handled by the previous case, it isn't safe + * to remove it. 
+ */ + if (inst->opcode == BRW_OPCODE_AND) + break; + + /* Comparisons operate differently for ints and floats */ + if (scan_inst->dst.type != inst->dst.type && + (scan_inst->dst.type == BRW_REGISTER_TYPE_F || + inst->dst.type == BRW_REGISTER_TYPE_F)) + break; + + /* If the instruction generating inst's source also wrote the + * flag, and inst is doing a simple .nz comparison, then inst + * is redundant - the appropriate value is already in the flag + * register. Delete inst. + */ + if (inst->conditional_mod == BRW_CONDITIONAL_NZ && + !inst->src[0].negate && + scan_inst->writes_flag()) { + inst->remove(block); + progress = true; + break; + } + + /* Otherwise, try propagating the conditional. */ + enum brw_conditional_mod cond = + inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod) + : inst->conditional_mod; + + if (scan_inst->can_do_cmod() && + ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) || + scan_inst->conditional_mod == cond)) { + scan_inst->conditional_mod = cond; + inst->remove(block); + progress = true; + } + break; + } + + if (scan_inst->writes_flag()) + break; + + read_flag = read_flag || scan_inst->reads_flag(); + } + } + + return progress; +} + +bool +vec4_visitor::opt_cmod_propagation() +{ + bool progress = false; + + foreach_block_reverse(block, cfg) { + progress = opt_cmod_propagation_local(block) || progress; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +} /* namespace brw */ diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp index 8fc7a365bfc..284e0a8d0a5 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp @@ -78,13 +78,19 @@ vec4_visitor::dead_code_eliminate() sizeof(BITSET_WORD)); foreach_inst_in_block_reverse(vec4_instruction, inst, block) { - if (inst->dst.file == GRF && !inst->has_side_effects()) { + if ((inst->dst.file == GRF && !inst->has_side_effects()) || + (inst->dst.is_null() && inst->writes_flag())){ bool result_live[4] = { false }; - for (unsigned i = 0; i < inst->regs_written; i++) { - for (int c = 0; c < 4; c++) - result_live[c] |= BITSET_TEST( - live, var_from_reg(alloc, offset(inst->dst, i), c)); + if (inst->dst.file == GRF) { + for (unsigned i = 0; i < inst->regs_written; i++) { + for (int c = 0; c < 4; c++) + result_live[c] |= BITSET_TEST( + live, var_from_reg(alloc, offset(inst->dst, i), c)); + } + } else { + for (unsigned c = 0; c < 4; c++) + result_live[c] = BITSET_TEST(flag_live, c); } /* If the instruction can't do writemasking, then it's all or @@ -117,7 +123,11 @@ vec4_visitor::dead_code_eliminate() } if (inst->dst.is_null() && inst->writes_flag()) { - if (!BITSET_TEST(flag_live, 0)) { + bool combined_live = false; + for (unsigned c = 0; c < 4; c++) + combined_live |= BITSET_TEST(flag_live, c); + + if (!combined_live) { inst->opcode = BRW_OPCODE_NOP; progress = true; continue; @@ -136,7 +146,8 @@ vec4_visitor::dead_code_eliminate() } if (inst->writes_flag()) { - BITSET_CLEAR(flag_live, 0); + for (unsigned c = 0; c < 4; c++) + BITSET_CLEAR(flag_live, c); } for (int i = 0; i < 3; i++) { @@ -150,8 +161,10 @@ vec4_visitor::dead_code_eliminate() } } - if (inst->reads_flag()) { - BITSET_SET(flag_live, 0); + for (unsigned c = 0; c < 4; c++) { + if (inst->reads_flag(c)) { + BITSET_SET(flag_live, c); + } } } } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index 
a84f6c47471..8bc21df5ffc 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -20,146 +20,17 @@ * IN THE SOFTWARE. */ -#include <ctype.h> #include "glsl/glsl_parser_extras.h" #include "brw_vec4.h" #include "brw_cfg.h" -extern "C" { -#include "brw_eu.h" -#include "main/macros.h" -#include "program/prog_print.h" -#include "program/prog_parameter.h" -}; +using namespace brw; -namespace brw { - -struct brw_reg -vec4_instruction::get_dst(unsigned gen) -{ - struct brw_reg brw_reg; - - switch (dst.file) { - case GRF: - brw_reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0); - brw_reg = retype(brw_reg, dst.type); - brw_reg.dw1.bits.writemask = dst.writemask; - break; - - case MRF: - assert(((dst.reg + dst.reg_offset) & ~(1 << 7)) < BRW_MAX_MRF(gen)); - brw_reg = brw_message_reg(dst.reg + dst.reg_offset); - brw_reg = retype(brw_reg, dst.type); - brw_reg.dw1.bits.writemask = dst.writemask; - break; - - case HW_REG: - assert(dst.type == dst.fixed_hw_reg.type); - brw_reg = dst.fixed_hw_reg; - break; - - case BAD_FILE: - brw_reg = brw_null_reg(); - break; - - default: - unreachable("not reached"); - } - return brw_reg; -} - -struct brw_reg -vec4_instruction::get_src(const struct brw_vue_prog_data *prog_data, int i) -{ - struct brw_reg brw_reg; - - switch (src[i].file) { - case GRF: - brw_reg = brw_vec8_grf(src[i].reg + src[i].reg_offset, 0); - brw_reg = retype(brw_reg, src[i].type); - brw_reg.dw1.bits.swizzle = src[i].swizzle; - if (src[i].abs) - brw_reg = brw_abs(brw_reg); - if (src[i].negate) - brw_reg = negate(brw_reg); - break; - - case IMM: - switch (src[i].type) { - case BRW_REGISTER_TYPE_F: - brw_reg = brw_imm_f(src[i].fixed_hw_reg.dw1.f); - break; - case BRW_REGISTER_TYPE_D: - brw_reg = brw_imm_d(src[i].fixed_hw_reg.dw1.d); - break; - case BRW_REGISTER_TYPE_UD: - brw_reg = brw_imm_ud(src[i].fixed_hw_reg.dw1.ud); - break; - case BRW_REGISTER_TYPE_VF: - brw_reg = brw_imm_vf(src[i].fixed_hw_reg.dw1.ud); - break; - default: - unreachable("not reached"); - } - break; - - case UNIFORM: - brw_reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg + - (src[i].reg + src[i].reg_offset) / 2, - ((src[i].reg + src[i].reg_offset) % 2) * 4), - 0, 4, 1); - brw_reg = retype(brw_reg, src[i].type); - brw_reg.dw1.bits.swizzle = src[i].swizzle; - if (src[i].abs) - brw_reg = brw_abs(brw_reg); - if (src[i].negate) - brw_reg = negate(brw_reg); - - /* This should have been moved to pull constants. */ - assert(!src[i].reladdr); - break; - - case HW_REG: - assert(src[i].type == src[i].fixed_hw_reg.type); - brw_reg = src[i].fixed_hw_reg; - break; - - case BAD_FILE: - /* Probably unused. 
*/ - brw_reg = brw_null_reg(); - break; - case ATTR: - default: - unreachable("not reached"); - } - - return brw_reg; -} - -vec4_generator::vec4_generator(const struct brw_compiler *compiler, - void *log_data, - struct brw_vue_prog_data *prog_data, - void *mem_ctx, - bool debug_flag, - const char *stage_name, - const char *stage_abbrev) - : compiler(compiler), log_data(log_data), devinfo(compiler->devinfo), - prog_data(prog_data), - mem_ctx(mem_ctx), stage_name(stage_name), stage_abbrev(stage_abbrev), - debug_flag(debug_flag) -{ - p = rzalloc(mem_ctx, struct brw_codegen); - brw_init_codegen(devinfo, p, mem_ctx); -} - -vec4_generator::~vec4_generator() -{ -} - -void -vec4_generator::generate_math1_gen4(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src) +static void +generate_math1_gen4(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src) { gen4_math(p, dst, @@ -178,11 +49,12 @@ check_gen6_math_src_arg(struct brw_reg src) assert(src.dw1.bits.swizzle == BRW_SWIZZLE_XYZW); } -void -vec4_generator::generate_math_gen6(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1) +static void +generate_math_gen6(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1) { /* Can't do writemask because math can't be align16. */ assert(dst.dw1.bits.writemask == WRITEMASK_XYZW); @@ -196,11 +68,12 @@ vec4_generator::generate_math_gen6(vec4_instruction *inst, brw_set_default_access_mode(p, BRW_ALIGN_16); } -void -vec4_generator::generate_math2_gen4(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1) +static void +generate_math2_gen4(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1) { /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13 * "Message Payload": @@ -229,12 +102,15 @@ vec4_generator::generate_math2_gen4(vec4_instruction *inst, BRW_MATH_PRECISION_FULL); } -void -vec4_generator::generate_tex(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src, - struct brw_reg sampler_index) +static void +generate_tex(struct brw_codegen *p, + struct brw_vue_prog_data *prog_data, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg sampler_index) { + const struct brw_device_info *devinfo = p->devinfo; int msg_type = -1; if (devinfo->gen >= 5) { @@ -440,8 +316,8 @@ vec4_generator::generate_tex(vec4_instruction *inst, } } -void -vec4_generator::generate_vs_urb_write(vec4_instruction *inst) +static void +generate_vs_urb_write(struct brw_codegen *p, vec4_instruction *inst) { brw_urb_WRITE(p, brw_null_reg(), /* dest */ @@ -454,8 +330,8 @@ vec4_generator::generate_vs_urb_write(vec4_instruction *inst) BRW_URB_SWIZZLE_INTERLEAVE); } -void -vec4_generator::generate_gs_urb_write(vec4_instruction *inst) +static void +generate_gs_urb_write(struct brw_codegen *p, vec4_instruction *inst) { struct brw_reg src = brw_message_reg(inst->base_mrf); brw_urb_WRITE(p, @@ -469,14 +345,14 @@ vec4_generator::generate_gs_urb_write(vec4_instruction *inst) BRW_URB_SWIZZLE_INTERLEAVE); } -void -vec4_generator::generate_gs_urb_write_allocate(vec4_instruction *inst) +static void +generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst) { struct brw_reg src = brw_message_reg(inst->base_mrf); /* We pass the temporary passed in src0 as the writeback register */ brw_urb_WRITE(p, - inst->get_src(this->prog_data, 0), /* dest 
*/ + inst->src[0].fixed_hw_reg, /* dest */ inst->base_mrf, /* starting mrf reg nr */ src, BRW_URB_WRITE_ALLOCATE_COMPLETE, @@ -489,14 +365,13 @@ vec4_generator::generate_gs_urb_write_allocate(vec4_instruction *inst) brw_push_insn_state(p); brw_set_default_access_mode(p, BRW_ALIGN_1); brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, get_element_ud(inst->get_dst(devinfo->gen), 0), - get_element_ud(inst->get_src(this->prog_data, 0), 0)); - brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_MOV(p, get_element_ud(inst->dst.fixed_hw_reg, 0), + get_element_ud(inst->src[0].fixed_hw_reg, 0)); brw_pop_insn_state(p); } -void -vec4_generator::generate_gs_thread_end(vec4_instruction *inst) +static void +generate_gs_thread_end(struct brw_codegen *p, vec4_instruction *inst) { struct brw_reg src = brw_message_reg(inst->base_mrf); brw_urb_WRITE(p, @@ -510,10 +385,11 @@ vec4_generator::generate_gs_thread_end(vec4_instruction *inst) BRW_URB_SWIZZLE_INTERLEAVE); } -void -vec4_generator::generate_gs_set_write_offset(struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1) +static void +generate_gs_set_write_offset(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1) { /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message * Header: M0.3): @@ -536,29 +412,29 @@ vec4_generator::generate_gs_set_write_offset(struct brw_reg dst, brw_push_insn_state(p); brw_set_default_access_mode(p, BRW_ALIGN_1); brw_set_default_mask_control(p, BRW_MASK_DISABLE); - assert(devinfo->gen >= 7 && + assert(p->devinfo->gen >= 7 && src1.file == BRW_IMMEDIATE_VALUE && src1.type == BRW_REGISTER_TYPE_UD && src1.dw1.ud <= USHRT_MAX); - if (src0.file == IMM) { + if (src0.file == BRW_IMMEDIATE_VALUE) { brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3), brw_imm_ud(src0.dw1.ud * src1.dw1.ud)); } else { brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4), retype(src1, BRW_REGISTER_TYPE_UW)); } - brw_set_default_access_mode(p, BRW_ALIGN_16); brw_pop_insn_state(p); } -void -vec4_generator::generate_gs_set_vertex_count(struct brw_reg dst, - struct brw_reg src) +static void +generate_gs_set_vertex_count(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src) { brw_push_insn_state(p); brw_set_default_mask_control(p, BRW_MASK_DISABLE); - if (devinfo->gen >= 8) { + if (p->devinfo->gen >= 8) { /* Move the vertex count into the second MRF for the EOT write. 
*/ brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD), src); @@ -580,16 +456,17 @@ vec4_generator::generate_gs_set_vertex_count(struct brw_reg dst, brw_MOV(p, suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4), stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0)); - brw_set_default_access_mode(p, BRW_ALIGN_16); } brw_pop_insn_state(p); } -void -vec4_generator::generate_gs_svb_write(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1) +static void +generate_gs_svb_write(struct brw_codegen *p, + struct brw_vue_prog_data *prog_data, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1) { int binding = inst->sol_binding; bool final_write = inst->sol_final_write; @@ -623,12 +500,12 @@ vec4_generator::generate_gs_svb_write(vec4_instruction *inst, brw_pop_insn_state(p); } -void -vec4_generator::generate_gs_svb_set_destination_index(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src) +static void +generate_gs_svb_set_destination_index(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src) { - int vertex = inst->sol_vertex; brw_push_insn_state(p); brw_set_default_access_mode(p, BRW_ALIGN_1); @@ -637,8 +514,10 @@ vec4_generator::generate_gs_svb_set_destination_index(vec4_instruction *inst, brw_pop_insn_state(p); } -void -vec4_generator::generate_gs_set_dword_2(struct brw_reg dst, struct brw_reg src) +static void +generate_gs_set_dword_2(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src) { brw_push_insn_state(p); brw_set_default_access_mode(p, BRW_ALIGN_1); @@ -647,8 +526,9 @@ vec4_generator::generate_gs_set_dword_2(struct brw_reg dst, struct brw_reg src) brw_pop_insn_state(p); } -void -vec4_generator::generate_gs_prepare_channel_masks(struct brw_reg dst) +static void +generate_gs_prepare_channel_masks(struct brw_codegen *p, + struct brw_reg dst) { /* We want to left shift just DWORD 4 (the x component belonging to the * second geometry shader invocation) by 4 bits. So generate the @@ -664,9 +544,10 @@ vec4_generator::generate_gs_prepare_channel_masks(struct brw_reg dst) brw_pop_insn_state(p); } -void -vec4_generator::generate_gs_set_channel_masks(struct brw_reg dst, - struct brw_reg src) +static void +generate_gs_set_channel_masks(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src) { /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message * Header: M0.5): @@ -727,8 +608,9 @@ vec4_generator::generate_gs_set_channel_masks(struct brw_reg dst, brw_pop_insn_state(p); } -void -vec4_generator::generate_gs_get_instance_id(struct brw_reg dst) +static void +generate_gs_get_instance_id(struct brw_codegen *p, + struct brw_reg dst) { /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT * and store into dst.0 & dst.4. 
So generate the instruction: @@ -744,11 +626,12 @@ vec4_generator::generate_gs_get_instance_id(struct brw_reg dst) brw_pop_insn_state(p); } -void -vec4_generator::generate_gs_ff_sync_set_primitives(struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1, - struct brw_reg src2) +static void +generate_gs_ff_sync_set_primitives(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1, + struct brw_reg src2) { brw_push_insn_state(p); brw_set_default_access_mode(p, BRW_ALIGN_1); @@ -765,11 +648,12 @@ vec4_generator::generate_gs_ff_sync_set_primitives(struct brw_reg dst, brw_pop_insn_state(p); } -void -vec4_generator::generate_gs_ff_sync(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1) +static void +generate_gs_ff_sync(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1) { /* This opcode uses an implied MRF register for: * - the header of the ff_sync message. And as such it is expected to be @@ -811,8 +695,8 @@ vec4_generator::generate_gs_ff_sync(vec4_instruction *inst, brw_pop_insn_state(p); } -void -vec4_generator::generate_gs_set_primitive_id(struct brw_reg dst) +static void +generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst) { /* In gen6, PrimitiveID is delivered in R0.1 of the payload */ struct brw_reg src = brw_vec8_grf(0, 0); @@ -823,13 +707,14 @@ vec4_generator::generate_gs_set_primitive_id(struct brw_reg dst) brw_pop_insn_state(p); } -void -vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1, - struct brw_reg index) +static void +generate_oword_dual_block_offsets(struct brw_codegen *p, + struct brw_reg m1, + struct brw_reg index) { int second_vertex_offset; - if (devinfo->gen >= 6) + if (p->devinfo->gen >= 6) second_vertex_offset = 1; else second_vertex_offset = 16; @@ -860,8 +745,9 @@ vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1, brw_pop_insn_state(p); } -void -vec4_generator::generate_unpack_flags(struct brw_reg dst) +static void +generate_unpack_flags(struct brw_codegen *p, + struct brw_reg dst) { brw_push_insn_state(p); brw_set_default_mask_control(p, BRW_MASK_DISABLE); @@ -878,16 +764,18 @@ vec4_generator::generate_unpack_flags(struct brw_reg dst) brw_pop_insn_state(p); } -void -vec4_generator::generate_scratch_read(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg index) +static void +generate_scratch_read(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg index) { + const struct brw_device_info *devinfo = p->devinfo; struct brw_reg header = brw_vec8_grf(0, 0); gen6_resolve_implied_move(p, &header, inst->base_mrf); - generate_oword_dual_block_offsets(brw_message_reg(inst->base_mrf + 1), + generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1), index); uint32_t msg_type; @@ -906,7 +794,7 @@ vec4_generator::generate_scratch_read(vec4_instruction *inst, brw_set_dest(p, send, dst); brw_set_src0(p, send, header); if (devinfo->gen < 6) - brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf); + brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf); brw_set_dp_read_message(p, send, 255, /* binding table index: stateless access */ BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, @@ -917,12 +805,14 @@ vec4_generator::generate_scratch_read(vec4_instruction *inst, 1 /* rlen */); } -void -vec4_generator::generate_scratch_write(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src, - struct brw_reg index) 
+static void +generate_scratch_write(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg index) { + const struct brw_device_info *devinfo = p->devinfo; struct brw_reg header = brw_vec8_grf(0, 0); bool write_commit; @@ -933,7 +823,7 @@ vec4_generator::generate_scratch_write(vec4_instruction *inst, gen6_resolve_implied_move(p, &header, inst->base_mrf); - generate_oword_dual_block_offsets(brw_message_reg(inst->base_mrf + 1), + generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1), index); brw_MOV(p, @@ -990,12 +880,15 @@ vec4_generator::generate_scratch_write(vec4_instruction *inst, write_commit); } -void -vec4_generator::generate_pull_constant_load(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg index, - struct brw_reg offset) +static void +generate_pull_constant_load(struct brw_codegen *p, + struct brw_vue_prog_data *prog_data, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg index, + struct brw_reg offset) { + const struct brw_device_info *devinfo = p->devinfo; assert(index.file == BRW_IMMEDIATE_VALUE && index.type == BRW_REGISTER_TYPE_UD); uint32_t surf_index = index.dw1.ud; @@ -1036,13 +929,15 @@ vec4_generator::generate_pull_constant_load(vec4_instruction *inst, brw_mark_surface_used(&prog_data->base, surf_index); } -void -vec4_generator::generate_get_buffer_size(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src, - struct brw_reg surf_index) +static void +generate_get_buffer_size(struct brw_codegen *p, + struct brw_vue_prog_data *prog_data, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg surf_index) { - assert(devinfo->gen >= 7); + assert(p->devinfo->gen >= 7); assert(surf_index.type == BRW_REGISTER_TYPE_UD && surf_index.file == BRW_IMMEDIATE_VALUE); @@ -1062,11 +957,13 @@ vec4_generator::generate_get_buffer_size(vec4_instruction *inst, brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud); } -void -vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg surf_index, - struct brw_reg offset) +static void +generate_pull_constant_load_gen7(struct brw_codegen *p, + struct brw_vue_prog_data *prog_data, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg surf_index, + struct brw_reg offset) { assert(surf_index.type == BRW_REGISTER_TYPE_UD); @@ -1123,9 +1020,10 @@ vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst, } } -void -vec4_generator::generate_set_simd4x2_header_gen9(vec4_instruction *inst, - struct brw_reg dst) +static void +generate_set_simd4x2_header_gen9(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst) { brw_push_insn_state(p); brw_set_default_mask_control(p, BRW_MASK_DISABLE); @@ -1140,9 +1038,18 @@ vec4_generator::generate_set_simd4x2_header_gen9(vec4_instruction *inst, brw_pop_insn_state(p); } -void -vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir) +static void +generate_code(struct brw_codegen *p, + const struct brw_compiler *compiler, + void *log_data, + const nir_shader *nir, + struct brw_vue_prog_data *prog_data, + const struct cfg_t *cfg) { + const struct brw_device_info *devinfo = p->devinfo; + const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->stage); + bool debug_flag = INTEL_DEBUG & + intel_debug_flag_for_shader_stage(nir->stage); struct annotation_info annotation; memset(&annotation, 0, sizeof(annotation)); int loop_count = 0; @@ -1154,9 +1061,9 @@ 
vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir) annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset); for (unsigned int i = 0; i < 3; i++) { - src[i] = inst->get_src(this->prog_data, i); + src[i] = inst->src[i].fixed_hw_reg; } - dst = inst->get_dst(devinfo->gen); + dst = inst->dst.fixed_hw_reg; brw_set_default_predicate_control(p, inst->predicate); brw_set_default_predicate_inverse(p, inst->predicate_inverse); @@ -1383,9 +1290,9 @@ vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir) gen6_math(p, dst, brw_math_function(inst->opcode), src[0], brw_null_reg()); } else if (devinfo->gen == 6) { - generate_math_gen6(inst, dst, src[0], brw_null_reg()); + generate_math_gen6(p, inst, dst, src[0], brw_null_reg()); } else { - generate_math1_gen4(inst, dst, src[0]); + generate_math1_gen4(p, inst, dst, src[0]); } break; @@ -1396,9 +1303,9 @@ vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir) if (devinfo->gen >= 7) { gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]); } else if (devinfo->gen == 6) { - generate_math_gen6(inst, dst, src[0], src[1]); + generate_math_gen6(p, inst, dst, src[0], src[1]); } else { - generate_math2_gen4(inst, dst, src[0], src[1]); + generate_math2_gen4(p, inst, dst, src[0], src[1]); } break; @@ -1412,92 +1319,92 @@ vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir) case SHADER_OPCODE_TG4: case SHADER_OPCODE_TG4_OFFSET: case SHADER_OPCODE_SAMPLEINFO: - generate_tex(inst, dst, src[0], src[1]); + generate_tex(p, prog_data, inst, dst, src[0], src[1]); break; case VS_OPCODE_URB_WRITE: - generate_vs_urb_write(inst); + generate_vs_urb_write(p, inst); break; case SHADER_OPCODE_GEN4_SCRATCH_READ: - generate_scratch_read(inst, dst, src[0]); + generate_scratch_read(p, inst, dst, src[0]); break; case SHADER_OPCODE_GEN4_SCRATCH_WRITE: - generate_scratch_write(inst, dst, src[0], src[1]); + generate_scratch_write(p, inst, dst, src[0], src[1]); break; case VS_OPCODE_PULL_CONSTANT_LOAD: - generate_pull_constant_load(inst, dst, src[0], src[1]); + generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]); break; case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: - generate_pull_constant_load_gen7(inst, dst, src[0], src[1]); + generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]); break; case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: - generate_set_simd4x2_header_gen9(inst, dst); + generate_set_simd4x2_header_gen9(p, inst, dst); break; case VS_OPCODE_GET_BUFFER_SIZE: - generate_get_buffer_size(inst, dst, src[0], src[1]); + generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]); break; case GS_OPCODE_URB_WRITE: - generate_gs_urb_write(inst); + generate_gs_urb_write(p, inst); break; case GS_OPCODE_URB_WRITE_ALLOCATE: - generate_gs_urb_write_allocate(inst); + generate_gs_urb_write_allocate(p, inst); break; case GS_OPCODE_SVB_WRITE: - generate_gs_svb_write(inst, dst, src[0], src[1]); + generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]); break; case GS_OPCODE_SVB_SET_DST_INDEX: - generate_gs_svb_set_destination_index(inst, dst, src[0]); + generate_gs_svb_set_destination_index(p, inst, dst, src[0]); break; case GS_OPCODE_THREAD_END: - generate_gs_thread_end(inst); + generate_gs_thread_end(p, inst); break; case GS_OPCODE_SET_WRITE_OFFSET: - generate_gs_set_write_offset(dst, src[0], src[1]); + generate_gs_set_write_offset(p, dst, src[0], src[1]); break; case GS_OPCODE_SET_VERTEX_COUNT: - generate_gs_set_vertex_count(dst, src[0]); + 
generate_gs_set_vertex_count(p, dst, src[0]); break; case GS_OPCODE_FF_SYNC: - generate_gs_ff_sync(inst, dst, src[0], src[1]); + generate_gs_ff_sync(p, inst, dst, src[0], src[1]); break; case GS_OPCODE_FF_SYNC_SET_PRIMITIVES: - generate_gs_ff_sync_set_primitives(dst, src[0], src[1], src[2]); + generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]); break; case GS_OPCODE_SET_PRIMITIVE_ID: - generate_gs_set_primitive_id(dst); + generate_gs_set_primitive_id(p, dst); break; case GS_OPCODE_SET_DWORD_2: - generate_gs_set_dword_2(dst, src[0]); + generate_gs_set_dword_2(p, dst, src[0]); break; case GS_OPCODE_PREPARE_CHANNEL_MASKS: - generate_gs_prepare_channel_masks(dst); + generate_gs_prepare_channel_masks(p, dst); break; case GS_OPCODE_SET_CHANNEL_MASKS: - generate_gs_set_channel_masks(dst, src[0]); + generate_gs_set_channel_masks(p, dst, src[0]); break; case GS_OPCODE_GET_INSTANCE_ID: - generate_gs_get_instance_id(dst); + generate_gs_get_instance_id(p, dst); break; case SHADER_OPCODE_SHADER_TIME_ADD: @@ -1556,7 +1463,7 @@ vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir) break; case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: - generate_unpack_flags(dst); + generate_unpack_flags(p, dst); break; case VEC4_OPCODE_MOV_BYTES: { @@ -1651,10 +1558,10 @@ vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir) nir->info.label ? nir->info.label : "unnamed", _mesa_shader_stage_to_string(nir->stage), nir->info.name); - fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. Compacted %d to %d" - " bytes (%.0f%%)\n", + fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles." + "Compacted %d to %d bytes (%.0f%%)\n", stage_abbrev, - before_size / 16, loop_count, before_size, after_size, + before_size / 16, loop_count, cfg->cycle_count, before_size, after_size, 100.0f * (before_size - after_size) / before_size); dump_assembly(p->store, annotation.ann_count, annotation.ann, @@ -1663,21 +1570,27 @@ vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir) } compiler->shader_debug_log(log_data, - "%s vec4 shader: %d inst, %d loops, " + "%s vec4 shader: %d inst, %d loops, %u cycles, " "compacted %d to %d bytes.\n", - stage_abbrev, before_size / 16, loop_count, + stage_abbrev, before_size / 16, + loop_count, cfg->cycle_count, before_size, after_size); } -const unsigned * -vec4_generator::generate_assembly(const cfg_t *cfg, - unsigned *assembly_size, - const nir_shader *nir) +extern "C" const unsigned * +brw_vec4_generate_assembly(const struct brw_compiler *compiler, + void *log_data, + void *mem_ctx, + const nir_shader *nir, + struct brw_vue_prog_data *prog_data, + const struct cfg_t *cfg, + unsigned *out_assembly_size) { + struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen); + brw_init_codegen(compiler->devinfo, p, mem_ctx); brw_set_default_access_mode(p, BRW_ALIGN_16); - generate_code(cfg, nir); - return brw_get_program(p, assembly_size); -} + generate_code(p, compiler, log_data, nir, prog_data, cfg); -} /* namespace brw */ + return brw_get_program(p, out_assembly_size); +} diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp index 9402489e628..cfb5cd95cb1 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp @@ -768,7 +768,7 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data, output_size_bytes += 32; assert(output_size_bytes >= 1); - int max_output_size_bytes = 
GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES; + unsigned max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES; if (compiler->devinfo->gen == 6) max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES; if (output_size_bytes > max_output_size_bytes) @@ -824,9 +824,9 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data, vec4_gs_visitor v(compiler, log_data, &c, prog_data, shader, mem_ctx, true /* no_spills */, shader_time_index); if (v.run()) { - vec4_generator g(compiler, log_data, &prog_data->base, mem_ctx, - INTEL_DEBUG & DEBUG_GS, "geometry", "GS"); - return g.generate_assembly(v.cfg, final_assembly_size, shader); + return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, + shader, &prog_data->base, v.cfg, + final_assembly_size); } } } @@ -875,9 +875,9 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data, if (error_str) *error_str = ralloc_strdup(mem_ctx, gs->fail_msg); } else { - vec4_generator g(compiler, log_data, &prog_data->base, mem_ctx, - INTEL_DEBUG & DEBUG_GS, "geometry", "GS"); - ret = g.generate_assembly(gs->cfg, final_assembly_size, shader); + ret = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, shader, + &prog_data->base, gs->cfg, + final_assembly_size); } delete gs; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp index 678237901f2..aa9a6572eee 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp @@ -86,9 +86,10 @@ vec4_live_variables::setup_def_use() } } } - if (inst->reads_flag()) { - if (!BITSET_TEST(bd->flag_def, 0)) { - BITSET_SET(bd->flag_use, 0); + for (unsigned c = 0; c < 4; c++) { + if (inst->reads_flag(c) && + !BITSET_TEST(bd->flag_def, c)) { + BITSET_SET(bd->flag_use, c); } } @@ -110,8 +111,11 @@ vec4_live_variables::setup_def_use() } } if (inst->writes_flag()) { - if (!BITSET_TEST(bd->flag_use, 0)) { - BITSET_SET(bd->flag_def, 0); + for (unsigned c = 0; c < 4; c++) { + if ((inst->dst.writemask & (1 << c)) && + !BITSET_TEST(bd->flag_use, c)) { + BITSET_SET(bd->flag_def, c); + } } } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index e79a9f3b5b9..1fb1773f856 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -193,7 +193,9 @@ vec4_visitor::nir_emit_if(nir_if *if_stmt) vec4_instruction *inst = emit(MOV(dst_null_d(), condition)); inst->conditional_mod = BRW_CONDITIONAL_NZ; - emit(IF(BRW_PREDICATE_NORMAL)); + /* We can just predicate based on the X channel, as the condition only + * goes on its own line */ + emit(IF(BRW_PREDICATE_ALIGN16_REPLICATE_X)); nir_emit_cf_list(&if_stmt->then_list); @@ -806,6 +808,16 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) break; } + case nir_intrinsic_shader_clock: { + /* We cannot do anything if there is an event, so ignore it for now */ + const src_reg shader_clock = get_timestamp(); + const enum brw_reg_type type = brw_type_for_base_type(glsl_type::uvec2_type); + + dest = get_nir_dest(instr->dest, type); + emit(MOV(dest, shader_clock)); + break; + } + default: unreachable("Unknown intrinsic"); } @@ -1144,26 +1156,10 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) case nir_op_ball_iequal3: case nir_op_ball_fequal4: case nir_op_ball_iequal4: { - dst_reg tmp = dst_reg(this, glsl_type::bool_type); - - switch (instr->op) { - case nir_op_ball_fequal2: - case nir_op_ball_iequal2: - tmp.writemask = WRITEMASK_XY; - break; - 
case nir_op_ball_fequal3: - case nir_op_ball_iequal3: - tmp.writemask = WRITEMASK_XYZ; - break; - case nir_op_ball_fequal4: - case nir_op_ball_iequal4: - tmp.writemask = WRITEMASK_XYZW; - break; - default: - unreachable("not reached"); - } + unsigned swiz = + brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]); - emit(CMP(tmp, op[0], op[1], + emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz), brw_conditional_for_nir_comparison(instr->op))); emit(MOV(dst, src_reg(0))); inst = emit(MOV(dst, src_reg(~0))); @@ -1177,26 +1173,10 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) case nir_op_bany_inequal3: case nir_op_bany_fnequal4: case nir_op_bany_inequal4: { - dst_reg tmp = dst_reg(this, glsl_type::bool_type); + unsigned swiz = + brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]); - switch (instr->op) { - case nir_op_bany_fnequal2: - case nir_op_bany_inequal2: - tmp.writemask = WRITEMASK_XY; - break; - case nir_op_bany_fnequal3: - case nir_op_bany_inequal3: - tmp.writemask = WRITEMASK_XYZ; - break; - case nir_op_bany_fnequal4: - case nir_op_bany_inequal4: - tmp.writemask = WRITEMASK_XYZW; - break; - default: - unreachable("not reached"); - } - - emit(CMP(tmp, op[0], op[1], + emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz), brw_conditional_for_nir_comparison(instr->op))); emit(MOV(dst, src_reg(0))); @@ -1321,26 +1301,18 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) case nir_op_ufind_msb: case nir_op_ifind_msb: { - src_reg temp = src_reg(this, glsl_type::uint_type); - - inst = emit(FBH(dst_reg(temp), op[0])); - inst->dst.writemask = WRITEMASK_XYZW; + emit(FBH(retype(dst, BRW_REGISTER_TYPE_UD), op[0])); /* FBH counts from the MSB side, while GLSL's findMSB() wants the count * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then * subtract the result from 31 to convert the MSB count into an LSB count. */ + src_reg src(dst); + emit(CMP(dst_null_d(), src, src_reg(-1), BRW_CONDITIONAL_NZ)); - /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */ - temp.swizzle = BRW_SWIZZLE_NOOP; - emit(MOV(dst, temp)); - - src_reg src_tmp = src_reg(dst); - emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ)); - - src_tmp.negate = true; - inst = emit(ADD(dst, src_tmp, src_reg(31))); + inst = emit(ADD(dst, src, src_reg(31))); inst->predicate = BRW_PREDICATE_NORMAL; + inst->src[0].negate = true; break; } @@ -1461,11 +1433,11 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) case nir_op_bany2: case nir_op_bany3: case nir_op_bany4: { - dst_reg tmp = dst_reg(this, glsl_type::bool_type); - tmp.writemask = brw_writemask_for_size(nir_op_infos[instr->op].input_sizes[0]); - - emit(CMP(tmp, op[0], src_reg(0), BRW_CONDITIONAL_NZ)); + unsigned swiz = + brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]); + emit(CMP(dst_null_d(), swizzle(op[0], swiz), src_reg(0), + BRW_CONDITIONAL_NZ)); emit(MOV(dst, src_reg(0))); inst = emit(MOV(dst, src_reg(~0))); inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 6d155285820..92b089d7ff6 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -883,6 +883,18 @@ vec4_visitor::emit_texture(ir_texture_opcode op, uint32_t sampler, src_reg sampler_reg) { + /* The sampler can only meaningfully compute LOD for fragment shader + * messages. 
For all other stages, we change the opcode to TXL and hardcode + * the LOD to 0. + * + * textureQueryLevels() is implemented in terms of TXS so we need to pass a + * valid LOD argument. + */ + if (op == ir_tex || op == ir_query_levels) { + assert(lod.file == BAD_FILE); + lod = src_reg(0.0f); + } + enum opcode opcode; switch (op) { case ir_tex: opcode = SHADER_OPCODE_TXL; break; diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 5db4b3a86af..0b805b1c0c4 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -311,7 +311,8 @@ brw_vs_populate_key(struct brw_context *brw, key->program_string_id = vp->id; if (ctx->Transform.ClipPlanesEnabled != 0 && - ctx->API == API_OPENGL_COMPAT && + (ctx->API == API_OPENGL_COMPAT || + ctx->API == API_OPENGLES) && vp->program.Base.ClipDistanceArraySize == 0) { key->nr_userclip_plane_consts = _mesa_logbase2(ctx->Transform.ClipPlanesEnabled) + 1; diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c index f65258a52a5..d7473845c72 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c @@ -177,8 +177,8 @@ brw_upload_vs_abo_surfaces(struct brw_context *brw) if (prog) { /* BRW_NEW_VS_PROG_DATA */ - brw_upload_abo_surfaces(brw, prog, &brw->vs.base, - &brw->vs.prog_data->base.base); + brw_upload_abo_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_VERTEX], + &brw->vs.base, &brw->vs.prog_data->base.base); } } diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index 6ebe6481c32..f88f8d59196 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -1029,7 +1029,7 @@ const struct brw_tracked_state brw_cs_ubo_surfaces = { void brw_upload_abo_surfaces(struct brw_context *brw, - struct gl_shader_program *prog, + struct gl_shader *shader, struct brw_stage_state *stage_state, struct brw_stage_prog_data *prog_data) { @@ -1037,21 +1037,22 @@ brw_upload_abo_surfaces(struct brw_context *brw, uint32_t *surf_offsets = &stage_state->surf_offset[prog_data->binding_table.abo_start]; - for (unsigned i = 0; i < prog->NumAtomicBuffers; i++) { - struct gl_atomic_buffer_binding *binding = - &ctx->AtomicBufferBindings[prog->AtomicBuffers[i].Binding]; - struct intel_buffer_object *intel_bo = - intel_buffer_object(binding->BufferObject); - drm_intel_bo *bo = intel_bufferobj_buffer( - brw, intel_bo, binding->Offset, intel_bo->Base.Size - binding->Offset); - - brw->vtbl.emit_buffer_surface_state(brw, &surf_offsets[i], bo, - binding->Offset, BRW_SURFACEFORMAT_RAW, - bo->size - binding->Offset, 1, true); - } + if (shader && shader->NumAtomicBuffers) { + for (unsigned i = 0; i < shader->NumAtomicBuffers; i++) { + struct gl_atomic_buffer_binding *binding = + &ctx->AtomicBufferBindings[shader->AtomicBuffers[i]->Binding]; + struct intel_buffer_object *intel_bo = + intel_buffer_object(binding->BufferObject); + drm_intel_bo *bo = intel_bufferobj_buffer( + brw, intel_bo, binding->Offset, intel_bo->Base.Size - binding->Offset); + + brw->vtbl.emit_buffer_surface_state(brw, &surf_offsets[i], bo, + binding->Offset, BRW_SURFACEFORMAT_RAW, + bo->size - binding->Offset, 1, true); + } - if (prog->NumAtomicBuffers) brw->ctx.NewDriverState |= BRW_NEW_SURFACES; + } } static void @@ -1063,8 +1064,8 @@ brw_upload_wm_abo_surfaces(struct brw_context *brw) if (prog) { /* BRW_NEW_FS_PROG_DATA */ - 
brw_upload_abo_surfaces(brw, prog, &brw->wm.base, - &brw->wm.prog_data->base); + brw_upload_abo_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_FRAGMENT], + &brw->wm.base, &brw->wm.prog_data->base); } } @@ -1088,8 +1089,8 @@ brw_upload_cs_abo_surfaces(struct brw_context *brw) if (prog) { /* BRW_NEW_CS_PROG_DATA */ - brw_upload_abo_surfaces(brw, prog, &brw->cs.base, - &brw->cs.prog_data->base); + brw_upload_abo_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_COMPUTE], + &brw->cs.base, &brw->cs.prog_data->base); } } diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c index 4068f2844a2..2634e6ba6fd 100644 --- a/src/mesa/drivers/dri/i965/gen6_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c @@ -60,6 +60,23 @@ get_attr_override(const struct brw_vue_map *vue_map, int urb_entry_read_offset, /* Find the VUE slot for this attribute. */ int slot = vue_map->varying_to_slot[fs_attr]; + /* Viewport and Layer are stored in the VUE header. We need to override + * them to zero if earlier stages didn't write them, as GL requires that + * they read back as zero when not explicitly set. + */ + if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) { + unsigned override = + ATTRIBUTE_0_OVERRIDE_X | ATTRIBUTE_0_OVERRIDE_W | + ATTRIBUTE_CONST_0000 << ATTRIBUTE_0_CONST_SOURCE_SHIFT; + + if (!(vue_map->slots_valid & VARYING_BIT_LAYER)) + override |= ATTRIBUTE_0_OVERRIDE_Y; + if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT)) + override |= ATTRIBUTE_0_OVERRIDE_Z; + + return override; + } + /* If there was only a back color written but not front, use back * as the color instead of undefined */ @@ -159,14 +176,30 @@ calculate_attr_overrides(const struct brw_context *brw, uint16_t *attr_overrides, uint32_t *point_sprite_enables, uint32_t *flat_enables, - uint32_t *urb_entry_read_length) + uint32_t *urb_entry_read_length, + uint32_t *urb_entry_read_offset) { - const int urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET; uint32_t max_source_attr = 0; *point_sprite_enables = 0; *flat_enables = 0; + *urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET; + + /* BRW_NEW_FRAGMENT_PROGRAM + * + * If the fragment shader reads VARYING_SLOT_LAYER, then we need to pass in + * the full vertex header. Otherwise, we can program the SF to start + * reading at an offset of 1 (2 varying slots) to skip unnecessary data: + * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gen4-5 + * - VARYING_SLOT_{PSIZ,LAYER} and VARYING_SLOT_POS on gen6+ + */ + + bool fs_needs_vue_header = brw->fragment_program->Base.InputsRead & + (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT); + + *urb_entry_read_offset = fs_needs_vue_header ? 0 : 1; + /* _NEW_LIGHT */ bool shade_model_flat = brw->ctx.Light.ShadeModel == GL_FLAT; @@ -228,7 +261,7 @@ calculate_attr_overrides(const struct brw_context *brw, /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */ uint16_t attr_override = point_sprite ? 
0 : get_attr_override(&brw->vue_map_geom_out, - urb_entry_read_offset, attr, + *urb_entry_read_offset, attr, brw->ctx.VertexProgram._TwoSideEnabled, &max_source_attr); @@ -276,7 +309,6 @@ upload_sf_state(struct brw_context *brw) bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer); const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1; - const int urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET; float point_size; uint16_t attr_overrides[16]; uint32_t point_sprite_origin; @@ -411,8 +443,10 @@ upload_sf_state(struct brw_context *brw) * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA */ uint32_t urb_entry_read_length; + uint32_t urb_entry_read_offset; calculate_attr_overrides(brw, attr_overrides, &point_sprite_enables, - &flat_enables, &urb_entry_read_length); + &flat_enables, &urb_entry_read_length, + &urb_entry_read_offset); dw1 |= (urb_entry_read_length << GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT | urb_entry_read_offset << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT); diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c index 6aeb0cb243f..2d7c04f4ad2 100644 --- a/src/mesa/drivers/dri/i965/gen7_cs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c @@ -285,3 +285,34 @@ const struct brw_tracked_state gen7_cs_push_constants = { }, .emit = gen7_upload_cs_push_constants, }; + +/** + * Creates a new CS constant buffer reflecting the current CS program's + * constants, if needed by the CS program. + */ +static void +brw_upload_cs_pull_constants(struct brw_context *brw) +{ + struct brw_stage_state *stage_state = &brw->cs.base; + + /* BRW_NEW_COMPUTE_PROGRAM */ + struct brw_compute_program *cp = + (struct brw_compute_program *) brw->compute_program; + + /* BRW_NEW_CS_PROG_DATA */ + const struct brw_stage_prog_data *prog_data = &brw->cs.prog_data->base; + + /* _NEW_PROGRAM_CONSTANTS */ + brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program.Base, + stage_state, prog_data, true); +} + +const struct brw_tracked_state brw_cs_pull_constants = { + .dirty = { + .mesa = _NEW_PROGRAM_CONSTANTS, + .brw = BRW_NEW_BATCH | + BRW_NEW_COMPUTE_PROGRAM | + BRW_NEW_CS_PROG_DATA, + }, + .emit = brw_upload_cs_pull_constants, +}; diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c index 698b3d491bc..b1f13aceba4 100644 --- a/src/mesa/drivers/dri/i965/gen7_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c @@ -40,7 +40,6 @@ upload_sbe_state(struct brw_context *brw) uint32_t point_sprite_enables; uint32_t flat_enables; int i; - const int urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET; uint16_t attr_overrides[16]; /* _NEW_BUFFERS */ bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer); @@ -65,8 +64,10 @@ upload_sbe_state(struct brw_context *brw) * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA */ uint32_t urb_entry_read_length; + uint32_t urb_entry_read_offset; calculate_attr_overrides(brw, attr_overrides, &point_sprite_enables, - &flat_enables, &urb_entry_read_length); + &flat_enables, &urb_entry_read_length, + &urb_entry_read_offset); dw1 |= urb_entry_read_length << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT | urb_entry_read_offset << GEN7_SBE_URB_ENTRY_READ_OFFSET_SHIFT; diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index 8f0507413a7..10e433b1d59 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -95,6 +95,11 @@ gen8_upload_ps_extra(struct brw_context 
*brw, !brw_color_buffer_write_enabled(brw)) dw1 |= GEN8_PSX_SHADER_HAS_UAV; + if (prog_data->computed_stencil) { + assert(brw->gen >= 9); + dw1 |= GEN9_PSX_SHADER_COMPUTES_STENCIL; + } + BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_PS_EXTRA << 16 | (2 - 2)); OUT_BATCH(dw1); diff --git a/src/mesa/drivers/dri/i965/gen8_sf_state.c b/src/mesa/drivers/dri/i965/gen8_sf_state.c index 6b655ee493e..8b6f31f3be6 100644 --- a/src/mesa/drivers/dri/i965/gen8_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen8_sf_state.c @@ -37,6 +37,7 @@ upload_sbe(struct brw_context *brw) uint32_t num_outputs = brw->wm.prog_data->num_varying_inputs; uint16_t attr_overrides[VARYING_SLOT_MAX]; uint32_t urb_entry_read_length; + uint32_t urb_entry_read_offset; uint32_t point_sprite_enables; uint32_t flat_enables; int sbe_cmd_length; @@ -66,7 +67,8 @@ upload_sbe(struct brw_context *brw) calculate_attr_overrides(brw, attr_overrides, &point_sprite_enables, &flat_enables, - &urb_entry_read_length); + &urb_entry_read_length, + &urb_entry_read_offset); /* Typically, the URB entry read length and offset should be programmed in * 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active stage @@ -78,7 +80,7 @@ upload_sbe(struct brw_context *brw) */ dw1 |= urb_entry_read_length << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT | - BRW_SF_URB_ENTRY_READ_OFFSET << GEN8_SBE_URB_ENTRY_READ_OFFSET_SHIFT | + urb_entry_read_offset << GEN8_SBE_URB_ENTRY_READ_OFFSET_SHIFT | GEN8_SBE_FORCE_URB_ENTRY_READ_LENGTH | GEN8_SBE_FORCE_URB_ENTRY_READ_OFFSET; diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c index 18b86652fd2..140a6544983 100644 --- a/src/mesa/drivers/dri/i965/gen8_surface_state.c +++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c @@ -183,6 +183,14 @@ gen8_emit_buffer_surface_state(struct brw_context *brw, } static void +gen8_emit_fast_clear_color(struct brw_context *brw, + struct intel_mipmap_tree *mt, + uint32_t *surf) +{ + surf[7] |= mt->fast_clear_color_value; +} + +static void gen8_emit_texture_surface_state(struct brw_context *brw, struct intel_mipmap_tree *mt, GLenum target, @@ -284,11 +292,10 @@ gen8_emit_texture_surface_state(struct brw_context *brw, SET_FIELD((aux_mt->pitch / tile_w) - 1, GEN8_SURFACE_AUX_PITCH) | aux_mode; - } else { - surf[6] = 0; } - surf[7] = mt->fast_clear_color_value | + gen8_emit_fast_clear_color(brw, mt, surf); + surf[7] |= SET_FIELD(swizzle_to_scs(GET_SWZ(swizzle, 0)), GEN7_SURFACE_SCS_R) | SET_FIELD(swizzle_to_scs(GET_SWZ(swizzle, 1)), GEN7_SURFACE_SCS_G) | SET_FIELD(swizzle_to_scs(GET_SWZ(swizzle, 2)), GEN7_SURFACE_SCS_B) | @@ -302,11 +309,7 @@ gen8_emit_texture_surface_state(struct brw_context *brw, aux_mt->bo, 0, I915_GEM_DOMAIN_SAMPLER, (rw ? 
I915_GEM_DOMAIN_SAMPLER : 0)); - } else { - surf[10] = 0; - surf[11] = 0; } - surf[12] = 0; /* Emit relocation to surface contents */ drm_intel_bo_emit_reloc(brw->batch.bo, @@ -514,15 +517,13 @@ gen8_update_renderbuffer_surface(struct brw_context *brw, SET_FIELD((aux_mt->pitch / tile_w) - 1, GEN8_SURFACE_AUX_PITCH) | aux_mode; - } else { - surf[6] = 0; } - surf[7] = mt->fast_clear_color_value | - SET_FIELD(HSW_SCS_RED, GEN7_SURFACE_SCS_R) | - SET_FIELD(HSW_SCS_GREEN, GEN7_SURFACE_SCS_G) | - SET_FIELD(HSW_SCS_BLUE, GEN7_SURFACE_SCS_B) | - SET_FIELD(HSW_SCS_ALPHA, GEN7_SURFACE_SCS_A); + gen8_emit_fast_clear_color(brw, mt, surf); + surf[7] |= SET_FIELD(HSW_SCS_RED, GEN7_SURFACE_SCS_R) | + SET_FIELD(HSW_SCS_GREEN, GEN7_SURFACE_SCS_G) | + SET_FIELD(HSW_SCS_BLUE, GEN7_SURFACE_SCS_B) | + SET_FIELD(HSW_SCS_ALPHA, GEN7_SURFACE_SCS_A); assert(mt->offset % mt->cpp == 0); *((uint64_t *) &surf[8]) = mt->bo->offset64 + mt->offset; /* reloc */ @@ -533,11 +534,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw, offset + 10 * 4, aux_mt->bo, 0, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER); - } else { - surf[10] = 0; - surf[11] = 0; } - surf[12] = 0; drm_intel_bo_emit_reloc(brw->batch.bo, offset + 8 * 4, diff --git a/src/mesa/drivers/dri/i965/intel_debug.c b/src/mesa/drivers/dri/i965/intel_debug.c index f7c02c8a38d..c00d2e786f3 100644 --- a/src/mesa/drivers/dri/i965/intel_debug.c +++ b/src/mesa/drivers/dri/i965/intel_debug.c @@ -73,6 +73,8 @@ static const struct debug_control debug_control[] = { { "spill_fs", DEBUG_SPILL_FS }, { "spill_vec4", DEBUG_SPILL_VEC4 }, { "cs", DEBUG_CS }, + { "hex", DEBUG_HEX }, + { "nocompact", DEBUG_NO_COMPACTION }, { NULL, 0 } }; diff --git a/src/mesa/drivers/dri/i965/intel_debug.h b/src/mesa/drivers/dri/i965/intel_debug.h index 0a6e1b90b98..98bd7e93956 100644 --- a/src/mesa/drivers/dri/i965/intel_debug.h +++ b/src/mesa/drivers/dri/i965/intel_debug.h @@ -67,6 +67,8 @@ extern uint64_t INTEL_DEBUG; #define DEBUG_SPILL_FS (1ull << 31) #define DEBUG_SPILL_VEC4 (1ull << 32) #define DEBUG_CS (1ull << 33) +#define DEBUG_HEX (1ull << 34) +#define DEBUG_NO_COMPACTION (1ull << 35) #ifdef HAVE_ANDROID_PLATFORM #define LOG_TAG "INTEL-MESA" diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index 3f9afd16c71..4643ea3e87b 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -287,6 +287,7 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_conditional_render_inverted = true; ctx->Extensions.ARB_draw_buffers_blend = true; ctx->Extensions.ARB_ES3_compatibility = true; + ctx->Extensions.ARB_fragment_layer_viewport = true; ctx->Extensions.ARB_sample_shading = true; ctx->Extensions.ARB_shading_language_420pack = true; ctx->Extensions.ARB_shading_language_packing = true; @@ -324,6 +325,7 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_framebuffer_no_attachments = true; ctx->Extensions.ARB_gpu_shader5 = true; ctx->Extensions.ARB_shader_atomic_counters = true; + ctx->Extensions.ARB_shader_clock = true; ctx->Extensions.ARB_shader_image_load_store = true; ctx->Extensions.ARB_shader_image_size = true; ctx->Extensions.ARB_shader_texture_image_samples = true; @@ -358,6 +360,7 @@ intelInitExtensions(struct gl_context *ctx) if (brw->gen >= 9) { ctx->Extensions.KHR_texture_compression_astc_ldr = true; ctx->Extensions.KHR_texture_compression_astc_hdr = true; + ctx->Extensions.ARB_shader_stencil_export = true; } if (ctx->API == API_OPENGL_CORE) diff 
--git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c index 5a6b0dd1ec5..3a4a53a07e6 100644 --- a/src/mesa/drivers/dri/i965/intel_fbo.c +++ b/src/mesa/drivers/dri/i965/intel_fbo.c @@ -343,19 +343,15 @@ intel_image_target_renderbuffer_storage(struct gl_context *ctx, if (image->planar_format && image->planar_format->nplanes > 1) { _mesa_error(ctx, GL_INVALID_OPERATION, "glEGLImageTargetRenderbufferStorage(planar buffers are not " - "supported as render targets."); + "supported as render targets.)"); return; } /* __DRIimage is opaque to the core so it has to be checked here */ - switch (image->format) { - case MESA_FORMAT_R8G8B8A8_UNORM: + if (!brw->format_supported_as_render_target[image->format]) { _mesa_error(ctx, GL_INVALID_OPERATION, - "glEGLImageTargetRenderbufferStorage(unsupported image format"); + "glEGLImageTargetRenderbufferStorage(unsupported image format)"); return; - break; - default: - break; } irb = intel_renderbuffer(rb); diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index 590c45d93ea..fb95fb629ad 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -1357,7 +1357,16 @@ set_max_gl_versions(struct intel_screen *screen) } } -static int +/** + * Return the revision (generally the revid field of the PCI header) of the + * graphics device. + * + * XXX: This function is useful to keep around even if it is not currently in + * use. It is necessary for new platforms and revision specific workarounds or + * features. Please don't remove it so that we know it at least continues to + * build. + */ +static __attribute__((__unused__)) int brw_get_revision(int fd) { struct drm_i915_getparam gp; @@ -1416,8 +1425,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp) return false; intelScreen->deviceID = drm_intel_bufmgr_gem_get_devid(intelScreen->bufmgr); - intelScreen->devinfo = brw_get_device_info(intelScreen->deviceID, - brw_get_revision(psp->fd)); + intelScreen->devinfo = brw_get_device_info(intelScreen->deviceID); if (!intelScreen->devinfo) return false; diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp index 5f80f90a91d..62d39f70ec4 100644 --- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp @@ -84,7 +84,7 @@ instruction(bblock_t *block, int num) static bool cmod_propagation(fs_visitor *v) { - const bool print = false; + const bool print = getenv("TEST_DEBUG"); if (print) { fprintf(stderr, "= Before =\n"); diff --git a/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp new file mode 100644 index 00000000000..9aa2fcc7907 --- /dev/null +++ b/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp @@ -0,0 +1,822 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial 
portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Based on test_fs_cmod_propagation.cpp + */ + +#include <gtest/gtest.h> +#include "brw_vec4.h" +#include "brw_vec4_builder.h" +#include "brw_cfg.h" +#include "program/program.h" + +using namespace brw; + +class cmod_propagation_test : public ::testing::Test { + virtual void SetUp(); + +public: + struct brw_compiler *compiler; + struct brw_device_info *devinfo; + struct gl_context *ctx; + struct gl_shader_program *shader_prog; + struct brw_vertex_program *vp; + vec4_visitor *v; +}; + +class cmod_propagation_vec4_visitor : public vec4_visitor +{ +public: + cmod_propagation_vec4_visitor(struct brw_compiler *compiler, + nir_shader *shader) + : vec4_visitor(compiler, NULL, NULL, NULL, shader, NULL, + false, -1) {} + +protected: + /* Dummy implementation for pure virtual methods */ + virtual dst_reg *make_reg_for_system_value(int location, + const glsl_type *type) + { + unreachable("Not reached"); + } + + virtual void setup_payload() + { + unreachable("Not reached"); + } + + virtual void emit_prolog() + { + unreachable("Not reached"); + } + + virtual void emit_program_code() + { + unreachable("Not reached"); + } + + virtual void emit_thread_end() + { + unreachable("Not reached"); + } + + virtual void emit_urb_write_header(int mrf) + { + unreachable("Not reached"); + } + + virtual vec4_instruction *emit_urb_write_opcode(bool complete) + { + unreachable("Not reached"); + } +}; + + +void cmod_propagation_test::SetUp() +{ + ctx = (struct gl_context *)calloc(1, sizeof(*ctx)); + compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler)); + devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo)); + compiler->devinfo = devinfo; + + vp = ralloc(NULL, struct brw_vertex_program); + + nir_shader *shader = nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL); + + v = new cmod_propagation_vec4_visitor(compiler, shader); + + _mesa_init_gl_program(&vp->program.Base, GL_VERTEX_SHADER, 0); + + devinfo->gen = 4; +} + +static vec4_instruction * +instruction(bblock_t *block, int num) +{ + vec4_instruction *inst = (vec4_instruction *)block->start(); + for (int i = 0; i < num; i++) { + inst = (vec4_instruction *)inst->next; + } + return inst; +} + +static bool +cmod_propagation(vec4_visitor *v) +{ + const bool print = getenv("TEST_DEBUG"); + + if (print) { + fprintf(stderr, "= Before =\n"); + v->dump_instructions(); + } + + bool ret = v->opt_cmod_propagation(); + + if (print) { + fprintf(stderr, "\n= After =\n"); + v->dump_instructions(); + } + + return ret; +} + +TEST_F(cmod_propagation_test, basic) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::float_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + src_reg zero(0.0f); + dst_reg dest_null = bld.null_reg_f(); + dest_null.writemask = WRITEMASK_X; + + bld.ADD(dest, src0, src1); + bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add dest.x src0.xxxx src1.xxxx + * 1: cmp.ge.f0 null.x dest.xxxx 
0.0f + * + * = After = + * 0: add.ge.f0 dest.x src0.xxxx src1.xxxx + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, basic_different_dst_writemask) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::float_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + src_reg zero(0.0f); + dst_reg dest_null = bld.null_reg_f(); + + bld.ADD(dest, src0, src1); + bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add dest.x src0 src1 + * 1: cmp.ge.f0 null.xyzw dest 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, andz_one) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::int_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg zero(0.0f); + src_reg one(1); + + bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L); + set_condmod(BRW_CONDITIONAL_Z, + bld.AND(bld.null_reg_d(), src_reg(dest), one)); + + /* = Before = + * 0: cmp.l.f0 dest:F src0:F 0F + * 1: and.z.f0 null:D dest:D 1D + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_EQ, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, non_cmod_instruction) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::uint_type); + src_reg src0 = src_reg(v, glsl_type::uint_type); + src_reg zero(0u); + bld.FBL(dest, src0); + bld.CMP(bld.null_reg_ud(), src_reg(dest), zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: fbl dest src0 + * 1: cmp.ge.f0 null dest 0u + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_FBL, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, intervening_flag_write) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + 
dst_reg dest = dst_reg(v, glsl_type::float_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + src_reg src2 = src_reg(v, glsl_type::float_type); + src_reg zero(0.0f); + bld.ADD(dest, src0, src1); + bld.CMP(bld.null_reg_f(), src2, zero, BRW_CONDITIONAL_GE); + bld.CMP(bld.null_reg_f(), src_reg(dest), zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add dest src0 src1 + * 1: cmp.ge.f0 null src2 0.0f + * 2: cmp.ge.f0 null dest 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod); +} + +TEST_F(cmod_propagation_test, intervening_flag_read) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest0 = dst_reg(v, glsl_type::float_type); + dst_reg dest1 = dst_reg(v, glsl_type::float_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + src_reg src2 = src_reg(v, glsl_type::float_type); + src_reg zero(0.0f); + bld.ADD(dest0, src0, src1); + set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero)); + bld.CMP(bld.null_reg_f(), src_reg(dest0), zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add dest0 src0 src1 + * 1: (+f0) sel dest1 src2 0.0f + * 2: cmp.ge.f0 null dest0 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod); +} + +TEST_F(cmod_propagation_test, intervening_dest_write) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::vec4_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + src_reg src2 = src_reg(v, glsl_type::vec2_type); + src_reg zero(0.0f); + bld.ADD(offset(dest, 2), src0, src1); + bld.emit(SHADER_OPCODE_TEX, dest, src2) + ->regs_written = 4; + bld.CMP(bld.null_reg_f(), offset(src_reg(dest), 2), zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add dest+2 src0 src1 + * 1: tex rlen 4 dest+0 src2 + * 2: cmp.ge.f0 null dest+2 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(SHADER_OPCODE_TEX, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NONE, 
instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod); +} + +TEST_F(cmod_propagation_test, intervening_flag_read_same_value) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest0 = dst_reg(v, glsl_type::float_type); + dst_reg dest1 = dst_reg(v, glsl_type::float_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + src_reg src2 = src_reg(v, glsl_type::float_type); + src_reg zero(0.0f); + dst_reg dest_null = bld.null_reg_f(); + dest_null.writemask = WRITEMASK_X; + + set_condmod(BRW_CONDITIONAL_GE, bld.ADD(dest0, src0, src1)); + set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero)); + bld.CMP(dest_null, src_reg(dest0), zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add.ge.f0 dest0 src0 src1 + * 1: (+f0) sel dest1 src2 0.0f + * 2: cmp.ge.f0 null.x dest0 0.0f + * + * = After = + * 0: add.ge.f0 dest0 src0 src1 + * 1: (+f0) sel dest1 src2 0.0f + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate); +} + +TEST_F(cmod_propagation_test, negate) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::float_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + src_reg zero(0.0f); + bld.ADD(dest, src0, src1); + src_reg tmp_src = src_reg(dest); + tmp_src.negate = true; + dst_reg dest_null = bld.null_reg_f(); + dest_null.writemask = WRITEMASK_X; + bld.CMP(dest_null, tmp_src, zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add dest src0 src1 + * 1: cmp.ge.f0 null.x -dest 0.0f + * + * = After = + * 0: add.le.f0 dest src0 src1 + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_LE, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, movnz) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::float_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + dst_reg dest_null = bld.null_reg_f(); + dest_null.writemask = WRITEMASK_X; + + bld.CMP(dest, src0, src1, BRW_CONDITIONAL_L); + set_condmod(BRW_CONDITIONAL_NZ, + bld.MOV(dest_null, src_reg(dest))); + + /* = Before = + * + * 0: cmp.l.f0 dest:F src0:F src1:F + * 1: mov.nz.f0 null.x dest:F + * + * = After = + * 0: cmp.l.f0 dest src0:F src1:F + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 
0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, different_types_cmod_with_zero) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::int_type); + src_reg src0 = src_reg(v, glsl_type::int_type); + src_reg src1 = src_reg(v, glsl_type::int_type); + src_reg zero(0.0f); + bld.ADD(dest, src0, src1); + bld.CMP(bld.null_reg_f(), retype(src_reg(dest), BRW_REGISTER_TYPE_F), zero, + BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add dest:D src0:D src1:D + * 1: cmp.ge.f0 null:F dest:F 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, andnz_non_one) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::int_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg zero(0.0f); + src_reg nonone(38); + + bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L); + set_condmod(BRW_CONDITIONAL_NZ, + bld.AND(bld.null_reg_d(), src_reg(dest), nonone)); + + /* = Before = + * 0: cmp.l.f0 dest:F src0:F 0F + * 1: and.nz.f0 null:D dest:D 38D + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod); +} + +/* Note that basic is using glsl_type:float types, while this one is using + * glsl_type::vec4 */ +TEST_F(cmod_propagation_test, basic_vec4) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::vec4_type); + src_reg src0 = src_reg(v, glsl_type::vec4_type); + src_reg src1 = src_reg(v, glsl_type::vec4_type); + src_reg zero(0.0f); + + bld.MUL(dest, src0, src1); + bld.CMP(bld.null_reg_f(), src_reg(dest), zero, BRW_CONDITIONAL_NZ); + + /* = Before = + * 0: mul dest.xyzw src0.xyzw src1.xyzw + * 1: cmp.nz.f0.0 null.xyzw dest.xyzw 0.0f + * + * = After = + * 0: mul.nz.f0.0 dest.xyzw src0.xyzw src1.xyzw + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, basic_vec4_different_dst_writemask) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::vec4_type); + dest.writemask = WRITEMASK_X; + src_reg src0 = src_reg(v, glsl_type::vec4_type); + src_reg src1 = src_reg(v, glsl_type::vec4_type); + src_reg zero(0.0f); + dst_reg dest_null = bld.null_reg_f(); + + bld.MUL(dest, src0, src1); + bld.CMP(dest_null, src_reg(dest), zero, 
BRW_CONDITIONAL_NZ); + + /* = Before = + * 0: mul dest.x src0 src1 + * 1: cmp.nz.f0.0 null dest 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, mad_one_component_vec4) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::vec4_type); + dest.writemask = WRITEMASK_X; + src_reg src0 = src_reg(v, glsl_type::vec4_type); + src_reg src1 = src_reg(v, glsl_type::vec4_type); + src_reg src2 = src_reg(v, glsl_type::vec4_type); + src0.swizzle = src1.swizzle = src2.swizzle = BRW_SWIZZLE_XXXX; + src2.negate = true; + src_reg zero(0.0f); + src_reg tmp(dest); + tmp.swizzle = BRW_SWIZZLE_XXXX; + dst_reg dest_null = bld.null_reg_f(); + dest_null.writemask = WRITEMASK_X; + + bld.MAD(dest, src0, src1, src2); + bld.CMP(dest_null, tmp, zero, BRW_CONDITIONAL_L); + + /* = Before = + * + * 0: mad dest.x:F src0.xxxx:F src10.xxxx:F -src2.xxxx:F + * 1: cmp.l.f0.0 null.x:F dest.xxxx:F 0.0f + * + * = After = + * 0: mad.l.f0 dest.x:F src0.xxxx:F src10.xxxx:F -src2.xxxx:F + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_MAD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, mad_more_one_component_vec4) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::vec4_type); + dest.writemask = WRITEMASK_XW; + src_reg src0 = src_reg(v, glsl_type::vec4_type); + src_reg src1 = src_reg(v, glsl_type::vec4_type); + src_reg src2 = src_reg(v, glsl_type::vec4_type); + src0.swizzle = src1.swizzle = src2.swizzle = BRW_SWIZZLE_XXXX; + src2.negate = true; + src_reg zero(0.0f); + src_reg tmp(dest); + tmp.swizzle = BRW_SWIZZLE_XXXX; + dst_reg dest_null = bld.null_reg_f(); + + bld.MAD(dest, src0, src1, src2); + bld.CMP(dest_null, tmp, zero, BRW_CONDITIONAL_L); + + /* = Before = + * + * 0: mad dest.xw:F src0.xxxx:F src10.xxxx:F -src2.xxxx:F + * 1: cmp.l.f0.0 null:F dest.xxxx:F zeroF + * + * = After = + * (No changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_MAD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, cmp_mov_vec4) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::ivec4_type); + dest.writemask = WRITEMASK_X; + src_reg src0 = src_reg(v, glsl_type::ivec4_type); + src0.swizzle = BRW_SWIZZLE_XXXX; + src0.file = UNIFORM; + src_reg 
nonone = retype(src_reg(16), BRW_REGISTER_TYPE_D); + src_reg mov_src = src_reg(dest); + mov_src.swizzle = BRW_SWIZZLE_XXXX; + dst_reg dest_null = bld.null_reg_d(); + dest_null.writemask = WRITEMASK_X; + + bld.CMP(dest, src0, nonone, BRW_CONDITIONAL_GE); + set_condmod(BRW_CONDITIONAL_NZ, + bld.MOV(dest_null, mov_src)); + + /* = Before = + * + * 0: cmp.ge.f0 dest.x:D u.xxxx:D 16D + * 1: mov.nz.f0 null.x:D dest.xxxx:D + * + * = After = + * 0: cmp.ge.f0 dest.x:D u.xxxx:D 16D + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, mul_cmp_different_channels_vec4) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::vec4_type); + src_reg src0 = src_reg(v, glsl_type::vec4_type); + src_reg src1 = src_reg(v, glsl_type::vec4_type); + src_reg zero(0.0f); + src_reg cmp_src = src_reg(dest); + cmp_src.swizzle = BRW_SWIZZLE4(0,1,3,2); + + bld.MUL(dest, src0, src1); + bld.CMP(bld.null_reg_f(), cmp_src, zero, BRW_CONDITIONAL_NZ); + + /* = Before = + * 0: mul dest src0 src1 + * 1: cmp.nz.f0.0 null dest.xywz 0.0f + * + * = After = + * (No changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod); +} diff --git a/src/mesa/drivers/dri/nouveau/nouveau_context.c b/src/mesa/drivers/dri/nouveau/nouveau_context.c index a049d9b8de7..cb854b81933 100644 --- a/src/mesa/drivers/dri/nouveau/nouveau_context.c +++ b/src/mesa/drivers/dri/nouveau/nouveau_context.c @@ -188,7 +188,7 @@ nouveau_context_init(struct gl_context *ctx, gl_api api, ctx->Extensions.EXT_blend_minmax = true; ctx->Extensions.EXT_texture_filter_anisotropic = true; ctx->Extensions.NV_texture_env_combine4 = true; - ctx->Const.MaxColorAttachments = 1; + ctx->Const.MaxDrawBuffers = ctx->Const.MaxColorAttachments = 1; /* This effectively disables 3D textures */ ctx->Const.Max3DTextureLevels = 1; |
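
The new opt_cmod_propagation() pass for the vec4 backend, exercised by the gtest suite added above, folds a compare-against-zero into the conditional modifier of the instruction that produced the compared value. As a rough, self-contained illustration of that idea (this is not code from the patch; the names toy_inst, toy_cmod_propagation and the toy IR are invented for this sketch, and the real pass must also check types, writemasks, swizzles and intervening flag accesses, which is what the negative tests such as intervening_flag_write and mul_cmp_different_channels_vec4 verify), the following standalone program mirrors the "basic" test case: add ; cmp.ge null, dest, 0.0f becomes add.ge.

/* Illustrative sketch only; not part of the Mesa patch above. */
#include <cassert>
#include <cstdio>
#include <vector>

/* Toy IR: just enough state to show the shape of the optimization. */
enum toy_opcode   { OP_ADD, OP_CMP };
enum toy_cond_mod { COND_NONE, COND_GE };

struct toy_inst {
   toy_opcode   op;
   int          dst;        /* register written; negative means the null reg */
   int          src0, src1; /* registers read; negative means immediate zero */
   toy_cond_mod mod;        /* conditional modifier updating the flag reg */
};

/* Fold "cmp.<mod> null, reg, 0" into the instruction that produced "reg"
 * when that instruction immediately precedes the CMP and carries no
 * conditional modifier yet, then delete the redundant CMP.
 */
static bool
toy_cmod_propagation(std::vector<toy_inst> &insts)
{
   bool progress = false;
   for (size_t i = 1; i < insts.size(); ) {
      toy_inst &cmp  = insts[i];
      toy_inst &prev = insts[i - 1];

      if (cmp.op == OP_CMP && cmp.dst < 0 && cmp.src1 < 0 &&
          prev.dst == cmp.src0 && prev.mod == COND_NONE) {
         prev.mod = cmp.mod;               /* e.g. add  ->  add.ge */
         insts.erase(insts.begin() + i);   /* drop the now-redundant cmp */
         progress = true;
      } else {
         i++;
      }
   }
   return progress;
}

int
main()
{
   /* 0: add  r2, r0, r1
    * 1: cmp.ge null, r2, 0
    */
   std::vector<toy_inst> insts = {
      { OP_ADD,  2, 0,  1, COND_NONE },
      { OP_CMP, -1, 2, -1, COND_GE   },
   };

   bool progress = toy_cmod_propagation(insts);
   assert(progress && insts.size() == 1 && insts[0].mod == COND_GE);
   printf("folded to add.ge: %zu instruction(s) remain\n", insts.size());
   return 0;
}

In the actual backend the same transformation is performed per flag channel, which is why the patch also extends vec4_live_variables::setup_def_use() to track flag defs and uses for each of the four channels instead of a single bit.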