diff options
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_defines.h | 2 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 39 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.h | 7 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 54 |
4 files changed, 50 insertions, 52 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index d9b7f9aeb15..6414e69892d 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -727,7 +727,7 @@ enum opcode { FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7, FS_OPCODE_MOV_DISPATCH_TO_FLAGS, FS_OPCODE_DISCARD_JUMP, - FS_OPCODE_SET_GLOBAL_OFFSET, + FS_OPCODE_SET_SIMD4X2_OFFSET, FS_OPCODE_PACK_HALF_2x16_SPLIT, FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index b97a19e0510..5380abfe2f4 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2461,6 +2461,11 @@ fs_visitor::insert_gen4_send_dependency_workarounds() * scheduling full flexibility, while the conversion to native instructions * allows the post-register-allocation scheduler the best information * possible. + * + * Note that execution masking for setting up pull constant loads is special: + * the channels that need to be written are unrelated to the current execution + * mask, since a later instruction will use one of the result channels as a + * source operand for all 8 or 16 of its channels. */ void fs_visitor::lower_uniform_pull_constant_loads() @@ -2477,26 +2482,24 @@ fs_visitor::lower_uniform_pull_constant_loads() const_offset_reg.type == BRW_REGISTER_TYPE_UD); const_offset_reg.imm.u /= 16; fs_reg payload = fs_reg(this, glsl_type::uint_type); - struct brw_reg g0 = retype(brw_vec8_grf(0, 0), - BRW_REGISTER_TYPE_UD); - - fs_inst *setup1 = MOV(payload, fs_reg(g0)); - setup1->force_writemask_all = true; - /* We don't need the second half of this vgrf to be filled with g1 - * in the 16-wide case, but if we use force_uncompressed then live - * variable analysis won't consider this a def! 
+ + /* This is actually going to be a MOV, but since only the first dword + * is accessed, we have a special opcode to do just that one. Note + * that this needs to be an operation that will be considered a def + * by live variable analysis, or register allocation will explode. */ + fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET, + payload, const_offset_reg); + setup->force_writemask_all = true; - fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET, - payload, payload, - const_offset_reg); + setup->ir = inst->ir; + setup->annotation = inst->annotation; + inst->insert_before(setup); - setup1->ir = inst->ir; - setup1->annotation = inst->annotation; - inst->insert_before(setup1); - setup2->ir = inst->ir; - setup2->annotation = inst->annotation; - inst->insert_before(setup2); + /* Similarly, this will only populate the first 4 channels of the + * result register (since we only use smear values from 0-3), but we + * don't tell the optimizer. + */ inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7; inst->src[1] = payload; @@ -2533,7 +2536,7 @@ fs_visitor::dump_instruction(fs_inst *inst) case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: printf("uniform_pull_const_gen7"); break; - case FS_OPCODE_SET_GLOBAL_OFFSET: + case FS_OPCODE_SET_SIMD4X2_OFFSET: printf("set_global_offset"); break; default: diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index f7ccc7909e2..febd56bfe2e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -546,10 +546,9 @@ private: struct brw_reg index, struct brw_reg offset); void generate_mov_dispatch_to_flags(fs_inst *inst); - void generate_set_global_offset(fs_inst *inst, - struct brw_reg dst, - struct brw_reg src, - struct brw_reg offset); + void generate_set_simd4x2_offset(fs_inst *inst, + struct brw_reg dst, + struct brw_reg offset); void generate_discard_jump(fs_inst *inst); void generate_pack_half_2x16_split(fs_inst *inst, diff 
--git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp index 2391ad12026..712fef6e093 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp @@ -647,6 +647,8 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst, uint32_t surf_index = index.dw1.ud; assert(offset.file == BRW_GENERAL_REGISTER_FILE); + /* Reference just the dword we need, to avoid angering validate_reg(). */ + offset = brw_vec1_grf(offset.nr, 0); brw_push_insn_state(p); brw_set_compression_control(p, BRW_COMPRESSION_NONE); @@ -654,20 +656,22 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst, struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND); brw_pop_insn_state(p); + /* We use the SIMD4x2 mode because we want to end up with 4 components in + * the destination loaded consecutively from the same offset (which appears + * in the first component, and the rest are ignored). + */ + dst.width = BRW_WIDTH_4; brw_set_dest(p, send, dst); brw_set_src0(p, send, offset); - - uint32_t msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS; - uint32_t msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ; - bool header_present = true; - brw_set_dp_read_message(p, send, + brw_set_sampler_message(p, send, surf_index, - msg_control, - msg_type, - BRW_DATAPORT_READ_TARGET_DATA_CACHE, - 1, - header_present, - 1); + 0, /* LD message ignores sampler unit */ + GEN5_SAMPLER_MESSAGE_SAMPLE_LD, + 1, /* rlen */ + 1, /* mlen */ + false, /* no header */ + BRW_SAMPLER_SIMD_MODE_SIMD4X2, + 0); } void @@ -858,31 +862,23 @@ brw_reg_from_fs_reg(fs_reg *reg) } /** - * Sets the second dword of a vgrf for gen7+ message setup. + * Sets the first dword of a vgrf for gen7+ simd4x2 uniform pull constant + * sampler LD messages. * - * For setting up gen7 messages in VGRFs, we need to be able to set the second - * dword for some payloads where in the MRF world we'd have just used - * brw_message_reg(). 
We don't want to bake it into the send message's code - * generation because that means we don't get a chance to schedule the - * instructions. + * We don't want to bake it into the send message's code generation because + * that means we don't get a chance to schedule the instructions. */ void -fs_generator::generate_set_global_offset(fs_inst *inst, - struct brw_reg dst, - struct brw_reg src, - struct brw_reg value) +fs_generator::generate_set_simd4x2_offset(fs_inst *inst, + struct brw_reg dst, + struct brw_reg value) { - /* We use a matching src and dst to get the information on how this - * instruction works exposed to various optimization passes that would - * otherwise treat it as completely overwriting the dst. - */ - assert(src.file == dst.file && src.nr == dst.nr); assert(value.file == BRW_IMMEDIATE_VALUE); brw_push_insn_state(p); brw_set_compression_control(p, BRW_COMPRESSION_NONE); brw_set_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 2), value.type), value); + brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value); brw_pop_insn_state(p); } @@ -1298,8 +1294,8 @@ fs_generator::generate_code(exec_list *instructions) brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME); break; - case FS_OPCODE_SET_GLOBAL_OFFSET: - generate_set_global_offset(inst, dst, src[0], src[1]); + case FS_OPCODE_SET_SIMD4X2_OFFSET: + generate_set_simd4x2_offset(inst, dst, src[0]); break; case FS_OPCODE_PACK_HALF_2x16_SPLIT: |