diff options
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 73 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_builder.h | 8 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp | 24 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_cse.cpp | 4 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 16 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 57 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 3 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp | 2 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp | 2 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 2 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_ir_fs.h | 3 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp | 6 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_shader.h | 1 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp | 2 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp | 2 |
15 files changed, 107 insertions, 98 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 802aa9f76f4..0244f593149 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -76,11 +76,10 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, case FIXED_GRF: case MRF: case ATTR: - this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size), - REG_SIZE); + this->size_written = dst.component_size(exec_size); break; case BAD_FILE: - this->regs_written = 0; + this->size_written = 0; break; case IMM: case UNIFORM: @@ -192,7 +191,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4); fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL, vec4_result, surf_index, vec4_offset); - inst->regs_written = 4 * bld.dispatch_width() / 8; + inst->size_written = 4 * bld.dispatch_width() / 8 * REG_SIZE; if (type_sz(dst.type) == 8) { shuffle_32bit_load_result_to_64bit_data( @@ -244,7 +243,7 @@ fs_inst::equals(fs_inst *inst) const bool fs_inst::overwrites_reg(const fs_reg &reg) const { - return reg.in_range(dst, regs_written); + return reg.in_range(dst, DIV_ROUND_UP(size_written, REG_SIZE)); } bool @@ -357,7 +356,7 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const if (reg.file != VGRF || reg.offset / REG_SIZE != 0 || reg.stride == 0) return false; - if (grf_alloc.sizes[reg.nr] != this->regs_written) + if (grf_alloc.sizes[reg.nr] * REG_SIZE != this->size_written) return false; for (int i = 0; i < this->sources; i++) { @@ -2548,7 +2547,7 @@ fs_visitor::opt_sampler_eot() for (unsigned i = 0; i < FB_WRITE_LOGICAL_NUM_SRCS; i++) { if (i == FB_WRITE_LOGICAL_SRC_COLOR0) { if (!fb_write->src[i].equals(tex_inst->dst) || - fb_write->regs_read(i) != tex_inst->regs_written) + fb_write->regs_read(i) * REG_SIZE != tex_inst->size_written) return false; } else if (i != FB_WRITE_LOGICAL_SRC_COMPONENTS) { if (fb_write->src[i].file != 
BAD_FILE) @@ -2564,7 +2563,7 @@ fs_visitor::opt_sampler_eot() tex_inst->offset |= fb_write->target << 24; tex_inst->eot = true; tex_inst->dst = ibld.null_reg_ud(); - tex_inst->regs_written = 0; + tex_inst->size_written = 0; fb_write->remove(cfg->blocks[cfg->num_blocks - 1]); /* Marking EOT is sufficient, lower_logical_sends() will notice the EOT @@ -2606,7 +2605,7 @@ fs_visitor::opt_register_renaming() if (depth == 0 && inst->dst.file == VGRF && - alloc.sizes[inst->dst.nr] == inst->regs_written && + alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written && !inst->is_partial_write()) { if (remap[dst] == -1) { remap[dst] = dst; @@ -2730,7 +2729,7 @@ fs_visitor::compute_to_mrf() unsigned regs_left = (1 << regs_read(inst, 0)) - 1; foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { - if (regions_overlap(scan_inst->dst, scan_inst->regs_written * REG_SIZE, + if (regions_overlap(scan_inst->dst, scan_inst->size_written, inst->src[0], inst->regs_read(0) * REG_SIZE)) { /* Found the last thing to write our reg we want to turn * into a compute-to-MRF. @@ -2749,7 +2748,7 @@ fs_visitor::compute_to_mrf() * a time. */ if (scan_inst->dst.offset / REG_SIZE < inst->src[0].offset / REG_SIZE || - scan_inst->dst.offset / REG_SIZE + scan_inst->regs_written > + scan_inst->dst.offset / REG_SIZE + DIV_ROUND_UP(scan_inst->size_written, REG_SIZE) > inst->src[0].offset / REG_SIZE + inst->regs_read(0)) break; @@ -2768,7 +2767,8 @@ fs_visitor::compute_to_mrf() /* Clear the bits for any registers this instruction overwrites. 
*/ regs_left &= ~mask_relative_to( - inst->src[0], scan_inst->dst, scan_inst->regs_written); + inst->src[0], scan_inst->dst, DIV_ROUND_UP(scan_inst->size_written, + REG_SIZE)); if (!regs_left) break; } @@ -2793,8 +2793,8 @@ fs_visitor::compute_to_mrf() if (interfered) break; - if (regions_overlap(scan_inst->dst, scan_inst->regs_written * REG_SIZE, - inst->dst, inst->regs_written * REG_SIZE)) { + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->dst, inst->size_written)) { /* If somebody else writes our MRF here, we can't * compute-to-MRF before that. */ @@ -2803,7 +2803,7 @@ fs_visitor::compute_to_mrf() if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 && regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE, - inst->dst, inst->regs_written * REG_SIZE)) { + inst->dst, inst->size_written)) { /* Found a SEND instruction, which means that there are * live values in MRFs from base_mrf to base_mrf + * scan_inst->mlen - 1. Don't go pushing our MRF write up @@ -2822,11 +2822,12 @@ fs_visitor::compute_to_mrf() regs_left = (1 << regs_read(inst, 0)) - 1; foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { - if (regions_overlap(scan_inst->dst, scan_inst->regs_written * REG_SIZE, + if (regions_overlap(scan_inst->dst, scan_inst->size_written, inst->src[0], inst->regs_read(0) * REG_SIZE)) { /* Clear the bits for any registers this instruction overwrites. */ regs_left &= ~mask_relative_to( - inst->src[0], scan_inst->dst, scan_inst->regs_written); + inst->src[0], scan_inst->dst, DIV_ROUND_UP(scan_inst->size_written, + REG_SIZE)); const unsigned rel_offset = (reg_offset(scan_inst->dst) - reg_offset(inst->src[0])) / REG_SIZE; @@ -2841,7 +2842,7 @@ fs_visitor::compute_to_mrf() /* Clear the COMPR4 bit if the generating instruction is not * compressed. 
*/ - if (scan_inst->regs_written < 2) + if (scan_inst->size_written < 2 * REG_SIZE) scan_inst->dst.nr &= ~BRW_MRF_COMPR4; } else { @@ -3024,7 +3025,7 @@ fs_visitor::remove_duplicate_mrf_writes() /* Clear out any MRF move records whose sources got overwritten. */ for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) { if (last_mrf_move[i] && - regions_overlap(inst->dst, inst->regs_written * REG_SIZE, + regions_overlap(inst->dst, inst->size_written, last_mrf_move[i]->src[0], last_mrf_move[i]->regs_read(0) * REG_SIZE)) { last_mrf_move[i] = NULL; @@ -4603,7 +4604,7 @@ get_fpu_lowered_simd_width(const struct gen_device_info *devinfo, * which is the one that is going to limit the overall execution size of * the instruction due to this rule. */ - unsigned reg_count = inst->regs_written; + unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE); for (unsigned i = 0; i < inst->sources; i++) reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i)); @@ -4630,13 +4631,14 @@ get_fpu_lowered_simd_width(const struct gen_device_info *devinfo, */ if (devinfo->gen < 8) { for (unsigned i = 0; i < inst->sources; i++) { - if (inst->regs_written == 2 && + if (DIV_ROUND_UP(inst->size_written, REG_SIZE) == 2 && inst->regs_read(i) != 0 && inst->regs_read(i) != 2 && !is_uniform(inst->src[i]) && !(type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 && - type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1)) - max_width = MIN2(max_width, inst->exec_size / - inst->regs_written); + type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1)) { + const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE); + max_width = MIN2(max_width, inst->exec_size / reg_count); + } } } @@ -4681,9 +4683,10 @@ get_fpu_lowered_simd_width(const struct gen_device_info *devinfo, * In this situation we calculate the maximum size of the split * instructions so they only ever write to a single register. 
*/ - if (devinfo->gen < 8 && inst->regs_written > 1 && + if (devinfo->gen < 8 && inst->size_written > REG_SIZE && !inst->force_writemask_all) { - const unsigned channels_per_grf = inst->exec_size / inst->regs_written; + const unsigned channels_per_grf = inst->exec_size / + DIV_ROUND_UP(inst->size_written, REG_SIZE); unsigned exec_type_size = 0; for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file != BAD_FILE) @@ -5087,8 +5090,7 @@ needs_dst_copy(const fs_builder &lbld, const fs_inst *inst) * the results of multiple lowered instructions in order to make sure that * they end up arranged correctly in the original destination region. */ - if (inst->regs_written * REG_SIZE > - inst->dst.component_size(inst->exec_size)) + if (inst->size_written > inst->dst.component_size(inst->exec_size)) return true; /* If the lowered execution size is larger than the original the result of @@ -5111,7 +5113,7 @@ needs_dst_copy(const fs_builder &lbld, const fs_inst *inst) * group which could cause one of the lowered instructions to overwrite * the data read from the same source by other lowered instructions. */ - if (regions_overlap(inst->dst, inst->regs_written * REG_SIZE, + if (regions_overlap(inst->dst, inst->size_written, inst->src[i], inst->regs_read(i) * REG_SIZE) && !inst->dst.equals(inst->src[i])) return true; @@ -5138,8 +5140,8 @@ emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst) /* Specified channel group from the destination region. */ const fs_reg dst = horiz_offset(inst->dst, lbld.group()); - const unsigned dst_size = inst->regs_written * REG_SIZE / - inst->dst.component_size(inst->exec_size); + const unsigned dst_size = inst->size_written / + inst->dst.component_size(inst->exec_size); if (needs_dst_copy(lbld, inst)) { const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size); @@ -5191,7 +5193,7 @@ fs_visitor::lower_simd_width() * original or the lowered instruction, whichever is lower. 
*/ const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width); - const unsigned dst_size = inst->regs_written * REG_SIZE / + const unsigned dst_size = inst->size_written / inst->dst.component_size(inst->exec_size); assert(!inst->writes_accumulator && !inst->mlen); @@ -5215,9 +5217,8 @@ fs_visitor::lower_simd_width() split_inst.src[j] = emit_unzip(lbld, block, inst, j); split_inst.dst = emit_zip(lbld, block, inst); - split_inst.regs_written = DIV_ROUND_UP( - split_inst.dst.component_size(lower_width) * dst_size, - REG_SIZE); + split_inst.size_written = + split_inst.dst.component_size(lower_width) * dst_size; lbld.emit(split_inst); } @@ -5314,7 +5315,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) switch (inst->dst.file) { case VGRF: fprintf(file, "vgrf%d", inst->dst.nr); - if (alloc.sizes[inst->dst.nr] != inst->regs_written || + if (alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written || inst->dst.offset % REG_SIZE) fprintf(file, "+%d.%d", inst->dst.offset / REG_SIZE, inst->dst.offset % REG_SIZE); diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h index 483672fbd96..bae151ca042 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_builder.h +++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h @@ -569,11 +569,11 @@ namespace brw { { instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources); inst->header_size = header_size; - inst->regs_written = header_size; + inst->size_written = header_size * REG_SIZE; for (unsigned i = header_size; i < sources; i++) { - inst->regs_written += - DIV_ROUND_UP(dispatch_width() * type_sz(src[i].type) * - dst.stride, REG_SIZE); + inst->size_written += + ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride, + REG_SIZE); } return inst; diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index 10f0a5b2820..0e239d28d44 100644 --- 
a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -43,7 +43,7 @@ namespace { /* avoid conflict with opt_copy_propagation_elements */ struct acp_entry : public exec_node { fs_reg dst; fs_reg src; - uint8_t regs_written; + uint8_t size_written; uint8_t regs_read; enum opcode opcode; bool saturate; @@ -368,7 +368,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) * that entry is writing. */ if (!region_contained_in(inst->src[arg], inst->regs_read(arg), - entry->dst, entry->regs_written)) + entry->dst, DIV_ROUND_UP(entry->size_written, + REG_SIZE))) return false; /* we can't generally copy-propagate UD negations because we @@ -524,7 +525,8 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) * that entry is writing. */ if (!region_contained_in(inst->src[i], inst->regs_read(i), - entry->dst, entry->regs_written)) + entry->dst, DIV_ROUND_UP(entry->size_written, + REG_SIZE))) continue; /* If the type sizes don't match each channel of the instruction is @@ -770,8 +772,8 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, /* kill the destination from the ACP */ if (inst->dst.file == VGRF) { foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) { - if (regions_overlap(entry->dst, entry->regs_written * REG_SIZE, - inst->dst, inst->regs_written * REG_SIZE)) + if (regions_overlap(entry->dst, entry->size_written, + inst->dst, inst->size_written)) entry->remove(); } @@ -784,7 +786,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, * _any_ of the registers that it reads */ if (regions_overlap(entry->src, entry->regs_read * REG_SIZE, - inst->dst, inst->regs_written * REG_SIZE)) + inst->dst, inst->size_written)) entry->remove(); } } @@ -797,7 +799,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, acp_entry *entry = ralloc(copy_prop_ctx, acp_entry); entry->dst = 
inst->dst; entry->src = inst->src[0]; - entry->regs_written = inst->regs_written; + entry->size_written = inst->size_written; entry->regs_read = inst->regs_read(0); entry->opcode = inst->opcode; entry->saturate = inst->saturate; @@ -808,14 +810,14 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, for (int i = 0; i < inst->sources; i++) { int effective_width = i < inst->header_size ? 8 : inst->exec_size; assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0); - int regs_written = effective_width * - type_sz(inst->src[i].type) / REG_SIZE; + const unsigned size_written = effective_width * + type_sz(inst->src[i].type); if (inst->src[i].file == VGRF) { acp_entry *entry = ralloc(copy_prop_ctx, acp_entry); entry->dst = inst->dst; entry->dst.offset += offset * REG_SIZE; entry->src = inst->src[i]; - entry->regs_written = regs_written; + entry->size_written = size_written; entry->regs_read = inst->regs_read(i); entry->opcode = inst->opcode; if (!entry->dst.equals(inst->src[i])) { @@ -824,7 +826,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, ralloc_free(entry); } } - offset += regs_written; + offset += DIV_ROUND_UP(size_written, REG_SIZE); } } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp index 4744142a4b6..2acbfea71f0 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp @@ -185,7 +185,7 @@ instructions_match(fs_inst *a, fs_inst *b, bool *negate) a->dst.type == b->dst.type && a->offset == b->offset && a->mlen == b->mlen && - a->regs_written == b->regs_written && + a->size_written == b->size_written && a->base_mrf == b->base_mrf && a->eot == b->eot && a->header_size == b->header_size && @@ -296,7 +296,7 @@ fs_visitor::opt_cse_local(bblock_t *block) /* dest <- temp */ if (!inst->dst.is_null()) { - assert(inst->regs_written == entry->generator->regs_written); + assert(inst->size_written == 
entry->generator->size_written); assert(inst->dst.type == entry->tmp.type); const fs_builder ibld(this, block, inst); diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 12ab7b3fe66..8a581c9f02c 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -357,13 +357,14 @@ void fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst, struct brw_reg payload) { + assert(inst->size_written % REG_SIZE == 0); brw_wm_prog_data *prog_data = reinterpret_cast<brw_wm_prog_data *>(this->prog_data); const unsigned surf_index = prog_data->binding_table.render_target_start + inst->target; gen9_fb_READ(p, dst, payload, surf_index, - inst->header_size, inst->regs_written, + inst->header_size, inst->size_written / REG_SIZE, prog_data->persample_dispatch); brw_mark_surface_used(&prog_data->base, surf_index); @@ -452,6 +453,7 @@ fs_generator::generate_urb_read(fs_inst *inst, struct brw_reg dst, struct brw_reg header) { + assert(inst->size_written % REG_SIZE == 0); assert(header.file == BRW_GENERAL_REGISTER_FILE); assert(header.type == BRW_REGISTER_TYPE_UD); @@ -467,7 +469,7 @@ fs_generator::generate_urb_read(fs_inst *inst, brw_inst_set_urb_per_slot_offset(p->devinfo, send, true); brw_inst_set_mlen(p->devinfo, send, inst->mlen); - brw_inst_set_rlen(p->devinfo, send, inst->regs_written); + brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE); brw_inst_set_header_present(p->devinfo, send, true); brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset); } @@ -625,6 +627,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src struct brw_reg surface_index, struct brw_reg sampler_index) { + assert(inst->size_written % REG_SIZE == 0); int msg_type = -1; uint32_t simd_mode; uint32_t return_format; @@ -895,7 +898,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src surface + 
base_binding_table_index, sampler % 16, msg_type, - inst->regs_written, + inst->size_written / REG_SIZE, inst->mlen, inst->header_size != 0, simd_mode, @@ -932,7 +935,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src 0 /* surface */, 0 /* sampler */, msg_type, - inst->regs_written, + inst->size_written / REG_SIZE, inst->mlen /* mlen */, inst->header_size != 0 /* header */, simd_mode, @@ -1263,7 +1266,7 @@ fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst, */ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD; assert(inst->mlen == 3); - assert(inst->regs_written == 8); + assert(inst->size_written == 8 * REG_SIZE); rlen = 8; simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; } @@ -1399,6 +1402,7 @@ fs_generator::generate_pixel_interpolator_query(fs_inst *inst, struct brw_reg msg_data, unsigned msg_type) { + assert(inst->size_written % REG_SIZE == 0); assert(msg_data.type == BRW_REGISTER_TYPE_UD); brw_pixel_interpolator_query(p, @@ -1408,7 +1412,7 @@ fs_generator::generate_pixel_interpolator_query(fs_inst *inst, msg_type, msg_data, inst->mlen, - inst->regs_written); + inst->size_written / REG_SIZE); } diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index cd4005c0e60..42ed131854e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -1661,7 +1661,7 @@ emit_pixel_interpolater_send(const fs_builder &bld, inst = bld.emit(opcode, dst, payload, desc); inst->mlen = mlen; /* 2 floats per slot returned */ - inst->regs_written = 2 * bld.dispatch_width() / 8; + inst->size_written = 2 * bld.dispatch_width() / 8 * REG_SIZE; inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE; wm_prog_data->pulls_bary = true; @@ -2144,7 +2144,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst, unsigned read_components = num_components + first_component; fs_reg tmp = bld.vgrf(dst.type, read_components); inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, 
tmp, icp_handle); - inst->regs_written = read_components * type_sz(tmp_dst.type) / 4; + inst->size_written = read_components * type_sz(tmp_dst.type) / 4 * REG_SIZE; for (unsigned i = 0; i < num_components; i++) { bld.MOV(offset(tmp_dst, bld, i), offset(tmp, bld, i + first_component)); @@ -2152,7 +2152,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst, } else { inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst, icp_handle); - inst->regs_written = num_components * type_sz(tmp_dst.type) / 4; + inst->size_written = num_components * type_sz(tmp_dst.type) / 4 * REG_SIZE; } inst->offset = base_offset + offset_const->u32[0]; inst->mlen = 1; @@ -2166,7 +2166,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst, if (first_component != 0) { inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, payload); - inst->regs_written = read_components * type_sz(tmp_dst.type) / 4; + inst->size_written = read_components * type_sz(tmp_dst.type) / 4 * REG_SIZE; for (unsigned i = 0; i < num_components; i++) { bld.MOV(offset(tmp_dst, bld, i), offset(tmp, bld, i + first_component)); @@ -2174,7 +2174,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst, } else { inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst, payload); - inst->regs_written = num_components * type_sz(tmp_dst.type) / 4; + inst->size_written = num_components * type_sz(tmp_dst.type) / 4 * REG_SIZE; } inst->offset = base_offset; inst->mlen = 2; @@ -2204,7 +2204,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst, /* Read the whole VUE header (because of alignment) and read .w. 
*/ fs_reg tmp = bld.vgrf(dst.type, 4); inst->dst = tmp; - inst->regs_written = 4; + inst->size_written = 4 * REG_SIZE; bld.MOV(dst, offset(tmp, bld, 3)); } } @@ -2510,8 +2510,8 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, inst->offset = imm_offset; inst->mlen = 2; } - inst->regs_written = - ((num_components + first_component) * type_sz(dst.type) / 4); + inst->size_written = + ((num_components + first_component) * type_sz(dst.type) / 4) * REG_SIZE; /* If we are reading 64-bit data using 32-bit read messages we need * build proper 64-bit data elements by shuffling the low and high @@ -2535,7 +2535,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { assert(type_sz(dst.type) < 8); inst->dst = bld.vgrf(dst.type, 4); - inst->regs_written = 4; + inst->size_written = 4 * REG_SIZE; bld.MOV(dst, offset(inst->dst, bld, 3)); } @@ -2576,7 +2576,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle); inst->offset = 0; inst->mlen = 1; - inst->regs_written = 4; + inst->size_written = 4 * REG_SIZE; /* dst.xy = tmp.wz */ bld.MOV(dst, offset(tmp, bld, 3)); @@ -2584,11 +2584,11 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, break; } case GL_TRIANGLES: - /* DWord 4; hardcode offset = 1 and regs_written = 1 */ + /* DWord 4; hardcode offset = 1 and size_written = REG_SIZE */ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, patch_handle); inst->offset = 1; inst->mlen = 1; - inst->regs_written = 1; + inst->size_written = REG_SIZE; break; case GL_ISOLINES: /* All channels are undefined. 
*/ @@ -2606,7 +2606,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle); inst->offset = 1; inst->mlen = 1; - inst->regs_written = 4; + inst->size_written = 4 * REG_SIZE; /* Reswizzle: WZYX */ fs_reg srcs[4] = { @@ -2641,7 +2641,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, fs_reg tmp = bld.vgrf(dst.type, read_components); inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle); - inst->regs_written = read_components; + inst->size_written = read_components * REG_SIZE; for (unsigned i = 0; i < instr->num_components; i++) { bld.MOV(offset(dst, bld, i), offset(tmp, bld, i + first_component)); @@ -2649,7 +2649,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, } else { inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, patch_handle); - inst->regs_written = instr->num_components; + inst->size_written = instr->num_components * REG_SIZE; } inst->offset = imm_offset; inst->mlen = 1; @@ -2668,7 +2668,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, fs_reg tmp = bld.vgrf(dst.type, read_components); inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, payload); - inst->regs_written = read_components; + inst->size_written = read_components * REG_SIZE; for (unsigned i = 0; i < instr->num_components; i++) { bld.MOV(offset(dst, bld, i), offset(tmp, bld, i + first_component)); @@ -2676,7 +2676,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, } else { inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload); - inst->regs_written = instr->num_components; + inst->size_written = instr->num_components * REG_SIZE; } inst->offset = imm_offset; inst->mlen = 2; @@ -2976,7 +2976,7 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, fs_reg tmp = bld.vgrf(dest.type, read_components); inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle); - inst->regs_written = read_components; + inst->size_written = read_components 
* REG_SIZE; for (unsigned i = 0; i < instr->num_components; i++) { bld.MOV(offset(dest, bld, i), offset(tmp, bld, i + first_component)); @@ -2984,7 +2984,7 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, } else { inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle); - inst->regs_written = instr->num_components; + inst->size_written = instr->num_components * REG_SIZE; } inst->mlen = 1; inst->offset = imm_offset; @@ -3032,8 +3032,9 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, } inst->mlen = 2; inst->offset = imm_offset; - inst->regs_written = - ((num_components + first_component) * type_sz(dest.type) / 4); + inst->size_written = + ((num_components + first_component) * type_sz(dest.type) / 4) * + REG_SIZE; /* If we are reading 64-bit data using 32-bit read messages we need * build proper 64-bit data elements by shuffling the low and high @@ -3207,8 +3208,7 @@ fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS); fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs)); - inst->regs_written = 4 * inst->dst.component_size(inst->exec_size) / - REG_SIZE; + inst->size_written = 4 * inst->dst.component_size(inst->exec_size); return inst; } @@ -3223,8 +3223,7 @@ emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target) assert(bld.shader->devinfo->gen >= 9); fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst); inst->target = target; - inst->regs_written = 4 * inst->dst.component_size(inst->exec_size) / - REG_SIZE; + inst->size_written = 4 * inst->dst.component_size(inst->exec_size); return inst; } @@ -3903,7 +3902,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr const fs_builder ubld = bld.group(8, 0); const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp) - ->regs_written = 2; + ->size_written = 2 * REG_SIZE; break; } @@ -4338,7 +4337,7 @@ 
fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr src_payload, brw_imm_ud(index)); inst->header_size = 0; inst->mlen = 1; - inst->regs_written = 4; + inst->size_written = 4 * REG_SIZE; bld.MOV(retype(dest, ret_payload.type), component(ret_payload, 0)); brw_mark_surface_used(prog_data, index); @@ -4685,9 +4684,9 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) nir_ssa_def_components_read(&instr->dest.ssa): (1 << dest_size) - 1; assert(write_mask != 0); /* dead code should have been eliminated */ - inst->regs_written = util_last_bit(write_mask) * dispatch_width / 8; + inst->size_written = util_last_bit(write_mask) * dispatch_width / 8 * REG_SIZE; } else { - inst->regs_written = 4 * dispatch_width / 8; + inst->size_written = 4 * dispatch_width / 8 * REG_SIZE; } if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE) diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index 572735a379a..5c6f3d490f0 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -826,7 +826,8 @@ fs_visitor::choose_spill_reg(struct ra_graph *g) } if (inst->dst.file == VGRF) - spill_costs[inst->dst.nr] += inst->regs_written * loop_scale; + spill_costs[inst->dst.nr] += DIV_ROUND_UP(inst->size_written, REG_SIZE) + * loop_scale; switch (inst->opcode) { diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp index 3dd0fbfc1c1..310e8019fcb 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp @@ -204,7 +204,7 @@ fs_visitor::register_coalesce() continue; } dst_reg_offset[offset] = inst->dst.offset / REG_SIZE; - if (inst->regs_written > 1) + if (inst->size_written > REG_SIZE) dst_reg_offset[offset + 1] = inst->dst.offset / REG_SIZE + 1; mov[offset] = inst; channels_remaining -= 
regs_written(inst); diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp index 5fa7c42d2ee..37e893bb89e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp @@ -50,7 +50,7 @@ namespace brw { const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize); fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); - inst->regs_written = rsize * bld.dispatch_width() / 8; + inst->size_written = rsize * bld.dispatch_width() / 8 * REG_SIZE; inst->predicate = pred; return dst; } diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index d0f504c1a2e..5aea62c4cbc 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -97,7 +97,7 @@ fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components, /* We only care about one or two regs of response, but the sampler always * writes 4/8. */ - inst->regs_written = 4 * dispatch_width / 8; + inst->size_written = 4 * dispatch_width / 8 * REG_SIZE; return dest; } diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h index de08a691055..cea81e4646a 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_fs.h +++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h @@ -421,7 +421,8 @@ inline unsigned regs_written(const fs_inst *inst) { /* XXX - Take into account register-misaligned offsets correctly. 
*/ - return inst->regs_written; + assert(inst->dst.file != UNIFORM && inst->dst.file != IMM); + return DIV_ROUND_UP(inst->size_written, REG_SIZE); } /** diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index c12bf09e835..5e1e61683a2 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -1494,11 +1494,11 @@ fs_instruction_scheduler::choose_instruction_to_schedule() * single-result send is probably actually reducing register * pressure. */ - if (inst->regs_written <= inst->exec_size / 8 && - chosen_inst->regs_written > chosen_inst->exec_size / 8) { + if (inst->size_written <= inst->exec_size / 8 * REG_SIZE && + chosen_inst->size_written > chosen_inst->exec_size / 8 * REG_SIZE) { chosen = n; continue; - } else if (inst->regs_written > chosen_inst->regs_written) { + } else if (inst->size_written > chosen_inst->size_written) { continue; } } diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h index 66264b4ea7e..2173f3226e1 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.h +++ b/src/mesa/drivers/dri/i965/brw_shader.h @@ -138,6 +138,7 @@ struct backend_instruction { int8_t base_mrf; /**< First MRF in the SEND message, if mlen is nonzero. */ uint8_t target; /**< MRT target. */ uint8_t regs_written; /**< Number of registers written by the instruction. */ + unsigned size_written; /**< Data written to the destination register in bytes. 
*/ enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */ enum brw_conditional_mod conditional_mod; /**< BRW_CONDITIONAL_* */ diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp index 8ba7bc59481..f71c6ee1e42 100644 --- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp @@ -281,7 +281,7 @@ TEST_F(cmod_propagation_test, intervening_dest_write) fs_reg zero(brw_imm_f(0.0f)); bld.ADD(offset(dest, bld, 2), src0, src1); bld.emit(SHADER_OPCODE_TEX, dest, src2) - ->regs_written = 4; + ->size_written = 4 * REG_SIZE; bld.CMP(bld.null_reg_f(), offset(dest, bld, 2), zero, BRW_CONDITIONAL_GE); /* = Before = diff --git a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp index fd623a5e00f..680fe72dfd5 100644 --- a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp @@ -525,7 +525,7 @@ TEST_F(saturate_propagation_test, intervening_dest_write) fs_reg src2 = v->vgrf(glsl_type::vec2_type); bld.ADD(offset(dst0, bld, 2), src0, src1); bld.emit(SHADER_OPCODE_TEX, dst0, src2) - ->regs_written = 4; + ->size_written = 4 * REG_SIZE; set_saturate(true, bld.MOV(dst1, offset(dst0, bld, 2))); /* = Before = |