diff options
author | Daniel Schürmann <[email protected]> | 2020-05-25 11:51:27 +0200 |
---|---|---|
committer | Marge Bot <[email protected]> | 2020-06-09 21:25:38 +0000 |
commit | 5cde4989d3c8c25b0ba2a11ec450625e30092b16 (patch) | |
tree | 11cb176982112bde27a0e29ae3cebce23b0126cd /src/amd | |
parent | 5446e3cf2e381d061e625291ce3d5d587db2e6e0 (diff) |
aco: remove unnecessary split- and create_vector instructions for subdword loads
This helps GFX6/7 by removing unnecessary shuffle code.
Reviewed-by: Rhys Perry <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5226>
Diffstat (limited to 'src/amd')
-rw-r--r-- | src/amd/compiler/aco_instruction_selection.cpp | 145 |
1 file changed, 66 insertions, 79 deletions
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 76462b1b23b..5ef3776cfa0 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -413,38 +413,66 @@ void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst) } } -/* this function trims subdword vectors: - * if dst is vgpr - split the src and create a shrunk version according to the mask. - * if dst is sgpr - split the src, but move the original to sgpr. */ -void trim_subdword_vector(isel_context *ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask) +void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size) { - assert(vec_src.type() == RegType::vgpr); - emit_split_vector(ctx, vec_src, num_components); - Builder bld(ctx->program, ctx->block); - std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems; - unsigned component_size = vec_src.bytes() / num_components; - RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword(); + if (offset.isTemp()) { + Temp tmp[4] = {vec, vec, vec, vec}; - unsigned k = 0; - for (unsigned i = 0; i < num_components; i++) { - if (mask & (1 << i)) - elems[k++] = emit_extract_vector(ctx, vec_src, i, rc); + if (vec.size() == 4) { + tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec); + } else if (vec.size() == 3) { + tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec); + } else if (vec.size() == 2) { + tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1]; + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec); + } + for (unsigned i = 0; i < dst.size(); i++) + tmp[i] = 
bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset); + + vec = tmp[0]; + if (dst.size() == 2) + vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]); + + offset = Operand(0u); + } + + unsigned num_components = dst.bytes() / component_size; + if (vec.regClass() == dst.regClass()) { + assert(offset.constantValue() == 0); + bld.copy(Definition(dst), vec); + emit_split_vector(ctx, dst, num_components); + return; } + emit_split_vector(ctx, vec, vec.bytes() / component_size); + std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; + RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword(); + + assert(offset.constantValue() % component_size == 0); + unsigned skip = offset.constantValue() / component_size; + for (unsigned i = 0; i < num_components; i++) + elems[i] = emit_extract_vector(ctx, vec, i + skip, rc); + + /* if dst is vgpr - split the src and create a shrunk version according to the mask. */ if (dst.type() == RegType::vgpr) { - assert(dst.bytes() == k * component_size); - aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, k, 1)}; - for (unsigned i = 0; i < k; i++) - vec->operands[i] = Operand(elems[i]); - vec->definitions[0] = Definition(dst); - bld.insert(std::move(vec)); + aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + for (unsigned i = 0; i < num_components; i++) + create_vec->operands[i] = Operand(elems[i]); + create_vec->definitions[0] = Definition(dst); + bld.insert(std::move(create_vec)); + + /* if dst is sgpr - split the src, but move the original to sgpr. */ + } else if (skip) { + vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec); + byte_align_scalar(ctx, vec, offset, dst); } else { - // TODO: alignbyte if mask doesn't start with 1? 
- assert(mask & 1); - assert(dst.size() == vec_src.size()); - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src); + assert(dst.size() == vec.size()); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec); } + ctx->allocated_vec.emplace(dst.id(), elems); } @@ -3023,38 +3051,6 @@ uint32_t widen_mask(uint32_t mask, unsigned multiplier) return new_mask; } -void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst) -{ - Builder bld(ctx->program, ctx->block); - if (offset.isTemp()) { - Temp tmp[4] = {vec, vec, vec, vec}; - - if (vec.size() == 4) { - tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec); - } else if (vec.size() == 3) { - tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec); - } else if (vec.size() == 2) { - tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1]; - bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec); - } - for (unsigned i = 0; i < dst.size(); i++) - tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset); - - vec = tmp[0]; - if (dst.size() == 2) - vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]); - - offset = Operand(0u); - } - - if (vec.bytes() == dst.bytes() && offset.constantValue() == 0) - bld.copy(Definition(dst), vec); - else - trim_subdword_vector(ctx, vec, dst, vec.bytes(), ((1 << dst.bytes()) - 1) << offset.constantValue()); -} - struct LoadEmitInfo { Operand offset; Temp dst; @@ -3185,8 +3181,15 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align, reduced_const_offset, byte_align ? 
Temp() : info->dst); + /* the callback wrote directly to dst */ + if (val == info->dst) { + assert(num_vals == 0); + emit_split_vector(ctx, info->dst, info->num_components); + return; + } + /* shift result right if needed */ - if (byte_align) { + if (info->component_size < 4) { Operand align((uint32_t)byte_align); if (byte_align == -1) { if (offset.isConstant()) @@ -3197,15 +3200,12 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) align = offset; } - if (align.isTemp() || align.constantValue()) { - assert(val.bytes() >= load_size && "unimplemented"); - Temp new_val = bld.tmp(RegClass::get(val.type(), load_size)); - if (val.type() == RegType::sgpr) - byte_align_scalar(ctx, val, align, new_val); - else - byte_align_vector(ctx, val, align, new_val); - val = new_val; - } + assert(val.bytes() >= load_size && "unimplemented"); + if (val.type() == RegType::sgpr) + byte_align_scalar(ctx, val, align, info->dst); + else + byte_align_vector(ctx, val, align, info->dst, component_size); + return; } /* add result to list and advance */ @@ -3221,13 +3221,6 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) vals[num_vals++] = val; } - /* the callback wrote directly to dst */ - if (vals[0] == info->dst) { - assert(num_vals == 1); - emit_split_vector(ctx, info->dst, info->num_components); - return; - } - /* create array of components */ unsigned components_split = 0; std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec; @@ -3479,9 +3472,6 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info, mubuf->definitions[0] = Definition(val); bld.insert(std::move(mubuf)); - if (bytes_size < 4) - val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, bytes_size)), val, Operand(0u)); - return val; } @@ -3554,9 +3544,6 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo *info, bld.insert(std::move(flat)); } - if (bytes_size < 4) - val = bld.pseudo(aco_opcode::p_extract_vector, 
bld.def(RegClass::get(RegType::vgpr, bytes_size)), val, Operand(0u)); - return val; } |