author     Rhys Perry <[email protected]>    2019-12-09 12:18:51 +0000
committer  Rhys Perry <[email protected]>    2020-01-28 11:39:57 +0000
commit     525b1073474e070c8ade47856e649747ed12f775 (patch)
tree       070d56734ffeb71e9bc49cd5a7e48577c8135355 /src/amd
parent     4363a1f75b3a2638297c5d4f8dca06737bdab7fc (diff)
aco: rework vertex fetching a bit
This will make it easier to skip unused channels at the start and to
split unaligned loads on GFX10.

Signed-off-by: Rhys Perry <[email protected]>
Reviewed-by: Daniel Schürmann <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3086>
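For context, here is a minimal standalone C++ sketch (not part of the commit) of the
fallback strategy that get_fetch_data_format() introduces below. The names fetch_ok()
and pick_format_channels() are simplified stand-ins of my own; the real code consults
ac_data_format_info and returns V_008F0C_BUF_DATA_FORMAT_* values.

#include <cstdio>

/* Stand-in for check_vertex_fetch_size(): in this commit it only rejects
 * 3-channel 8-/16-bit fetches, for which no hardware data format exists. */
static bool fetch_ok(unsigned chan_byte_size, unsigned channels)
{
   return chan_byte_size == 4 || channels != 3;
}

/* Mirrors the channel-count adjustment in get_fetch_data_format(); the
 * return value indexes the 1/2/3/4-channel data-format tables. */
static unsigned pick_format_channels(unsigned chan_byte_size, unsigned offset,
                                     unsigned stride, unsigned *channels)
{
   unsigned num_channels = *channels;
   if (!fetch_ok(chan_byte_size, *channels)) {
      unsigned new_channels = num_channels + 1;
      /* first, assume more loads is worse and try using a larger data format */
      while (new_channels <= 4 && !fetch_ok(chan_byte_size, new_channels)) {
         new_channels++;
         /* don't make the attribute potentially out-of-bounds */
         if (offset + new_channels * chan_byte_size > stride)
            new_channels = 5;
      }
      if (new_channels == 5) {
         /* then try decreasing load size (at the cost of more loads) */
         new_channels = *channels;
         while (new_channels > 1 && !fetch_ok(chan_byte_size, new_channels))
            new_channels--;
      }
      if (new_channels < *channels)
         *channels = new_channels; /* caller then issues several smaller loads */
      num_channels = new_channels;
   }
   return num_channels;
}

int main()
{
   /* A 16-bit vec3 attribute: no 16_16_16 data format exists, so the format
    * is widened to 16_16_16_16 while only three channels are still loaded. */
   unsigned channels = 3;
   unsigned fmt_channels = pick_format_channels(2, 0, 16, &channels);
   printf("format channels: %u, loaded channels: %u\n", fmt_channels, channels);
   return 0;
}

Note that with this commit's check_vertex_fetch_size(), which only rejects 3-channel
sub-dword fetches, widening to four channels always succeeds, so the shrink path and
the multi-fetch loop in visit_load_input() appear to be groundwork for the GFX10
unaligned-load splitting mentioned in the message rather than paths taken here.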
Diffstat (limited to 'src/amd')
-rw-r--r--   src/amd/compiler/aco_builder_h.py                 1
-rw-r--r--   src/amd/compiler/aco_instruction_selection.cpp  197
-rw-r--r--   src/amd/compiler/aco_opcodes.py                   4
3 files changed, 128 insertions, 74 deletions
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index 18e4bf752ec..4e1d6f72b63 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -472,6 +472,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod
("smem", [Format.SMEM], 'SMEM_instruction', [(0, 4), (0, 3), (1, 0), (1, 3), (1, 2), (0, 0)]),
("ds", [Format.DS], 'DS_instruction', [(1, 1), (1, 2), (0, 3), (0, 4)]),
("mubuf", [Format.MUBUF], 'MUBUF_instruction', [(0, 4), (1, 3)]),
+ ("mtbuf", [Format.MTBUF], 'MTBUF_instruction', [(0, 4), (1, 3)]),
("mimg", [Format.MIMG], 'MIMG_instruction', [(0, 4), (1, 3), (0, 3), (1, 2)]), #TODO(pendingchaos): less shapes?
("exp", [Format.EXP], 'Export_instruction', [(0, 4)]),
("branch", [Format.PSEUDO_BRANCH], 'Pseudo_branch_instruction', itertools.product([0], [0, 1])),
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 17be816d020..11f7805b56d 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3066,32 +3066,59 @@ void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr
}
}
-unsigned get_num_channels_from_data_format(unsigned data_format)
+bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info,
+ unsigned offset, unsigned stride, unsigned channels)
{
- switch (data_format) {
+ unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
+ if (vtx_info->chan_byte_size != 4 && channels == 3)
+ return false;
+ return true;
+}
+
+uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info,
+ unsigned offset, unsigned stride, unsigned *channels)
+{
+ if (!vtx_info->chan_byte_size) {
+ *channels = vtx_info->num_channels;
+ return vtx_info->chan_format;
+ }
+
+ unsigned num_channels = *channels;
+ if (!check_vertex_fetch_size(ctx, vtx_info, offset, stride, *channels)) {
+ unsigned new_channels = num_channels + 1;
+ /* first, assume more loads is worse and try using a larger data format */
+ while (new_channels <= 4 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) {
+ new_channels++;
+ /* don't make the attribute potentially out-of-bounds */
+ if (offset + new_channels * vtx_info->chan_byte_size > stride)
+ new_channels = 5;
+ }
+
+ if (new_channels == 5) {
+ /* then try decreasing load size (at the cost of more loads) */
+ new_channels = *channels;
+ while (new_channels > 1 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels))
+ new_channels--;
+ }
+
+ if (new_channels < *channels)
+ *channels = new_channels;
+ num_channels = new_channels;
+ }
+
+ switch (vtx_info->chan_format) {
case V_008F0C_BUF_DATA_FORMAT_8:
+ return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
+ V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
case V_008F0C_BUF_DATA_FORMAT_16:
+ return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
+ V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
case V_008F0C_BUF_DATA_FORMAT_32:
- return 1;
- case V_008F0C_BUF_DATA_FORMAT_8_8:
- case V_008F0C_BUF_DATA_FORMAT_16_16:
- case V_008F0C_BUF_DATA_FORMAT_32_32:
- return 2;
- case V_008F0C_BUF_DATA_FORMAT_10_11_11:
- case V_008F0C_BUF_DATA_FORMAT_11_11_10:
- case V_008F0C_BUF_DATA_FORMAT_32_32_32:
- return 3;
- case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
- case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
- case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
- case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
- case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
- return 4;
- default:
- break;
+ return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
+ V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
}
-
- return 4;
+ unreachable("shouldn't reach here");
+ return V_008F0C_BUF_DATA_FORMAT_INVALID;
}
/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
@@ -3148,11 +3175,11 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
unsigned dfmt = attrib_format & 0xf;
-
unsigned nfmt = (attrib_format >> 4) & 0x7;
- unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt);
+ const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt);
+
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
- unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels);
+ unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
if (post_shuffle)
@@ -3183,53 +3210,74 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
get_arg(ctx, ctx->args->ac.vertex_id));
}
- if (attrib_stride != 0 && attrib_offset > attrib_stride) {
- index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index);
- attrib_offset = attrib_offset % attrib_stride;
- }
+ Temp channels[num_channels];
+ unsigned channel_start = 0;
+ bool direct_fetch = false;
- Operand soffset(0u);
- if (attrib_offset >= 4096) {
- soffset = bld.copy(bld.def(s1), Operand(attrib_offset));
- attrib_offset = 0;
- }
+ /* load channels */
+ while (channel_start < num_channels) {
+ unsigned fetch_size = num_channels - channel_start;
+ unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
+ unsigned fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size);
- aco_opcode opcode;
- switch (num_channels) {
- case 1:
- opcode = aco_opcode::tbuffer_load_format_x;
- break;
- case 2:
- opcode = aco_opcode::tbuffer_load_format_xy;
- break;
- case 3:
- opcode = aco_opcode::tbuffer_load_format_xyz;
- break;
- case 4:
- opcode = aco_opcode::tbuffer_load_format_xyzw;
- break;
- default:
- unreachable("Unimplemented load_input vector size");
- }
+ Temp fetch_index = index;
+ if (attrib_stride != 0 && fetch_offset > attrib_stride) {
+ fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index);
+ fetch_offset = fetch_offset % attrib_stride;
+ }
- Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst;
+ Operand soffset(0u);
+ if (fetch_offset >= 4096) {
+ soffset = bld.copy(bld.def(s1), Operand(fetch_offset / 4096 * 4096));
+ fetch_offset %= 4096;
+ }
- aco_ptr<MTBUF_instruction> mubuf{create_instruction<MTBUF_instruction>(opcode, Format::MTBUF, 3, 1)};
- mubuf->operands[0] = Operand(index);
- mubuf->operands[1] = Operand(list);
- mubuf->operands[2] = soffset;
- mubuf->definitions[0] = Definition(tmp);
- mubuf->idxen = true;
- mubuf->can_reorder = true;
- mubuf->dfmt = dfmt;
- mubuf->nfmt = nfmt;
- assert(attrib_offset < 4096);
- mubuf->offset = attrib_offset;
- ctx->block->instructions.emplace_back(std::move(mubuf));
+ aco_opcode opcode;
+ switch (fetch_size) {
+ case 1:
+ opcode = aco_opcode::tbuffer_load_format_x;
+ break;
+ case 2:
+ opcode = aco_opcode::tbuffer_load_format_xy;
+ break;
+ case 3:
+ opcode = aco_opcode::tbuffer_load_format_xyz;
+ break;
+ case 4:
+ opcode = aco_opcode::tbuffer_load_format_xyzw;
+ break;
+ default:
+ unreachable("Unimplemented load_input vector size");
+ }
+
+ Temp fetch_dst;
+ if (channel_start == 0 && fetch_size == dst.size() && !post_shuffle &&
+ (alpha_adjust == RADV_ALPHA_ADJUST_NONE || num_channels <= 3)) {
+ direct_fetch = true;
+ fetch_dst = dst;
+ } else {
+ fetch_dst = bld.tmp(RegType::vgpr, fetch_size);
+ }
- emit_split_vector(ctx, tmp, tmp.size());
+ Instruction *mtbuf = bld.mtbuf(opcode,
+ Definition(fetch_dst), fetch_index, list, soffset,
+ fetch_dfmt, nfmt, fetch_offset,
+ false, true).instr;
+ static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
- if (tmp.id() != dst.id()) {
+ emit_split_vector(ctx, fetch_dst, fetch_dst.size());
+
+ if (fetch_size == 1) {
+ channels[channel_start] = fetch_dst;
+ } else {
+ for (unsigned i = 0; i < MIN2(fetch_size, num_channels - channel_start); i++)
+ channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, v1);
+ }
+
+ channel_start += fetch_size;
+ }
+
+ if (!direct_fetch) {
bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
@@ -3238,13 +3286,18 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
+ std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
+ unsigned num_temp = 0;
for (unsigned i = 0; i < dst.size(); i++) {
unsigned idx = i + component;
- if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) {
- Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1);
- vec->operands[3] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha));
- } else if (idx < num_channels) {
- vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1));
+ if (idx < num_channels && channels[swizzle[idx]].id()) {
+ Temp channel = channels[swizzle[idx]];
+ if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE)
+ channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
+ vec->operands[i] = Operand(channel);
+
+ num_temp++;
+ elems[i] = channel;
} else if (is_float && idx == 3) {
vec->operands[i] = Operand(0x3f800000u);
} else if (!is_float && idx == 3) {
@@ -3256,8 +3309,10 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
vec->definitions[0] = Definition(dst);
ctx->block->instructions.emplace_back(std::move(vec));
emit_split_vector(ctx, dst, dst.size());
- }
+ if (num_temp == dst.size())
+ ctx->allocated_vec.emplace(dst.id(), elems);
+ }
} else if (ctx->stage == fragment_fs) {
nir_instr *off_instr = instr->src[0].ssa->parent_instr;
if (off_instr->type != nir_instr_type_load_const ||
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index d537133a6dc..d3b7252414f 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -77,7 +77,6 @@ class Format(Enum):
elif self == Format.MTBUF:
return [('unsigned', 'dfmt', None),
('unsigned', 'nfmt', None),
- ('unsigned', 'img_format', None),
('unsigned', 'offset', None),
('bool', 'offen', None),
('bool', 'idxen', 'false'),
@@ -85,8 +84,7 @@ class Format(Enum):
('bool', 'glc', 'false'),
('bool', 'dlc', 'false'),
('bool', 'slc', 'false'),
- ('bool', 'tfe', 'false'),
- ('bool', 'lds', 'false')]
+ ('bool', 'tfe', 'false')]
elif self == Format.MUBUF:
return [('unsigned', 'offset', None),
('bool', 'offen', None),