author    | Iago Toral Quiroga <[email protected]> | 2019-08-07 08:20:35 +0200
committer | Iago Toral Quiroga <[email protected]> | 2019-08-13 08:38:19 +0200
commit    | 99e9809cab8aacb326b35b3bf50c6cd33edab6b8 (patch)
tree      | 7cf55afcbb1a59cde0ec7ffaae11b5d1ace34d22 /src
parent    | 3d65d2a4883bcf0cdc2eb3a2eeafda1d3c784b9b (diff)
v3d: honor the write mask on store operations
v2:
- Fix incremental update of the const offset when we need to emit a sequence
with more than one write because of the writemask.
- Do not move the tmu write emission to a separate helper.
v3:
- Get the store writemask before the loop, use ffs to get the first component
to write, and clear writemask bits as we process the components (Eric); see
the sketch below.
- Simplified the code that figures out the number of components for the TMU
config, basing it on the number of TMU writes for stores and atomics.
v4:
- Code clean-ups (Eric).
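For illustration, a minimal standalone sketch of the writemask walk described in
v2/v3 above. This is not the driver code: emit_store_runs, BYTES_PER_COMP and the
printf are hypothetical stand-ins for the TMUD/TMUA emission in nir_to_vir.c, and
a non-empty mask of 32-bit components is assumed.

/* Sketch: ffs() finds the first enabled component, the run of consecutive
 * set bits after it becomes one store, and those bits are cleared before
 * the next pass.  Assumes writemask != 0 and 32-bit components.
 */
#include <assert.h>
#include <stdio.h>
#include <strings.h>            /* ffs() */

#define BYTES_PER_COMP 4        /* hypothetical: one 32-bit component */

static void
emit_store_runs(unsigned writemask, unsigned base_offset)
{
        assert(writemask != 0);
        do {
                /* First enabled component and the consecutive run after it. */
                int first = ffs(writemask) - 1;
                int last = first;
                while (writemask & (1u << (last + 1)))
                        last++;

                /* Stand-in for one TMUD/TMUA sequence covering [first, last]. */
                printf("store components %d..%d at offset %u\n",
                       first, last, base_offset + first * BYTES_PER_COMP);

                /* Clear the bits we just handled before the next pass. */
                for (int i = first; i <= last; i++)
                        writemask &= ~(1u << i);
        } while (writemask != 0);
}

int
main(void)
{
        emit_store_runs(0xd /* components 0, 2 and 3 */, 0);
        return 0;
}

With writemask 0xd (components 0, 2 and 3) this prints one store for component 0
at offset 0 and one for components 2..3 at offset 8, mirroring the per-run split
that the patch's do/while loop performs.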
Fixes:
KHR-GLES31.core.shader_image_load_store.advanced-cast-cs
KHR-GLES31.core.shader_image_load_store.advanced-cast-fs
KHR-GLES31.core.shader_storage_buffer_object.advanced-switchBuffers-cs
KHR-GLES31.core.shader_storage_buffer_object.advanced-switchPrograms-cs
KHR-GLES31.core.shader_storage_buffer_object.basic-operations-case1-cs
Reviewed-by: Eric Anholt <[email protected]>
Diffstat (limited to 'src')
-rw-r--r-- | src/broadcom/compiler/nir_to_vir.c | 205
1 file changed, 120 insertions, 85 deletions
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 3857f96a9bd..92252b03218 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -229,19 +229,19 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
         if (!dynamic_src)
                 const_offset = nir_src_as_uint(instr->src[offset_src]);
 
-        struct qreg offset;
+        struct qreg base_offset;
         if (instr->intrinsic == nir_intrinsic_load_uniform) {
                 const_offset += nir_intrinsic_base(instr);
-                offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
-                                     v3d_unit_data_create(0, const_offset));
+                base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+                                          v3d_unit_data_create(0, const_offset));
                 const_offset = 0;
         } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
                 uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
                 /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
                  * 1 (0 is gallium's constant buffer 0).
                  */
-                offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
-                                     v3d_unit_data_create(index, const_offset));
+                base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+                                          v3d_unit_data_create(index, const_offset));
                 const_offset = 0;
         } else if (is_shared_or_scratch) {
                 /* Shared and scratch variables have no buffer index, and all
@@ -250,109 +250,144 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                  */
                 if (instr->intrinsic == nir_intrinsic_load_scratch ||
                     instr->intrinsic == nir_intrinsic_store_scratch) {
-                        offset = c->spill_base;
+                        base_offset = c->spill_base;
                 } else {
-                        offset = c->cs_shared_offset;
+                        base_offset = c->cs_shared_offset;
                         const_offset += nir_intrinsic_base(instr);
                 }
         } else {
-                offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
-                                     nir_src_as_uint(instr->src[is_store ?
-                                                                1 : 0]));
+                base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
+                                          nir_src_as_uint(instr->src[is_store ?
+                                                                     1 : 0]));
         }
 
-        int tmu_writes = 1; /* address */
-        if (is_store) {
-                for (int i = 0; i < instr->num_components; i++) {
-                        vir_MOV_dest(c,
-                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
-                                     ntq_get_src(c, instr->src[0], i));
-                        tmu_writes++;
-                }
-        } else if (!is_load && !atomic_add_replaced) {
-                vir_MOV_dest(c,
-                             vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
-                             ntq_get_src(c, instr->src[1 + has_index], 0));
-                tmu_writes++;
-                if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
+        unsigned writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
+        uint32_t base_const_offset = const_offset;
+        int first_component = -1;
+        int last_component = -1;
+        do {
+                int tmu_writes = 1; /* address */
+
+                if (is_store) {
+                        /* Find the first set of consecutive components that
+                         * are enabled in the writemask and emit the TMUD
+                         * instructions for them.
+                         */
+                        first_component = ffs(writemask) - 1;
+                        last_component = first_component;
+                        while (writemask & BITFIELD_BIT(last_component + 1))
+                                last_component++;
+
+                        assert(first_component >= 0 &&
+                               first_component <= last_component &&
+                               last_component < instr->num_components);
+
+                        struct qreg tmud = vir_reg(QFILE_MAGIC,
+                                                   V3D_QPU_WADDR_TMUD);
+                        for (int i = first_component; i <= last_component; i++) {
+                                struct qreg data =
+                                        ntq_get_src(c, instr->src[0], i);
+                                vir_MOV_dest(c, tmud, data);
+                                tmu_writes++;
+                        }
+
+                        /* Update the offset for the TMU write based on the
+                         * the first component we are writing.
+                         */
+                        const_offset = base_const_offset + first_component * 4;
+
+                        /* Clear these components from the writemask */
+                        uint32_t written_mask =
+                                BITFIELD_RANGE(first_component, tmu_writes - 1);
+                        writemask &= ~written_mask;
+                } else if (!is_load && !atomic_add_replaced) {
                         vir_MOV_dest(c,
                                      vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
-                                     ntq_get_src(c, instr->src[2 + has_index],
-                                                 0));
+                                     ntq_get_src(c, instr->src[1 + has_index], 0));
                         tmu_writes++;
+                        if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
+                                vir_MOV_dest(c,
+                                             vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+                                             ntq_get_src(c, instr->src[2 + has_index],
+                                                         0));
+                                tmu_writes++;
+                        }
                 }
-        }
-
-        /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
-         * storing at the same time.
-         */
-        while (tmu_writes > 16 / c->threads)
-                c->threads /= 2;
-        /* The spec says that for atomics, the TYPE field is ignored, but that
-         * doesn't seem to be the case for CMPXCHG. Just use the number of
-         * tmud writes we did to decide the type (or choose "32bit" for atomic
-         * reads, which has been fine).
-         */
-        int num_components;
-        if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH)
-                num_components = 2;
-        else
-                num_components = instr->num_components;
-
-        uint32_t config = (0xffffff00 |
-                           tmu_op << 3|
-                           GENERAL_TMU_LOOKUP_PER_PIXEL);
-        if (num_components == 1) {
-                config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
-        } else {
-                config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2;
-        }
+                /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
+                 * storing at the same time.
+                 */
+                while (tmu_writes > 16 / c->threads)
+                        c->threads /= 2;
-        if (vir_in_nonuniform_control_flow(c)) {
-                vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
-                           V3D_QPU_PF_PUSHZ);
-        }
+                /* The spec says that for atomics, the TYPE field is ignored, but that
+                 * doesn't seem to be the case for CMPXCHG. Just use the number of
+                 * tmud writes we did to decide the type (or choose "32bit" for atomic
+                 * reads, which has been fine).
+                 */
+                uint32_t num_components;
+                if (is_load || atomic_add_replaced) {
+                        num_components = instr->num_components;
+                } else {
+                        assert(tmu_writes > 1);
+                        num_components = tmu_writes - 1;
+                }
-        struct qreg tmua;
-        if (config == ~0)
-                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
-        else
-                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+                uint32_t config = (0xffffff00 |
+                                   tmu_op << 3|
+                                   GENERAL_TMU_LOOKUP_PER_PIXEL);
+                if (num_components == 1) {
+                        config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
+                } else {
+                        config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2;
+                }
-        struct qinst *tmu;
-        if (dynamic_src) {
-                if (const_offset != 0) {
-                        offset = vir_ADD(c, offset,
-                                         vir_uniform_ui(c, const_offset));
+                if (vir_in_nonuniform_control_flow(c)) {
+                        vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+                                   V3D_QPU_PF_PUSHZ);
                 }
-                tmu = vir_ADD_dest(c, tmua, offset,
-                                   ntq_get_src(c, instr->src[offset_src], 0));
-        } else {
-                if (const_offset != 0) {
+
+                struct qreg tmua;
+                if (config == ~0)
+                        tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
+                else
+                        tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+
+                struct qinst *tmu;
+                if (dynamic_src) {
+                        struct qreg offset = base_offset;
+                        if (const_offset != 0) {
+                                offset = vir_ADD(c, offset,
+                                                 vir_uniform_ui(c, const_offset));
+                        }
                         tmu = vir_ADD_dest(c, tmua, offset,
-                                           vir_uniform_ui(c, const_offset));
+                                           ntq_get_src(c, instr->src[offset_src], 0));
                 } else {
-                        tmu = vir_MOV_dest(c, tmua, offset);
+                        if (const_offset != 0) {
+                                tmu = vir_ADD_dest(c, tmua, base_offset,
+                                                   vir_uniform_ui(c, const_offset));
+                        } else {
+                                tmu = vir_MOV_dest(c, tmua, base_offset);
+                        }
                 }
-        }
-        if (config != ~0) {
-                tmu->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
-                                                     config);
-        }
+                if (config != ~0) {
+                        tmu->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
+                                                             config);
+                }
-        if (vir_in_nonuniform_control_flow(c))
-                vir_set_cond(tmu, V3D_QPU_COND_IFA);
+                if (vir_in_nonuniform_control_flow(c))
+                        vir_set_cond(tmu, V3D_QPU_COND_IFA);
-        vir_emit_thrsw(c);
+                vir_emit_thrsw(c);
-        /* Read the result, or wait for the TMU op to complete. */
-        for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
-                ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
+                /* Read the result, or wait for the TMU op to complete. */
+                for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
+                        ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
-        if (nir_intrinsic_dest_components(instr) == 0)
-                vir_TMUWT(c);
+                if (nir_intrinsic_dest_components(instr) == 0)
+                        vir_TMUWT(c);
+        } while (is_store && writemask != 0);
 }
 
 static struct qreg *
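As a hypothetical worked example of the new do/while (not taken from the patch or
a trace): a store with writemask 0xb, i.e. components 0, 1 and 3, is now emitted
as two TMU sequences, one for components 0..1 at base_const_offset + 0 and one
for component 3 at base_const_offset + 12. Since num_components is derived from
tmu_writes - 1 in each pass, the first sequence selects
GENERAL_TMU_LOOKUP_TYPE_VEC2 and the second GENERAL_TMU_LOOKUP_TYPE_32BIT_UI.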