diff options
Diffstat (limited to 'src/mesa')
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 64 |
1 file changed, 57 insertions, 7 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 17eb82ed56f..0ff197f3bde 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -3366,6 +3366,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
                            nir->info.num_ubos - 1);
       }
 
+      /* Number of 32-bit slots in the type */
+      unsigned type_slots = MAX2(1, type_sz(dest.type) / 4);
+
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       if (const_offset == NULL) {
          fs_reg base_offset = retype(get_nir_src(instr->src[1]),
@@ -3373,19 +3376,66 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
          for (int i = 0; i < instr->num_components; i++)
             VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
-                                       base_offset, i * 4);
+                                       base_offset, i * type_sz(dest.type));
       } else {
+         /* Even if we are loading doubles, a pull constant load will load
+          * a 32-bit vec4, so should only reserve vgrf space for that. If we
+          * need to load a full dvec4 we will have to emit 2 loads. This is
+          * similar to demote_pull_constants(), except that in that case we
+          * see individual accesses to each component of the vector and then
+          * we let CSE deal with duplicate loads. Here we see a vector access
+          * and we have to split it if necessary.
+          */
         fs_reg packed_consts = vgrf(glsl_type::float_type);
         packed_consts.type = dest.type;
 
-         struct brw_reg const_offset_reg = brw_imm_ud(const_offset->u32[0] & ~15);
-         bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
-                  surf_index, const_offset_reg);
+         unsigned const_offset_aligned = const_offset->u32[0] & ~15;
+
+         /* A vec4 only contains half of a dvec4, if we need more than 2
+          * components of a dvec4 we will have to issue another load for
+          * components z and w.
+          */
+         int num_components;
+         if (type_slots == 1)
+            num_components = instr->num_components;
+         else
+            num_components = MIN2(2, instr->num_components);
+
+         /* The computation of num_components doesn't take into account
+          * misalignment, which should be okay according to std140 vector
+          * alignment rules.
+          */
+         assert(const_offset->u32[0] % 16 +
+                type_sz(dest.type) * num_components <= 16);
+
+         int remaining_components = instr->num_components;
+         while (remaining_components > 0) {
+            /* Read the vec4 from a 16-byte aligned offset */
+            struct brw_reg const_offset_reg = brw_imm_ud(const_offset_aligned);
+            bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                     retype(packed_consts, BRW_REGISTER_TYPE_F),
+                     surf_index, const_offset_reg);
+
+            const fs_reg consts = byte_offset(packed_consts, (const_offset->u32[0] % 16));
+            unsigned dest_offset = instr->num_components - remaining_components;
+
+            /* XXX: This doesn't update the sub-16B offset across iterations of
+             * the loop, which should work for std140 vector alignment rules.
+             */
+            assert(dest_offset == 0 || const_offset->u32[0] % 16 == 0);
 
-         const fs_reg consts = byte_offset(packed_consts, const_offset->u32[0] % 16);
+            for (int i = 0; i < num_components; i++)
+               bld.MOV(offset(dest, bld, i + dest_offset), component(consts, i));
 
-         for (unsigned i = 0; i < instr->num_components; i++)
-            bld.MOV(offset(dest, bld, i), component(consts, i));
+            /* If this is a large enough 64-bit load, we will need to emit
+             * another message
+             */
+            remaining_components -= num_components;
+            assert(remaining_components == 0 ||
+                   (remaining_components <= 2 && type_slots == 2));
+            num_components = remaining_components;
+            const_offset_aligned += 16;
+         }
       }
       break;
    }