diff options
author | Eric Anholt <[email protected]> | 2017-02-24 12:57:03 -0800 |
---|---|---|
committer | Eric Anholt <[email protected]> | 2017-02-24 17:01:29 -0800 |
commit | 292c24ddac5acc35676424f05291c101fcd47b3e (patch) | |
tree | 1cc326dc2c1dd5c8abd664dae0b4e1fcfa4bf373 /src/gallium/drivers/vc4/vc4_program.c | |
parent | f06915d7b71eb955cc0db4b5555f5c6474926a01 (diff) |
vc4: Lazily emit our FS/VS input loads.
This reduces register pressure in both types of shaders by reordering the
input loads from the var->data.driver_location order to whatever order
they appear first in the NIR shader. These instructions aren't
reorderable at our QIR scheduling level because the FS takes two in
lockstep to do an interpolation, and the VS takes multiple read
instructions in a row to get a whole vec4-level attribute read.
shader-db impact:
total instructions in shared programs: 76666 -> 76590 (-0.10%)
instructions in affected programs: 42945 -> 42869 (-0.18%)
total max temps in shared programs: 9395 -> 9208 (-1.99%)
max temps in affected programs: 2951 -> 2764 (-6.34%)
Some programs get their max temps hurt, depending on the order in which the
load_input intrinsics appear, because we end up being unable to
copy-propagate an older VPM read into its only use.
Diffstat (limited to 'src/gallium/drivers/vc4/vc4_program.c')
-rw-r--r-- | src/gallium/drivers/vc4/vc4_program.c | 149 |
1 file changed, 80 insertions, 69 deletions
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 05e596e733b..21753439cf6 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -733,11 +733,14 @@ emit_vertex_input(struct vc4_compile *c, int attr) { enum pipe_format format = c->vs_key->attr_formats[attr]; uint32_t attr_size = util_format_get_blocksize(format); + uint32_t vpm_attr = c->next_vpm_input++; - c->vattr_sizes[attr] = align(attr_size, 4); + c->vpm_input_order[vpm_attr] = attr; + + c->vattr_sizes[vpm_attr] = align(attr_size, 4); for (int i = 0; i < align(attr_size, 4) / 4; i++) { c->inputs[attr * 4 + i] = - qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i)); + qir_MOV(c, qir_reg(QFILE_VPM, vpm_attr * 4 + i)); c->num_inputs++; } } @@ -1466,6 +1469,7 @@ emit_stub_vpm_read(struct vc4_compile *c) if (c->num_inputs) return; + c->next_vpm_input++; c->vattr_sizes[0] = 4; (void)qir_MOV(c, qir_reg(QFILE_VPM, 0)); c->num_inputs++; @@ -1552,64 +1556,6 @@ vc4_optimize_nir(struct nir_shader *s) } while (progress); } -static int -driver_location_compare(const void *in_a, const void *in_b) -{ - const nir_variable *const *a = in_a; - const nir_variable *const *b = in_b; - - return (*a)->data.driver_location - (*b)->data.driver_location; -} - -static void -ntq_setup_inputs(struct vc4_compile *c) -{ - unsigned num_entries = 0; - nir_foreach_variable(var, &c->s->inputs) - num_entries++; - - nir_variable *vars[num_entries]; - - unsigned i = 0; - nir_foreach_variable(var, &c->s->inputs) - vars[i++] = var; - - /* Sort the variables so that we emit the input setup in - * driver_location order. This is required for VPM reads, whose data - * is fetched into the VPM in driver_location (TGSI register index) - * order. 
- */ - qsort(&vars, num_entries, sizeof(*vars), driver_location_compare); - - for (unsigned i = 0; i < num_entries; i++) { - nir_variable *var = vars[i]; - unsigned array_len = MAX2(glsl_get_length(var->type), 1); - unsigned loc = var->data.driver_location; - - assert(array_len == 1); - (void)array_len; - resize_qreg_array(c, &c->inputs, &c->inputs_array_size, - (loc + 1) * 4); - - if (c->stage == QSTAGE_FRAG) { - if (var->data.location == VARYING_SLOT_POS) { - emit_fragcoord_input(c, loc); - } else if (var->data.location == VARYING_SLOT_PNTC || - (var->data.location >= VARYING_SLOT_VAR0 && - (c->fs_key->point_sprite_mask & - (1 << (var->data.location - - VARYING_SLOT_VAR0))))) { - c->inputs[loc * 4 + 0] = c->point_x; - c->inputs[loc * 4 + 1] = c->point_y; - } else { - emit_fragment_input(c, loc, var->data.location); - } - } else { - emit_vertex_input(c, loc); - } - } -} - static void ntq_setup_outputs(struct vc4_compile *c) { @@ -1740,10 +1686,73 @@ ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr) return; } - uint32_t offset = nir_intrinsic_base(instr) + const_offset->u32[0]; + /* Size our inputs array as far as this input. Input arrays are + * small, and we don't have a shader_info field that tells us up front + * what the maximum driver_location is. + */ + uint32_t loc = nir_intrinsic_base(instr) + const_offset->u32[0]; + if ((loc + 1) * 4 > c->inputs_array_size) { + resize_qreg_array(c, &c->inputs, &c->inputs_array_size, + (loc + 1) * 4); + } + + /* If we've already loaded this input, just return it. This would + * happen for VPM loads, where we load an entire vertex attribute at + * once, or possibly also in the FS if we haven't CSEed away repeated + * loads. 
+ */ int comp = nir_intrinsic_component(instr); + if (c->inputs[loc * 4 + comp].file != QFILE_NULL) { + ntq_store_dest(c, &instr->dest, 0, + qir_MOV(c, c->inputs[loc * 4 + comp])); + return; + } + + /* In the FS, we always have to fully drain our FS FIFO before + * terminating the shader. For the VS we only have to drain whatever + * VPM setup we configure, but vc4_qpu_emit.c configures it for the + * entire vertex attribute space. Because of this, we emit our lazy + * varying/VPM loads at the last top level basic block. + */ + struct qblock *saved_cur_block = c->cur_block; + c->cur_block = c->last_top_block; + + /* Look up the NIR variable for this input, so we can see how big the + * input is, or what sort of interpolation is necessary. + */ + nir_variable *var = NULL; + nir_foreach_variable(search_var, &c->s->inputs) { + unsigned search_len = MAX2(glsl_get_length(search_var->type), 1); + unsigned search_loc = search_var->data.driver_location; + + if (loc >= search_loc && loc < search_loc + search_len) { + var = search_var; + break; + } + } + assert(var); + + if (c->stage == QSTAGE_FRAG) { + if (var->data.location == VARYING_SLOT_POS) { + emit_fragcoord_input(c, loc); + } else if (var->data.location == VARYING_SLOT_PNTC || + (var->data.location >= VARYING_SLOT_VAR0 && + (c->fs_key->point_sprite_mask & + (1 << (var->data.location - + VARYING_SLOT_VAR0))))) { + c->inputs[loc * 4 + 0] = c->point_x; + c->inputs[loc * 4 + 1] = c->point_y; + } else { + emit_fragment_input(c, loc, var->data.location); + } + } else { + emit_vertex_input(c, loc); + } + + c->cur_block = saved_cur_block; + ntq_store_dest(c, &instr->dest, 0, - qir_MOV(c, c->inputs[offset * 4 + comp])); + qir_MOV(c, c->inputs[loc * 4 + comp])); } static void @@ -2161,7 +2170,6 @@ nir_to_qir(struct vc4_compile *c) if (c->stage == QSTAGE_FRAG && c->s->info->fs.uses_discard) c->discard = qir_MOV(c, qir_uniform_ui(c, 0)); - ntq_setup_inputs(c); ntq_setup_outputs(c); ntq_setup_uniforms(c); ntq_setup_registers(c, 
&c->s->registers); @@ -2587,14 +2595,17 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, } else { shader->num_inputs = c->num_inputs; - shader->vattr_offsets[0] = 0; - for (int i = 0; i < 8; i++) { - shader->vattr_offsets[i + 1] = - shader->vattr_offsets[i] + c->vattr_sizes[i]; + uint8_t next_vattr_offset = 0; + for (int i = 0; i < c->next_vpm_input; i++) { + if (!c->vattr_sizes[i]) + continue; - if (c->vattr_sizes[i]) - shader->vattrs_live |= (1 << i); + uint32_t nir_attr = c->vpm_input_order[i]; + shader->vattr_offsets[nir_attr] = next_vattr_offset; + next_vattr_offset += c->vattr_sizes[i]; + shader->vattrs_live |= (1 << nir_attr); } + shader->vattr_total_size = next_vattr_offset; } shader->failed = c->failed; |