From 61359324c1ed511edc8b1d6b23ced1e43b08ec42 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 6 Mar 2017 18:04:12 -0800 Subject: Revert "vc4: Lazily emit our FS/VS input loads." This reverts commit 292c24ddac5acc35676424f05291c101fcd47b3e. It broke a lot of GLES2 deqp, and I see at least one problem that will require some serious rework to fix. --- src/gallium/drivers/vc4/vc4_context.h | 8 +- src/gallium/drivers/vc4/vc4_draw.c | 4 +- src/gallium/drivers/vc4/vc4_program.c | 149 ++++++++++++++++------------------ src/gallium/drivers/vc4/vc4_qir.h | 7 -- 4 files changed, 75 insertions(+), 93 deletions(-) (limited to 'src/gallium/drivers/vc4') diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index f346474abe0..6bd2424ec79 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -174,10 +174,10 @@ struct vc4_compiled_shader { uint8_t num_inputs; - /** Byte offsets for the start of the vertex attributes. */ - uint8_t vattr_offsets[8]; - /** Total size of the vertex inputs, in bytes. */ - uint8_t vattr_total_size; + /* Byte offsets for the start of the vertex attributes 0-7, and the + * total size as "attribute" 8. + */ + uint8_t vattr_offsets[9]; uint8_t vattrs_live; const struct vc4_fs_inputs *fs_inputs; diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c index 9f3765db1af..ebd080298a4 100644 --- a/src/gallium/drivers/vc4/vc4_draw.c +++ b/src/gallium/drivers/vc4/vc4_draw.c @@ -170,14 +170,14 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4, /* VC4_DIRTY_COMPILED_VS */ cl_u16(&shader_rec, 0); /* vs num uniforms */ cl_u8(&shader_rec, vc4->prog.vs->vattrs_live); - cl_u8(&shader_rec, vc4->prog.vs->vattr_total_size); + cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]); cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.vs->bo, 0); cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ /* VC4_DIRTY_COMPILED_CS */ cl_u16(&shader_rec, 0); /* cs num uniforms */ cl_u8(&shader_rec, vc4->prog.cs->vattrs_live); - cl_u8(&shader_rec, vc4->prog.cs->vattr_total_size); + cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]); cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.cs->bo, 0); cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 3d45723b62d..2c9447fa023 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -733,14 +733,11 @@ emit_vertex_input(struct vc4_compile *c, int attr) { enum pipe_format format = c->vs_key->attr_formats[attr]; uint32_t attr_size = util_format_get_blocksize(format); - uint32_t vpm_attr = c->next_vpm_input++; - c->vpm_input_order[vpm_attr] = attr; - - c->vattr_sizes[vpm_attr] = align(attr_size, 4); + c->vattr_sizes[attr] = align(attr_size, 4); for (int i = 0; i < align(attr_size, 4) / 4; i++) { c->inputs[attr * 4 + i] = - qir_MOV(c, qir_reg(QFILE_VPM, vpm_attr * 4 + i)); + qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i)); c->num_inputs++; } } @@ -1469,7 +1466,6 @@ emit_stub_vpm_read(struct vc4_compile *c) if (c->num_inputs) return; - c->next_vpm_input++; c->vattr_sizes[0] = 4; (void)qir_MOV(c, qir_reg(QFILE_VPM, 0)); c->num_inputs++; @@ -1556,6 +1552,64 @@ vc4_optimize_nir(struct nir_shader *s) } while (progress); } +static int +driver_location_compare(const void *in_a, const void *in_b) +{ + const nir_variable *const *a = in_a; + const nir_variable *const *b = in_b; + + return (*a)->data.driver_location - (*b)->data.driver_location; +} + +static void +ntq_setup_inputs(struct vc4_compile *c) +{ + unsigned num_entries = 0; + nir_foreach_variable(var, &c->s->inputs) + num_entries++; + + nir_variable *vars[num_entries]; + + unsigned i = 0; + nir_foreach_variable(var, &c->s->inputs) + vars[i++] = var; + + /* Sort the variables so that we emit the input setup in + * driver_location order. This is required for VPM reads, whose data + * is fetched into the VPM in driver_location (TGSI register index) + * order. + */ + qsort(&vars, num_entries, sizeof(*vars), driver_location_compare); + + for (unsigned i = 0; i < num_entries; i++) { + nir_variable *var = vars[i]; + unsigned array_len = MAX2(glsl_get_length(var->type), 1); + unsigned loc = var->data.driver_location; + + assert(array_len == 1); + (void)array_len; + resize_qreg_array(c, &c->inputs, &c->inputs_array_size, + (loc + 1) * 4); + + if (c->stage == QSTAGE_FRAG) { + if (var->data.location == VARYING_SLOT_POS) { + emit_fragcoord_input(c, loc); + } else if (var->data.location == VARYING_SLOT_PNTC || + (var->data.location >= VARYING_SLOT_VAR0 && + (c->fs_key->point_sprite_mask & + (1 << (var->data.location - + VARYING_SLOT_VAR0))))) { + c->inputs[loc * 4 + 0] = c->point_x; + c->inputs[loc * 4 + 1] = c->point_y; + } else { + emit_fragment_input(c, loc, var->data.location); + } + } else { + emit_vertex_input(c, loc); + } + } +} + static void ntq_setup_outputs(struct vc4_compile *c) { @@ -1686,73 +1740,10 @@ ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr) return; } - /* Size our inputs array as far as this input. Input arrays are - * small, and we don't have a shader_info field that tells us up front - * what the maximum driver_location is. - */ - uint32_t loc = nir_intrinsic_base(instr) + const_offset->u32[0]; - if ((loc + 1) * 4 > c->inputs_array_size) { - resize_qreg_array(c, &c->inputs, &c->inputs_array_size, - (loc + 1) * 4); - } - - /* If we've already loaded this input, just return it. This would - * happen for VPM loads, where we load an entire vertex attribute at - * once, or possibly also in the FS if we haven't CSEed away repeated - * loads. - */ + uint32_t offset = nir_intrinsic_base(instr) + const_offset->u32[0]; int comp = nir_intrinsic_component(instr); - if (c->inputs[loc * 4 + comp].file != QFILE_NULL) { - ntq_store_dest(c, &instr->dest, 0, - qir_MOV(c, c->inputs[loc * 4 + comp])); - return; - } - - /* In the FS, we always have to fully drain our FS FIFO before - * terminating the shader. For the VS we only have to drain whatever - * VPM setup we configure, but vc4_qpu_emit.c configures it for the - * entire vertex attribute space. Because of this, we emit our lazy - * varying/VPM loads at the last top level basic block. - */ - struct qblock *saved_cur_block = c->cur_block; - c->cur_block = c->last_top_block; - - /* Look up the NIR variable for this input, so we can see how big the - * input is, or what sort of interpolation is necessary. - */ - nir_variable *var = NULL; - nir_foreach_variable(search_var, &c->s->inputs) { - unsigned search_len = MAX2(glsl_get_length(search_var->type), 1); - unsigned search_loc = search_var->data.driver_location; - - if (loc >= search_loc && loc < search_loc + search_len) { - var = search_var; - break; - } - } - assert(var); - - if (c->stage == QSTAGE_FRAG) { - if (var->data.location == VARYING_SLOT_POS) { - emit_fragcoord_input(c, loc); - } else if (var->data.location == VARYING_SLOT_PNTC || - (var->data.location >= VARYING_SLOT_VAR0 && - (c->fs_key->point_sprite_mask & - (1 << (var->data.location - - VARYING_SLOT_VAR0))))) { - c->inputs[loc * 4 + 0] = c->point_x; - c->inputs[loc * 4 + 1] = c->point_y; - } else { - emit_fragment_input(c, loc, var->data.location); - } - } else { - emit_vertex_input(c, loc); - } - - c->cur_block = saved_cur_block; - ntq_store_dest(c, &instr->dest, 0, - qir_MOV(c, c->inputs[loc * 4 + comp])); + qir_MOV(c, c->inputs[offset * 4 + comp])); } static void @@ -2170,6 +2161,7 @@ nir_to_qir(struct vc4_compile *c) if (c->stage == QSTAGE_FRAG && c->s->info->fs.uses_discard) c->discard = qir_MOV(c, qir_uniform_ui(c, 0)); + ntq_setup_inputs(c); ntq_setup_outputs(c); ntq_setup_uniforms(c); ntq_setup_registers(c, &c->s->registers); @@ -2596,17 +2588,14 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, } else { shader->num_inputs = c->num_inputs; - uint8_t next_vattr_offset = 0; - for (int i = 0; i < c->next_vpm_input; i++) { - if (!c->vattr_sizes[i]) - continue; + shader->vattr_offsets[0] = 0; + for (int i = 0; i < 8; i++) { + shader->vattr_offsets[i + 1] = + shader->vattr_offsets[i] + c->vattr_sizes[i]; - uint32_t nir_attr = c->vpm_input_order[i]; - shader->vattr_offsets[nir_attr] = next_vattr_offset; - next_vattr_offset += c->vattr_sizes[i]; - shader->vattrs_live |= (1 << nir_attr); + if (c->vattr_sizes[i]) + shader->vattrs_live |= (1 << i); } - shader->vattr_total_size = next_vattr_offset; } shader->failed = c->failed; diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index fe86232aeb2..6469e51b051 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -461,13 +461,6 @@ struct vc4_compile { uint8_t vattr_sizes[8]; - /** - * Order in which the vattrs were loaded by the program, to arrange - * vattr_offsets[] in the program data appropriately. - */ - uint8_t vpm_input_order[8]; - uint8_t next_vpm_input; - /** * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads. * -- cgit v1.2.3