From 42652ea51e643af9dfa0f1f7409b473b95d0a406 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 10 Sep 2018 08:19:48 -0700 Subject: v3d: Use combined input/output segments. The HW apparently has some issues (or at least a much more complicated VCM calculation) with non-combined segments, and the closed source driver also uses combined I/O. Until I get the last CTS failure resolved (which does look plausibly like some VPM stomping), let's use combined I/O too. --- src/broadcom/compiler/qpu_schedule.c | 22 +++++++++++++++++++++- src/broadcom/compiler/v3d_compiler.h | 5 +++++ src/broadcom/compiler/vir.c | 8 ++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) (limited to 'src/broadcom') diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index 4f3b621fd29..944059c9778 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -280,6 +280,11 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) const struct v3d_device_info *devinfo = state->devinfo; struct qinst *qinst = n->inst; struct v3d_qpu_instr *inst = &qinst->qpu; + /* If the input and output segments are shared, then all VPM reads to + * a location need to happen before all writes. We handle this by + * serializing all VPM operations for now. + */ + bool separate_vpm_segment = false; if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) @@ -321,6 +326,14 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) add_write_dep(state, &state->last_vpm, n); break; + case V3D_QPU_A_LDVPMV_IN: + case V3D_QPU_A_LDVPMD_IN: + case V3D_QPU_A_LDVPMG_IN: + case V3D_QPU_A_LDVPMP: + if (!separate_vpm_segment) + add_write_dep(state, &state->last_vpm, n); + break; + case V3D_QPU_A_VPMWT: add_read_dep(state, state->last_vpm, n); break; @@ -414,9 +427,16 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) if (inst->sig.ldtlb | inst->sig.ldtlbu) add_read_dep(state, state->last_tlb, n); - if (inst->sig.ldvpm) + if (inst->sig.ldvpm) { add_write_dep(state, &state->last_vpm_read, n); + /* At least for now, we're doing shared I/O segments, so queue + * all writes after all reads. + */ + if (!separate_vpm_segment) + add_write_dep(state, &state->last_vpm, n); + } + /* inst->sig.ldunif or sideband uniform read */ if (qinst->uniform != ~0) add_write_dep(state, &state->last_unif, n); diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index 070e6a3aa59..1e899393441 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -649,6 +649,11 @@ struct v3d_vs_prog_data { /* Total number of components written, for the shader state record. */ uint32_t vpm_output_size; + /* Set if there should be separate VPM segments for input and output. + * If unset, vpm_input_size will be 0. + */ + bool separate_segments; + /* Value to be programmed in VCM_CACHE_SIZE. */ uint8_t vcm_cache_size; }; diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index a52205b1a2a..518aace991a 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -789,6 +789,14 @@ uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler, prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8; prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8; + /* Set us up for shared input/output segments. This is apparently + * necessary for our VCM setup to avoid varying corruption. + */ + prog_data->separate_segments = false; + prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size, + prog_data->vpm_input_size); + prog_data->vpm_input_size = 0; + /* Compute VCM cache size. We set up our program to take up less than * half of the VPM, so that any set of bin and render programs won't * run out of space. We need space for at least one input segment, -- cgit v1.2.3