 src/broadcom/common/v3d_limits.h         |   5
 src/broadcom/compiler/nir_to_vir.c       | 175
 src/broadcom/compiler/v3d_compiler.h     |  33
 src/broadcom/compiler/v3d_nir_lower_io.c | 357
 src/broadcom/compiler/vir.c              | 119
 src/gallium/drivers/v3d/v3d_context.h    |  30
 src/gallium/drivers/v3d/v3d_program.c    | 169
 7 files changed, 778 insertions, 110 deletions
diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h
index d65edddab74..e02582035f1 100644
--- a/src/broadcom/common/v3d_limits.h
+++ b/src/broadcom/common/v3d_limits.h
@@ -30,8 +30,11 @@
 #define V3D_CHANNELS 16

 #define V3D_MAX_FS_INPUTS 64
+#define V3D_MAX_GS_INPUTS 64
 #define V3D_MAX_VS_INPUTS 64
-#define V3D_MAX_ANY_STAGE_INPUTS MAX2(V3D_MAX_VS_INPUTS, V3D_MAX_FS_INPUTS)
+#define V3D_MAX_ANY_STAGE_INPUTS MAX3(V3D_MAX_VS_INPUTS, \
+                                      V3D_MAX_GS_INPUTS, \
+                                      V3D_MAX_FS_INPUTS)

 /* Not specifically a hardware limit, just coordination between compiler and
  * driver.
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 6b566c68e07..d7bef12fef9 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -1367,11 +1367,20 @@ emit_frag_end(struct v3d_compile *c)
         vir_emit_tlb_color_write(c, rt);
 }

+static inline void
+vir_VPM_WRITE_indirect(struct v3d_compile *c,
+                       struct qreg val,
+                       struct qreg vpm_index)
+{
+        assert(c->devinfo->ver >= 40);
+        vir_STVPMV(c, vpm_index, val);
+}
+
 static void
 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
 {
         if (c->devinfo->ver >= 40) {
-                vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val);
+                vir_VPM_WRITE_indirect(c, val, vir_uniform_ui(c, vpm_index));
         } else {
                 /* XXX: v3d33_vir_vpm_write_setup(c); */
                 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
@@ -1387,6 +1396,15 @@ emit_vert_end(struct v3d_compile *c)
                 vir_VPMWT(c);
 }

+static void
+emit_geom_end(struct v3d_compile *c)
+{
+        /* GFXH-1684: VPM writes need to be complete by the end of the shader.
+         */
+        if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
+                vir_VPMWT(c);
+}
+
 void
 v3d_optimize_nir(struct nir_shader *s)
 {
@@ -1474,7 +1492,7 @@ ntq_emit_vpm_read(struct v3d_compile *c,
 }

 static void
-ntq_setup_vpm_inputs(struct v3d_compile *c)
+ntq_setup_vs_inputs(struct v3d_compile *c)
 {
         /* Figure out how many components of each vertex attribute the shader
          * uses.  Each variable should have been split to individual
@@ -1565,24 +1583,69 @@ program_reads_point_coord(struct v3d_compile *c)
 }

 static void
-ntq_setup_fs_inputs(struct v3d_compile *c)
+get_sorted_input_variables(struct v3d_compile *c,
+                           unsigned *num_entries,
+                           nir_variable ***vars)
 {
-        unsigned num_entries = 0;
+        *num_entries = 0;
         nir_foreach_variable(var, &c->s->inputs)
-                num_entries++;
+                (*num_entries)++;

-        nir_variable *vars[num_entries];
+        *vars = ralloc_array(c, nir_variable *, *num_entries);

         unsigned i = 0;
         nir_foreach_variable(var, &c->s->inputs)
-                vars[i++] = var;
+                (*vars)[i++] = var;

         /* Sort the variables so that we emit the input setup in
          * driver_location order.  This is required for VPM reads, whose data
          * is fetched into the VPM in driver_location (TGSI register index)
          * order.
          */
-        qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
+        qsort(*vars, *num_entries, sizeof(**vars), driver_location_compare);
+}
+
+static void
+ntq_setup_gs_inputs(struct v3d_compile *c)
+{
+        nir_variable **vars;
+        unsigned num_entries;
+        get_sorted_input_variables(c, &num_entries, &vars);
+
+        for (unsigned i = 0; i < num_entries; i++) {
+                nir_variable *var = vars[i];
+
+                /* All GS inputs are arrays with as many entries as vertices
+                 * in the input primitive, but here we only care about the
+                 * per-vertex input type.
+                 */
+                const struct glsl_type *type = glsl_without_array(var->type);
+                unsigned array_len = MAX2(glsl_get_length(type), 1);
+                unsigned loc = var->data.driver_location;
+
+                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
+                                  (loc + array_len) * 4);
+
+                for (unsigned j = 0; j < array_len; j++) {
+                        unsigned num_elements = glsl_get_vector_elements(type);
+                        for (unsigned k = 0; k < num_elements; k++) {
+                                unsigned chan = var->data.location_frac + k;
+                                unsigned input_idx = c->num_inputs++;
+                                struct v3d_varying_slot slot =
+                                        v3d_slot_from_slot_and_component(var->data.location + j, chan);
+                                c->input_slots[input_idx] = slot;
+                        }
+                }
+        }
+}
+
+
+static void
+ntq_setup_fs_inputs(struct v3d_compile *c)
+{
+        nir_variable **vars;
+        unsigned num_entries;
+        get_sorted_input_variables(c, &num_entries, &vars);

         for (unsigned i = 0; i < num_entries; i++) {
                 nir_variable *var = vars[i];
@@ -1949,6 +2012,40 @@ ntq_emit_color_write(struct v3d_compile *c,
 }

 static void
+emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+        assert(instr->num_components == 1);
+
+        uint32_t base_offset = nir_intrinsic_base(instr);
+        struct qreg src_offset = ntq_get_src(c, instr->src[1], 0);
+        struct qreg offset =
+                vir_ADD(c, vir_uniform_ui(c, base_offset), src_offset);
+
+        vir_VPM_WRITE_indirect(c, ntq_get_src(c, instr->src[0], 0), offset);
+}
+
+static void
+ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+        /* XXX perf: Use stvpmv with uniform non-constant offsets and
+         * stvpmd with non-uniform offsets and enable
+         * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
+         */
+        if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+                ntq_emit_color_write(c, instr);
+        } else if (c->s->info.stage == MESA_SHADER_GEOMETRY)  {
+                emit_store_output_gs(c, instr);
+        } else {
+                assert(c->s->info.stage == MESA_SHADER_VERTEX);
+                assert(instr->num_components == 1);
+
+                vir_VPM_WRITE(c,
+                              ntq_get_src(c, instr->src[0], 0),
+                              nir_intrinsic_base(instr));
+        }
+}
+
+static void
 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 {
         switch (instr->intrinsic) {
@@ -2090,19 +2187,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 break;

         case nir_intrinsic_store_output:
-                /* XXX perf: Use stvpmv with uniform non-constant offsets and
-                 * stvpmd with non-uniform offsets and enable
-                 * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
-                 */
-                if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
-                        ntq_emit_color_write(c, instr);
-                } else {
-                        assert(instr->num_components == 1);
-
-                        vir_VPM_WRITE(c,
-                                      ntq_get_src(c, instr->src[0], 0),
-                                      nir_intrinsic_base(instr));
-                }
+                ntq_emit_store_output(c, instr);
                 break;

         case nir_intrinsic_image_deref_size:
@@ -2214,6 +2299,34 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
                 break;

+        case nir_intrinsic_load_per_vertex_input: {
+                /* col: vertex index, row = varying index */
+                struct qreg col = ntq_get_src(c, instr->src[0], 0);
+                uint32_t row_idx = nir_intrinsic_base(instr) * 4 +
+                                   nir_intrinsic_component(instr);
+                for (int i = 0; i < instr->num_components; i++) {
+                        struct qreg row = vir_uniform_ui(c, row_idx++);
+                        ntq_store_dest(c, &instr->dest, i,
+                                       vir_LDVPMG_IN(c, row, col));
+                }
+                break;
+        }
+
+        case nir_intrinsic_emit_vertex:
+        case nir_intrinsic_end_primitive:
+                unreachable("Should have been lowered in v3d_nir_lower_io");
+                break;
+
+        case nir_intrinsic_load_primitive_id: {
+                /* gl_PrimitiveIdIn is written by the GBG in the first word of
+                 * VPM output header. According to docs, we should read this
+                 * using ldvpm(v,d)_in (See Table 71).
+                 */
+                ntq_store_dest(c, &instr->dest, 0,
+                               vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
+                break;
+        }
+
         default:
                 fprintf(stderr, "Unknown intrinsic: ");
                 nir_print_instr(&instr->instr, stderr);
@@ -2636,10 +2749,21 @@ nir_to_vir(struct v3d_compile *c)
                 c->spill_size += V3D_CHANNELS * c->s->scratch_size;
         }

-        if (c->s->info.stage == MESA_SHADER_FRAGMENT)
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
+                ntq_setup_vs_inputs(c);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                ntq_setup_gs_inputs(c);
+                break;
+        case MESA_SHADER_FRAGMENT:
                 ntq_setup_fs_inputs(c);
-        else
-                ntq_setup_vpm_inputs(c);
+                break;
+        case MESA_SHADER_COMPUTE:
+                break;
+        default:
+                unreachable("unsupported shader stage");
+        }

         ntq_setup_outputs(c);
@@ -2785,6 +2909,9 @@ v3d_nir_to_vir(struct v3d_compile *c)
         case MESA_SHADER_FRAGMENT:
                 emit_frag_end(c);
                 break;
+        case MESA_SHADER_GEOMETRY:
+                emit_geom_end(c);
+                break;
         case MESA_SHADER_VERTEX:
                 emit_vert_end(c);
                 break;
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 29057bdf4df..9b08e4a270e 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -329,6 +329,7 @@ struct v3d_key {
                 bool clamp_r:1;
         } tex[V3D_MAX_TEXTURE_SAMPLERS];
         uint8_t ucp_enables;
+        bool is_last_geometry_stage;
 };

 struct v3d_fs_key {
@@ -371,6 +372,16 @@ struct v3d_fs_key {
         struct pipe_rt_blend_state blend;
 };

+struct v3d_gs_key {
+        struct v3d_key base;
+
+        struct v3d_varying_slot used_outputs[V3D_MAX_FS_INPUTS];
+        uint8_t num_used_outputs;
+
+        bool is_coord;
+        bool per_vertex_point_size;
+};
+
 struct v3d_vs_key {
         struct v3d_key base;
@@ -552,6 +563,7 @@ struct v3d_compile {
         int local_invocation_index_bits;

         uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];
+        uint8_t gs_input_sizes[V3D_MAX_GS_INPUTS];
         uint32_t vpm_output_size;

         /* Size in bytes of registers that have been spilled. This is how much
@@ -586,6 +598,7 @@ struct v3d_compile {
         struct pipe_shader_state *shader_state;
         struct v3d_key *key;
         struct v3d_fs_key *fs_key;
+        struct v3d_gs_key *gs_key;
         struct v3d_vs_key *vs_key;

         /* Live ranges of temps. */
@@ -687,6 +700,26 @@ struct v3d_vs_prog_data {
         uint8_t vcm_cache_size;
 };

+struct v3d_gs_prog_data {
+        struct v3d_prog_data base;
+
+        /* Whether the program reads gl_PrimitiveIDIn */
+        bool uses_pid;
+
+        /* Number of components read from each input varying. */
+        uint8_t input_sizes[V3D_MAX_GS_INPUTS / 4];
+
+        /* Number of inputs */
+        uint8_t num_inputs;
+        struct v3d_varying_slot input_slots[V3D_MAX_GS_INPUTS];
+
+        /* Total number of components written, for the shader state record. */
+        uint32_t vpm_output_size;
+
+        /* Output primitive type */
+        uint8_t out_prim_type;
+};
+
 struct v3d_fs_prog_data {
         struct v3d_prog_data base;
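The load_per_vertex_input handling added to nir_to_vir.c above addresses the VPM with a (row, column) pair: the column is the input vertex index and the row is derived from the NIR base and component, with four rows reserved per vec4 slot. A minimal standalone sketch of that row computation follows; the helper name and example values are illustrative only and not part of the patch:

    #include <stdio.h>

    /* Mirrors the row calculation used for nir_intrinsic_load_per_vertex_input
     * in ntq_emit_intrinsic(): each scalarized input component occupies one
     * VPM row, four rows per driver_location. */
    static unsigned
    gs_input_vpm_row(unsigned base, unsigned component)
    {
            return base * 4 + component;
    }

    int main(void)
    {
            /* e.g. component 2 of the input at driver_location 3 */
            printf("row = %u\n", gs_input_vpm_row(3, 2)); /* prints 14 */
            return 0;
    }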
diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c
index 3145c560a14..3c9279a2fee 100644
--- a/src/broadcom/compiler/v3d_nir_lower_io.c
+++ b/src/broadcom/compiler/v3d_nir_lower_io.c
@@ -45,22 +45,46 @@ struct v3d_nir_lower_io_state {
         int psiz_vpm_offset;
         int varyings_vpm_offset;

+        /* Geometry shader state */
+        struct {
+                /* VPM offset for the current vertex data output */
+                nir_variable *output_offset_var;
+                /* VPM offset for the current vertex header */
+                nir_variable *header_offset_var;
+                /* VPM header for the current vertex */
+                nir_variable *header_var;
+
+                /* Size of the complete VPM output header */
+                uint32_t output_header_size;
+                /* Size of the output data for a single vertex */
+                uint32_t output_vertex_data_size;
+        } gs;
+
         BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_ANY_STAGE_INPUTS)];

         nir_ssa_def *pos[4];
 };

 static void
-v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *chan)
+v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
+                            struct v3d_nir_lower_io_state *state);
+
+static void
+v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset,
+                     nir_ssa_def *chan)
 {
         nir_intrinsic_instr *intr =
-                nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
+                nir_intrinsic_instr_create(b->shader,
+                                           nir_intrinsic_store_output);
         nir_ssa_dest_init(&intr->instr, &intr->dest, 1,
                           intr->dest.ssa.bit_size, NULL);
         intr->num_components = 1;

         intr->src[0] = nir_src_for_ssa(chan);
-        intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
+        if (offset)
+                intr->src[1] = nir_src_for_ssa(offset);
+        else
+                intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));

         nir_intrinsic_set_base(intr, base);
         nir_intrinsic_set_write_mask(intr, 0x1);
@@ -91,8 +115,23 @@ v3d_varying_slot_vpm_offset(struct v3d_compile *c, nir_variable *var, int chan)
 {
         int component = var->data.location_frac + chan;

-        for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
-                struct v3d_varying_slot slot = c->vs_key->used_outputs[i];
+        uint32_t num_used_outputs = 0;
+        struct v3d_varying_slot *used_outputs = NULL;
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
+                num_used_outputs = c->vs_key->num_used_outputs;
+                used_outputs = c->vs_key->used_outputs;
+                break;
+        case MESA_SHADER_GEOMETRY:
+                num_used_outputs = c->gs_key->num_used_outputs;
+                used_outputs = c->gs_key->used_outputs;
+                break;
+        default:
+                unreachable("Unsupported shader stage");
+        }
+
+        for (int i = 0; i < num_used_outputs; i++) {
+                struct v3d_varying_slot slot = used_outputs[i];

                 if (v3d_slot_get_slot(slot) == var->data.location &&
                     v3d_slot_get_component(slot) == component) {
@@ -105,6 +144,9 @@ v3d_varying_slot_vpm_offset(struct v3d_compile *c, nir_variable *var, int chan)

 /* Lowers a store_output(gallium driver location) to a series of store_outputs
  * with a driver_location equal to the offset in the VPM.
+ *
+ * For geometry shaders we need to emit multiple vertices so the VPM offsets
+ * need to be computed in the shader code based on the current vertex index.
  */
 static void
 v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
@@ -113,6 +155,13 @@
 {
         b->cursor = nir_before_instr(&intr->instr);

+        /* If this is a geometry shader we need to emit our outputs
+         * to the current vertex offset in the VPM.
+         */
+        nir_ssa_def *offset_reg =
+                c->s->info.stage == MESA_SHADER_GEOMETRY ?
+                nir_load_var(b, state->gs.output_offset_var) : NULL;
+
         int start_comp = nir_intrinsic_component(intr);
         nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0],
                                            intr->num_components);
@@ -141,7 +190,7 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
         /* Just psiz to the position in the FF header right now. */
         if (var->data.location == VARYING_SLOT_PSIZ &&
             state->psiz_vpm_offset != -1) {
-                v3d_nir_store_output(b, state->psiz_vpm_offset, src);
+                v3d_nir_store_output(b, state->psiz_vpm_offset, offset_reg, src);
         }

         /* Scalarize outputs if it hasn't happened already, since we want to
@@ -161,12 +210,73 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
                 BITSET_SET(state->varyings_stored, vpm_offset);

                 v3d_nir_store_output(b, state->varyings_vpm_offset + vpm_offset,
-                                     nir_channel(b, src, i));
+                                     offset_reg, nir_channel(b, src, i));
         }

         nir_instr_remove(&intr->instr);
 }

+static inline void
+reset_gs_header(nir_builder *b, struct v3d_nir_lower_io_state *state)
+{
+        const uint8_t NEW_PRIMITIVE_OFFSET = 0;
+        const uint8_t VERTEX_DATA_LENGTH_OFFSET = 8;
+
+        uint32_t vertex_data_size = state->gs.output_vertex_data_size;
+        assert((vertex_data_size & 0xffffff00) == 0);
+
+        uint32_t header;
+        header  = 1 << NEW_PRIMITIVE_OFFSET;
+        header |= vertex_data_size << VERTEX_DATA_LENGTH_OFFSET;
+        nir_store_var(b, state->gs.header_var, nir_imm_int(b, header), 0x1);
+}
+
+static void
+v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b,
+                          nir_intrinsic_instr *instr,
+                          struct v3d_nir_lower_io_state *state)
+{
+        b->cursor = nir_before_instr(&instr->instr);
+
+        nir_ssa_def *header = nir_load_var(b, state->gs.header_var);
+        nir_ssa_def *header_offset = nir_load_var(b, state->gs.header_offset_var);
+        nir_ssa_def *output_offset = nir_load_var(b, state->gs.output_offset_var);
+
+        /* Emit fixed function outputs */
+        v3d_nir_emit_ff_vpm_outputs(c, b, state);
+
+        /* Emit vertex header */
+        v3d_nir_store_output(b, 0, header_offset, header);
+
+        /* Update VPM offset for next vertex output data and header */
+        output_offset =
+                nir_iadd(b, output_offset,
+                         nir_imm_int(b, state->gs.output_vertex_data_size));
+
+        header_offset = nir_iadd(b, header_offset, nir_imm_int(b, 1));
+
+        /* Reset the New Primitive bit */
+        header = nir_iand(b, header, nir_imm_int(b, 0xfffffffe));
+
+        nir_store_var(b, state->gs.output_offset_var, output_offset, 0x1);
+        nir_store_var(b, state->gs.header_offset_var, header_offset, 0x1);
+        nir_store_var(b, state->gs.header_var, header, 0x1);
+
+        nir_instr_remove(&instr->instr);
+}
+
+static void
+v3d_nir_lower_end_primitive(struct v3d_compile *c, nir_builder *b,
+                            nir_intrinsic_instr *instr,
+                            struct v3d_nir_lower_io_state *state)
+{
+        assert(state->gs.header_var);
+        b->cursor = nir_before_instr(&instr->instr);
+        reset_gs_header(b, state);
+
+        nir_instr_remove(&instr->instr);
+}
+
 static void
 v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
                        struct nir_instr *instr,
@@ -182,8 +292,18 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
                 break;

         case nir_intrinsic_store_output:
-                if (c->s->info.stage == MESA_SHADER_VERTEX)
+                if (c->s->info.stage == MESA_SHADER_VERTEX ||
+                    c->s->info.stage == MESA_SHADER_GEOMETRY) {
                         v3d_nir_lower_vpm_output(c, b, intr, state);
+                }
+                break;
+
+        case nir_intrinsic_emit_vertex:
+                v3d_nir_lower_emit_vertex(c, b, intr, state);
+                break;
+
+        case nir_intrinsic_end_primitive:
+                v3d_nir_lower_end_primitive(c, b, intr, state);
                 break;

         default:
@@ -226,12 +346,64 @@ v3d_nir_lower_io_update_output_var_base(struct v3d_compile *c,
 }

 static void
-v3d_nir_setup_vpm_layout(struct v3d_compile *c,
-                         struct v3d_nir_lower_io_state *state)
+v3d_nir_setup_vpm_layout_vs(struct v3d_compile *c,
+                            struct v3d_nir_lower_io_state *state)
 {
         uint32_t vpm_offset = 0;

-        if (c->vs_key->is_coord) {
+        state->pos_vpm_offset = -1;
+        state->vp_vpm_offset = -1;
+        state->zs_vpm_offset = -1;
+        state->rcp_wc_vpm_offset = -1;
+        state->psiz_vpm_offset = -1;
+
+        bool needs_ff_outputs = c->vs_key->base.is_last_geometry_stage;
+        if (needs_ff_outputs) {
+                if (c->vs_key->is_coord) {
+                        state->pos_vpm_offset = vpm_offset;
+                        vpm_offset += 4;
+                }
+
+                state->vp_vpm_offset = vpm_offset;
+                vpm_offset += 2;
+
+                if (!c->vs_key->is_coord) {
+                        state->zs_vpm_offset = vpm_offset++;
+                        state->rcp_wc_vpm_offset = vpm_offset++;
+                }
+
+                if (c->vs_key->per_vertex_point_size)
+                        state->psiz_vpm_offset = vpm_offset++;
+        }
+
+        state->varyings_vpm_offset = vpm_offset;
+
+        c->vpm_output_size = vpm_offset + c->vs_key->num_used_outputs;
+}
+
+static void
+v3d_nir_setup_vpm_layout_gs(struct v3d_compile *c,
+                            struct v3d_nir_lower_io_state *state)
+{
+        /* 1 header slot for number of output vertices */
+        uint32_t vpm_offset = 1;
+
+        /* 1 header slot per output vertex */
+        const uint32_t num_vertices = c->s->info.gs.vertices_out;
+        vpm_offset += num_vertices;
+
+        state->gs.output_header_size = vpm_offset;
+
+        /* Vertex data: here we only compute offsets into a generic vertex data
+         * elements. When it is time to actually write a particular vertex to
+         * the VPM, we will add the offset for that vertex into the VPM output
+         * to these offsets.
+         *
+         * If geometry shaders are present, they are always the last shader
+         * stage before rasterization, so we always emit fixed function outputs.
+         */
+        vpm_offset = 0;
+        if (c->gs_key->is_coord) {
                 state->pos_vpm_offset = vpm_offset;
                 vpm_offset += 4;
         } else {
@@ -241,7 +413,7 @@ v3d_nir_setup_vpm_layout(struct v3d_compile *c,
         state->vp_vpm_offset = vpm_offset;
         vpm_offset += 2;

-        if (!c->vs_key->is_coord) {
+        if (!c->gs_key->is_coord) {
                 state->zs_vpm_offset = vpm_offset++;
                 state->rcp_wc_vpm_offset = vpm_offset++;
         } else {
@@ -249,20 +421,34 @@ v3d_nir_setup_vpm_layout(struct v3d_compile *c,
                 state->rcp_wc_vpm_offset = -1;
         }

-        if (c->vs_key->per_vertex_point_size)
+        /* Mesa enables OES_geometry_shader_point_size automatically with
+         * OES_geometry_shader so we always need to handle point size
+         * writes if present.
+         */
+        if (c->gs_key->per_vertex_point_size)
                 state->psiz_vpm_offset = vpm_offset++;
-        else
-                state->psiz_vpm_offset = -1;

         state->varyings_vpm_offset = vpm_offset;

-        c->vpm_output_size = vpm_offset + c->vs_key->num_used_outputs;
+        state->gs.output_vertex_data_size =
+                state->varyings_vpm_offset + c->gs_key->num_used_outputs;
+
+        c->vpm_output_size =
+                state->gs.output_header_size +
+                state->gs.output_vertex_data_size * num_vertices;
 }

 static void
 v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
                             struct v3d_nir_lower_io_state *state)
 {
+        /* If this is a geometry shader we need to emit our fixed function
+         * outputs to the current vertex offset in the VPM.
+         */
+        nir_ssa_def *offset_reg =
+                c->s->info.stage == MESA_SHADER_GEOMETRY ?
+                nir_load_var(b, state->gs.output_offset_var) : NULL;
+
         for (int i = 0; i < 4; i++) {
                 if (!state->pos[i])
                         state->pos[i] = nir_ssa_undef(b, 1, 32);
@@ -273,23 +459,25 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
         if (state->pos_vpm_offset != -1) {
                 for (int i = 0; i < 4; i++) {
                         v3d_nir_store_output(b, state->pos_vpm_offset + i,
-                                             state->pos[i]);
+                                             offset_reg, state->pos[i]);
                 }
         }

-        for (int i = 0; i < 2; i++) {
-                nir_ssa_def *pos;
-                nir_ssa_def *scale;
-                pos = state->pos[i];
-                if (i == 0)
-                        scale = nir_load_viewport_x_scale(b);
-                else
-                        scale = nir_load_viewport_y_scale(b);
-                pos = nir_fmul(b, pos, scale);
-                pos = nir_fmul(b, pos, rcp_wc);
-                pos = nir_f2i32(b, nir_fround_even(b, pos));
-                v3d_nir_store_output(b, state->vp_vpm_offset + i,
-                                     pos);
+        if (state->vp_vpm_offset != -1) {
+                for (int i = 0; i < 2; i++) {
+                        nir_ssa_def *pos;
+                        nir_ssa_def *scale;
+                        pos = state->pos[i];
+                        if (i == 0)
+                                scale = nir_load_viewport_x_scale(b);
+                        else
+                                scale = nir_load_viewport_y_scale(b);
+                        pos = nir_fmul(b, pos, scale);
+                        pos = nir_fmul(b, pos, rcp_wc);
+                        pos = nir_f2i32(b, nir_fround_even(b, pos));
+                        v3d_nir_store_output(b, state->vp_vpm_offset + i,
+                                             offset_reg, pos);
+                }
         }

         if (state->zs_vpm_offset != -1) {
@@ -297,38 +485,118 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
                 z = nir_fmul(b, z, nir_load_viewport_z_scale(b));
                 z = nir_fmul(b, z, rcp_wc);
                 z = nir_fadd(b, z, nir_load_viewport_z_offset(b));
-                v3d_nir_store_output(b, state->zs_vpm_offset, z);
+                v3d_nir_store_output(b, state->zs_vpm_offset, offset_reg, z);
         }

-        if (state->rcp_wc_vpm_offset != -1)
-                v3d_nir_store_output(b, state->rcp_wc_vpm_offset, rcp_wc);
+        if (state->rcp_wc_vpm_offset != -1) {
+                v3d_nir_store_output(b, state->rcp_wc_vpm_offset,
+                                     offset_reg, rcp_wc);
+        }

-        /* Store 0 to varyings requested by the FS but not stored in the VS.
-         * This should be undefined behavior, but glsl-routing seems to rely
-         * on it.
+        /* Store 0 to varyings requested by the FS but not stored by the
+         * previous stage. This should be undefined behavior, but
+         * glsl-routing seems to rely on it.
          */
-        for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
+        uint32_t num_used_outputs;
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
+                num_used_outputs = c->vs_key->num_used_outputs;
+                break;
+        case MESA_SHADER_GEOMETRY:
+                num_used_outputs = c->gs_key->num_used_outputs;
+                break;
+        default:
+                unreachable("Unsupported shader stage");
+        }
+
+        for (int i = 0; i < num_used_outputs; i++) {
                 if (!BITSET_TEST(state->varyings_stored, i)) {
                         v3d_nir_store_output(b, state->varyings_vpm_offset + i,
-                                             nir_imm_int(b, 0));
+                                             offset_reg, nir_imm_int(b, 0));
                 }
         }
 }

+static void
+emit_gs_prolog(struct v3d_compile *c, nir_builder *b,
+               nir_function_impl *impl,
+               struct v3d_nir_lower_io_state *state)
+{
+        nir_block *first = nir_start_block(impl);
+        b->cursor = nir_before_block(first);
+
+        const struct glsl_type *uint_type = glsl_uint_type();
+
+        assert(!state->gs.output_offset_var);
+        state->gs.output_offset_var =
+                nir_local_variable_create(impl, uint_type, "output_offset");
+        nir_store_var(b, state->gs.output_offset_var,
+                      nir_imm_int(b, state->gs.output_header_size), 0x1);
+
+        assert(!state->gs.header_offset_var);
+        state->gs.header_offset_var =
+                nir_local_variable_create(impl, uint_type, "header_offset");
+        nir_store_var(b, state->gs.header_offset_var, nir_imm_int(b, 1), 0x1);
+
+        assert(!state->gs.header_var);
+        state->gs.header_var =
+                nir_local_variable_create(impl, uint_type, "header");
+        reset_gs_header(b, state);
+}
+
+static void
+emit_gs_vpm_output_header_prolog(struct v3d_compile *c, nir_builder *b,
+                                 struct v3d_nir_lower_io_state *state)
+{
+        const uint8_t VERTEX_COUNT_OFFSET = 16;
+
+        /* Our GS header has 1 generic header slot (at VPM offset 0) and then
+         * one slot per output vertex after it. This means we don't need to
+         * have a variable just to keep track of the number of vertices we
+         * emitted and instead we can just compute it here from the header
+         * offset variable by removing the one generic header slot that always
+         * goes at the begining of out header.
+         */
+        nir_ssa_def *header_offset =
+                nir_load_var(b, state->gs.header_offset_var);
+        nir_ssa_def *vertex_count =
+                nir_isub(b, header_offset, nir_imm_int(b, 1));
+        nir_ssa_def *header =
+                nir_ior(b, nir_imm_int(b, state->gs.output_header_size),
+                        nir_ishl(b, vertex_count,
+                                 nir_imm_int(b, VERTEX_COUNT_OFFSET)));
+
+        v3d_nir_store_output(b, 0, NULL, header);
+}
+
 void
 v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
 {
         struct v3d_nir_lower_io_state state = { 0 };

         /* Set up the layout of the VPM outputs. */
-        if (s->info.stage == MESA_SHADER_VERTEX)
-                v3d_nir_setup_vpm_layout(c, &state);
+        switch (s->info.stage) {
+        case MESA_SHADER_VERTEX:
+                v3d_nir_setup_vpm_layout_vs(c, &state);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                v3d_nir_setup_vpm_layout_gs(c, &state);
+                break;
+        case MESA_SHADER_FRAGMENT:
+        case MESA_SHADER_COMPUTE:
+                break;
+        default:
+                unreachable("Unsupported shader stage");
+        }

         nir_foreach_function(function, s) {
                 if (function->impl) {
                         nir_builder b;
                         nir_builder_init(&b, function->impl);
+
+                        if (c->s->info.stage == MESA_SHADER_GEOMETRY)
+                                emit_gs_prolog(c, &b, function->impl, &state);
+
                         nir_foreach_block(block, function->impl) {
                                 nir_foreach_instr_safe(instr, block)
                                         v3d_nir_lower_io_instr(c, &b, instr,
@@ -337,8 +605,11 @@ v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
                         nir_block *last = nir_impl_last_block(function->impl);
                         b.cursor = nir_after_block(last);
-                        if (s->info.stage == MESA_SHADER_VERTEX)
+                        if (s->info.stage == MESA_SHADER_VERTEX) {
                                 v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
+                        } else if (s->info.stage == MESA_SHADER_GEOMETRY) {
+                                emit_gs_vpm_output_header_prolog(c, &b, &state);
+                        }

                         nir_metadata_preserve(function->impl,
                                               nir_metadata_block_index |
@@ -346,6 +617,8 @@ v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
                 }
         }

-        if (s->info.stage == MESA_SHADER_VERTEX)
+        if (s->info.stage == MESA_SHADER_VERTEX ||
+            s->info.stage == MESA_SHADER_GEOMETRY) {
                 v3d_nir_lower_io_update_output_var_base(c, &state);
+        }
 }
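The lowering pass above encodes two kinds of header words: the per-vertex header written at each EmitVertex() (new-primitive bit at bit 0, vertex data length at bit 8, see reset_gs_header()) and the global output header written in the prolog (header size in the low bits, emitted vertex count at bit 16, see emit_gs_vpm_output_header_prolog()). Below is a minimal sketch of that packing and of the sizing done in v3d_nir_setup_vpm_layout_gs(); the concrete vertex and varying counts are invented for illustration and are not taken from the patch:

    #include <stdint.h>
    #include <stdio.h>

    /* Bit positions taken from reset_gs_header() and
     * emit_gs_vpm_output_header_prolog() above. */
    #define NEW_PRIMITIVE_OFFSET      0
    #define VERTEX_DATA_LENGTH_OFFSET 8
    #define VERTEX_COUNT_OFFSET       16

    int main(void)
    {
            /* Hypothetical render-mode GS: 4 output vertices, 6 used output
             * components, no per-vertex point size. The fixed-function block
             * is then vp (2) + z (1) + 1/wc (1) = 4 slots per vertex. */
            const uint32_t vertices_out = 4;
            const uint32_t num_used_outputs = 6;
            const uint32_t ff_slots = 4;

            const uint32_t output_header_size = 1 + vertices_out;
            const uint32_t vertex_data_size = ff_slots + num_used_outputs;
            const uint32_t vpm_output_size =
                    output_header_size + vertex_data_size * vertices_out;

            /* Per-vertex header, as reset at shader start and at EndPrimitive() */
            uint32_t vertex_header = (1u << NEW_PRIMITIVE_OFFSET) |
                                     (vertex_data_size << VERTEX_DATA_LENGTH_OFFSET);

            /* Global header written by the output prolog, assuming all four
             * vertices were emitted. */
            uint32_t global_header = output_header_size |
                                     (vertices_out << VERTEX_COUNT_OFFSET);

            printf("vpm_output_size = %u words\n", vpm_output_size); /* 45 */
            printf("vertex header   = 0x%08x\n", vertex_header);     /* 0x00000a01 */
            printf("global header   = 0x%08x\n", global_header);     /* 0x00040005 */
            return 0;
    }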
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 340cda903e9..dc966bc80ca 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -23,6 +23,7 @@

 #include "broadcom/common/v3d_device_info.h"
 #include "v3d_compiler.h"
+#include "util/u_prim.h"

 int
 vir_get_nsrc(struct qinst *inst)
@@ -661,6 +662,28 @@ v3d_vs_set_prog_data(struct v3d_compile *c,
 }

 static void
+v3d_gs_set_prog_data(struct v3d_compile *c,
+                     struct v3d_gs_prog_data *prog_data)
+{
+        prog_data->num_inputs = c->num_inputs;
+        memcpy(prog_data->input_slots, c->input_slots,
+               c->num_inputs * sizeof(*c->input_slots));
+
+        /* gl_PrimitiveIdIn is written by the GBG into the first word of the
+         * VPM output header automatically and the shader will overwrite
+         * it after reading it if necessary, so it doesn't add to the VPM
+         * size requirements.
+         */
+        prog_data->uses_pid = (c->s->info.system_values_read &
+                               (1ull << SYSTEM_VALUE_PRIMITIVE_ID));
+
+        /* Output segment size is in sectors (8 rows of 32 bits per channel) */
+        prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
+
+        prog_data->out_prim_type = c->s->info.gs.output_primitive;
+}
+
+static void
 v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
                             struct v3d_fs_prog_data *prog_data)
 {
@@ -714,13 +737,21 @@ v3d_set_prog_data(struct v3d_compile *c,

         v3d_set_prog_data_uniforms(c, prog_data);

-        if (c->s->info.stage == MESA_SHADER_COMPUTE) {
-                v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
-        } else if (c->s->info.stage == MESA_SHADER_VERTEX) {
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
                 v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
-        } else {
-                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
+                break;
+        case MESA_SHADER_FRAGMENT:
                 v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
+                break;
+        case MESA_SHADER_COMPUTE:
+                v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
+                break;
+        default:
+                unreachable("unsupported shader stage");
         }
 }
@@ -772,6 +803,37 @@ v3d_nir_lower_vs_early(struct v3d_compile *c)
 }

 static void
+v3d_nir_lower_gs_early(struct v3d_compile *c)
+{
+        /* Split our I/O vars and dead code eliminate the unused
+         * components.
+         */
+        NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
+                   nir_var_shader_in | nir_var_shader_out);
+        uint64_t used_outputs[4] = {0};
+        for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
+                int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
+                int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
+                used_outputs[comp] |= 1ull << slot;
+        }
+        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
+                   &c->s->outputs, used_outputs, NULL); /* demotes to globals */
+        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
+        v3d_optimize_nir(c->s);
+        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in);
+
+        /* This must go before nir_lower_io */
+        if (c->gs_key->per_vertex_point_size)
+                NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
+
+        NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+                   type_size_vec4,
+                   (nir_lower_io_options)0);
+        /* clean up nir_lower_io's deref_var remains */
+        NIR_PASS_V(c->s, nir_opt_dce);
+}
+
+static void
 v3d_fixup_fs_output_types(struct v3d_compile *c)
 {
         nir_foreach_variable(var, &c->s->outputs) {
@@ -819,6 +881,18 @@ v3d_nir_lower_fs_early(struct v3d_compile *c)
 }

 static void
+v3d_nir_lower_gs_late(struct v3d_compile *c)
+{
+        if (c->key->ucp_enables) {
+                NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables,
+                           false, NULL);
+        }
+
+        /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
+        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
+}
+
+static void
 v3d_nir_lower_vs_late(struct v3d_compile *c)
 {
         if (c->vs_key->clamp_color)
@@ -907,6 +981,10 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                 c->vs_key = (struct v3d_vs_key *)key;
                 prog_data = rzalloc_size(NULL, sizeof(struct v3d_vs_prog_data));
                 break;
+        case MESA_SHADER_GEOMETRY:
+                c->gs_key = (struct v3d_gs_key *)key;
+                prog_data = rzalloc_size(NULL, sizeof(struct v3d_gs_prog_data));
+                break;
         case MESA_SHADER_FRAGMENT:
                 c->fs_key = (struct v3d_fs_key *)key;
                 prog_data = rzalloc_size(NULL, sizeof(struct v3d_fs_prog_data));
@@ -919,20 +997,35 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                 unreachable("unsupported shader stage");
         }

-        if (c->s->info.stage == MESA_SHADER_VERTEX) {
+
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
                 v3d_nir_lower_vs_early(c);
-        } else if (c->s->info.stage != MESA_SHADER_COMPUTE) {
-                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                v3d_nir_lower_gs_early(c);
+                break;
+        case MESA_SHADER_FRAGMENT:
                 v3d_nir_lower_fs_early(c);
+                break;
+        default:
+                break;
         }

         v3d_lower_nir(c);

-        if (c->s->info.stage == MESA_SHADER_VERTEX) {
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
                 v3d_nir_lower_vs_late(c);
-        } else if (c->s->info.stage != MESA_SHADER_COMPUTE) {
-                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                v3d_nir_lower_gs_late(c);
+                break;
+        case MESA_SHADER_FRAGMENT:
                 v3d_nir_lower_fs_late(c);
+                break;
+        default:
+                break;
         }

         NIR_PASS_V(c->s, v3d_nir_lower_io, c);
@@ -1134,7 +1227,9 @@ const char *
 vir_get_stage_name(struct v3d_compile *c)
 {
         if (c->vs_key && c->vs_key->is_coord)
-                return "MESA_SHADER_COORD";
+                return "MESA_SHADER_VERTEX_BIN";
+        else if (c->gs_key && c->gs_key->is_coord)
+                return "MESA_SHADER_GEOMETRY_BIN";
         else
                 return gl_shader_stage_name(c->s->info.stage);
 }
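v3d_gs_set_prog_data() above converts the VPM output size from 32-bit words into sectors of 8 rows with align(x, 8) / 8. A small sketch of that conversion; the helper name and the example size are illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    /* Equivalent of the align(words, 8) / 8 step in v3d_gs_set_prog_data():
     * the shader state record takes the output segment size in sectors,
     * each sector covering 8 rows of 32 bits per channel. */
    static uint32_t
    vpm_words_to_sectors(uint32_t words)
    {
            return (words + 7) / 8;
    }

    int main(void)
    {
            /* e.g. the 45-word layout from the earlier sketch rounds up to
             * 6 sectors (48 words). */
            printf("%u sectors\n", vpm_words_to_sectors(45));
            return 0;
    }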
diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h
index ecedbaf9efb..bf85b42eb9d 100644
--- a/src/gallium/drivers/v3d/v3d_context.h
+++ b/src/gallium/drivers/v3d/v3d_context.h
@@ -59,7 +59,8 @@ void v3d_job_add_bo(struct v3d_job *job, struct v3d_bo *bo);
 #define VC5_DIRTY_ZSA           (1ull << 2)
 #define VC5_DIRTY_COMPTEX       (1ull << 3)
 #define VC5_DIRTY_VERTTEX       (1ull << 4)
-#define VC5_DIRTY_FRAGTEX       (1ull << 5)
+#define VC5_DIRTY_GEOMTEX       (1ull << 5)
+#define VC5_DIRTY_FRAGTEX       (1ull << 6)

 #define VC5_DIRTY_SHADER_IMAGE  (1ull << 9)
 #define VC5_DIRTY_BLEND_COLOR   (1ull << 10)
@@ -77,18 +78,22 @@ void v3d_job_add_bo(struct v3d_job *job, struct v3d_bo *bo);
 #define VC5_DIRTY_CLIP          (1ull << 22)
 #define VC5_DIRTY_UNCOMPILED_CS (1ull << 23)
 #define VC5_DIRTY_UNCOMPILED_VS (1ull << 24)
-#define VC5_DIRTY_UNCOMPILED_FS (1ull << 25)
+#define VC5_DIRTY_UNCOMPILED_GS (1ull << 25)
+#define VC5_DIRTY_UNCOMPILED_FS (1ull << 26)

 #define VC5_DIRTY_COMPILED_CS   (1ull << 29)
 #define VC5_DIRTY_COMPILED_VS   (1ull << 30)
-#define VC5_DIRTY_COMPILED_FS   (1ull << 31)
-
-#define VC5_DIRTY_FS_INPUTS     (1ull << 35)
-#define VC5_DIRTY_STREAMOUT     (1ull << 36)
-#define VC5_DIRTY_OQ            (1ull << 37)
-#define VC5_DIRTY_CENTROID_FLAGS (1ull << 38)
-#define VC5_DIRTY_NOPERSPECTIVE_FLAGS (1ull << 39)
-#define VC5_DIRTY_SSBO          (1ull << 40)
+#define VC5_DIRTY_COMPILED_GS_BIN (1ULL << 31)
+#define VC5_DIRTY_COMPILED_GS   (1ULL << 32)
+#define VC5_DIRTY_COMPILED_FS   (1ull << 33)
+
+#define VC5_DIRTY_FS_INPUTS     (1ull << 38)
+#define VC5_DIRTY_GS_INPUTS     (1ull << 39)
+#define VC5_DIRTY_STREAMOUT     (1ull << 40)
+#define VC5_DIRTY_OQ            (1ull << 41)
+#define VC5_DIRTY_CENTROID_FLAGS (1ull << 42)
+#define VC5_DIRTY_NOPERSPECTIVE_FLAGS (1ull << 43)
+#define VC5_DIRTY_SSBO          (1ull << 44)

 #define VC5_MAX_FS_INPUTS 64
@@ -206,6 +211,7 @@ struct v3d_compiled_shader {
         union {
                 struct v3d_prog_data *base;
                 struct v3d_vs_prog_data *vs;
+                struct v3d_gs_prog_data *gs;
                 struct v3d_fs_prog_data *fs;
                 struct v3d_compute_prog_data *compute;
         } prog_data;
@@ -219,8 +225,8 @@ struct v3d_compiled_shader {
 };

 struct v3d_program_stateobj {
-        struct v3d_uncompiled_shader *bind_vs, *bind_fs, *bind_compute;
-        struct v3d_compiled_shader *cs, *vs, *fs, *compute;
+        struct v3d_uncompiled_shader *bind_vs, *bind_gs, *bind_fs, *bind_compute;
+        struct v3d_compiled_shader *cs, *vs, *gs_bin, *gs, *fs, *compute;

         struct hash_table *cache[MESA_SHADER_STAGES];
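The renumbering above pushes several dirty bits past bit 31, which only works because v3d->dirty is a 64-bit mask and the flags are built with 1ull/1ULL shifts. A minimal sketch of why the suffix matters; the local names here are illustrative, not the driver's:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* 1ull << 32 is a well-defined 64-bit bit; a plain 1 << 32 would
             * overflow a 32-bit int (undefined behaviour), which is why the
             * VC5_DIRTY_* flags above bit 31 all use the ull/ULL suffix. */
            const uint64_t DIRTY_COMPILED_GS = 1ull << 32;
            const uint64_t DIRTY_SSBO        = 1ull << 44;

            uint64_t dirty = 0;
            dirty |= DIRTY_COMPILED_GS;
            printf("gs dirty: %d, ssbo dirty: %d\n",
                   (dirty & DIRTY_COMPILED_GS) != 0,
                   (dirty & DIRTY_SSBO) != 0);
            return 0;
    }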
diff --git a/src/gallium/drivers/v3d/v3d_program.c b/src/gallium/drivers/v3d/v3d_program.c
index 0f7762f119d..7bbdbe409e2 100644
--- a/src/gallium/drivers/v3d/v3d_program.c
+++ b/src/gallium/drivers/v3d/v3d_program.c
@@ -205,8 +205,12 @@ v3d_shader_precompile(struct v3d_context *v3d,
                 v3d_setup_shared_precompile_key(so, &key.base);
                 v3d_get_compiled_shader(v3d, &key.base, sizeof(key));
         } else {
+                /* FIXME: add geometry shaders */
+
                 struct v3d_vs_key key = {
                         .base.shader_state = so,
+                        /* Emit fixed function outputs */
+                        .base.is_last_geometry_stage = true,
                 };

                 v3d_setup_shared_precompile_key(so, &key.base);
@@ -271,8 +275,10 @@ v3d_uncompiled_shader_create(struct pipe_context *pctx,
         }

         nir_variable_mode lower_mode = nir_var_all & ~nir_var_uniform;
-        if (s->info.stage == MESA_SHADER_VERTEX)
+        if (s->info.stage == MESA_SHADER_VERTEX ||
+            s->info.stage == MESA_SHADER_GEOMETRY) {
                 lower_mode &= ~(nir_var_shader_in | nir_var_shader_out);
+        }
         NIR_PASS_V(s, nir_lower_io, lower_mode,
                    type_size,
                    (nir_lower_io_options)0);
@@ -609,55 +615,153 @@ v3d_update_compiled_fs(struct v3d_context *v3d, uint8_t prim_mode)
 }

 static void
-v3d_update_compiled_vs(struct v3d_context *v3d, uint8_t prim_mode)
+v3d_update_compiled_gs(struct v3d_context *v3d, uint8_t prim_mode)
 {
-        struct v3d_vs_key local_key;
-        struct v3d_vs_key *key = &local_key;
+        struct v3d_gs_key local_key;
+        struct v3d_gs_key *key = &local_key;

-        if (!(v3d->dirty & (VC5_DIRTY_PRIM_MODE |
+        if (!(v3d->dirty & (VC5_DIRTY_GEOMTEX |
                             VC5_DIRTY_RASTERIZER |
-                            VC5_DIRTY_VERTTEX |
-                            VC5_DIRTY_VTXSTATE |
-                            VC5_DIRTY_UNCOMPILED_VS |
+                            VC5_DIRTY_UNCOMPILED_GS |
+                            VC5_DIRTY_PRIM_MODE |
                             VC5_DIRTY_FS_INPUTS))) {
                 return;
         }

+        if (!v3d->prog.bind_gs) {
+                v3d->prog.gs = NULL;
+                v3d->prog.gs_bin = NULL;
+                return;
+        }
+
         memset(key, 0, sizeof(*key));
-        v3d_setup_shared_key(v3d, &key->base, &v3d->tex[PIPE_SHADER_VERTEX]);
-        key->base.shader_state = v3d->prog.bind_vs;
+        v3d_setup_shared_key(v3d, &key->base, &v3d->tex[PIPE_SHADER_GEOMETRY]);
+        key->base.shader_state = v3d->prog.bind_gs;
         key->base.ucp_enables = v3d->rasterizer->base.clip_plane_enable;
+        key->base.is_last_geometry_stage = true;
         key->num_used_outputs = v3d->prog.fs->prog_data.fs->num_inputs;
         STATIC_ASSERT(sizeof(key->used_outputs) ==
                       sizeof(v3d->prog.fs->prog_data.fs->input_slots));
         memcpy(key->used_outputs, v3d->prog.fs->prog_data.fs->input_slots,
                sizeof(key->used_outputs));

-        key->clamp_color = v3d->rasterizer->base.clamp_vertex_color;

         key->per_vertex_point_size =
                 (prim_mode == PIPE_PRIM_POINTS &&
                  v3d->rasterizer->base.point_size_per_vertex);

-        struct v3d_compiled_shader *vs =
+        struct v3d_compiled_shader *gs =
                 v3d_get_compiled_shader(v3d, &key->base, sizeof(*key));
-        if (vs != v3d->prog.vs) {
-                v3d->prog.vs = vs;
-                v3d->dirty |= VC5_DIRTY_COMPILED_VS;
+        if (gs != v3d->prog.gs) {
+                v3d->prog.gs = gs;
+                v3d->dirty |= VC5_DIRTY_COMPILED_GS;
         }

         key->is_coord = true;
-        /* Coord shaders only output varyings used by transform feedback. */
+
+        /* The last bin-mode shader in the geometry pipeline only outputs
+         * varyings used by transform feedback.
+         */
         struct v3d_uncompiled_shader *shader_state = key->base.shader_state;
         memcpy(key->used_outputs, shader_state->tf_outputs,
                sizeof(*key->used_outputs) * shader_state->num_tf_outputs);
         if (shader_state->num_tf_outputs < key->num_used_outputs) {
+                uint32_t size = sizeof(*key->used_outputs) *
+                                (key->num_used_outputs -
+                                 shader_state->num_tf_outputs);
                 memset(&key->used_outputs[shader_state->num_tf_outputs],
-                       0,
-                       sizeof(*key->used_outputs) * (key->num_used_outputs -
-                                                     shader_state->num_tf_outputs));
+                       0, size);
         }
         key->num_used_outputs = shader_state->num_tf_outputs;

+        struct v3d_compiled_shader *old_gs = v3d->prog.gs;
+        struct v3d_compiled_shader *gs_bin =
+                v3d_get_compiled_shader(v3d, &key->base, sizeof(*key));
+        if (gs_bin != old_gs) {
+                v3d->prog.gs_bin = gs_bin;
+                v3d->dirty |= VC5_DIRTY_COMPILED_GS_BIN;
+        }
+
+        if (old_gs && memcmp(v3d->prog.gs->prog_data.gs->input_slots,
+                             old_gs->prog_data.gs->input_slots,
+                             sizeof(v3d->prog.gs->prog_data.gs->input_slots))) {
+                v3d->dirty |= VC5_DIRTY_GS_INPUTS;
+        }
+}
+
+static void
+v3d_update_compiled_vs(struct v3d_context *v3d, uint8_t prim_mode)
+{
+        struct v3d_vs_key local_key;
+        struct v3d_vs_key *key = &local_key;
+
+        if (!(v3d->dirty & (VC5_DIRTY_VERTTEX |
+                            VC5_DIRTY_VTXSTATE |
+                            VC5_DIRTY_UNCOMPILED_VS |
+                            (v3d->prog.bind_gs ? 0 : VC5_DIRTY_RASTERIZER) |
+                            (v3d->prog.bind_gs ? 0 : VC5_DIRTY_PRIM_MODE) |
+                            (v3d->prog.bind_gs ? VC5_DIRTY_GS_INPUTS :
+                                                 VC5_DIRTY_FS_INPUTS)))) {
+                return;
+        }
+
+        memset(key, 0, sizeof(*key));
+        v3d_setup_shared_key(v3d, &key->base, &v3d->tex[PIPE_SHADER_VERTEX]);
+        key->base.shader_state = v3d->prog.bind_vs;
+        key->base.ucp_enables = v3d->rasterizer->base.clip_plane_enable;
+        key->base.is_last_geometry_stage = !v3d->prog.bind_gs;
+
+        if (!v3d->prog.bind_gs) {
+                key->num_used_outputs = v3d->prog.fs->prog_data.fs->num_inputs;
+                STATIC_ASSERT(sizeof(key->used_outputs) ==
+                              sizeof(v3d->prog.fs->prog_data.fs->input_slots));
+                memcpy(key->used_outputs, v3d->prog.fs->prog_data.fs->input_slots,
+                       sizeof(key->used_outputs));
+        } else {
+                key->num_used_outputs = v3d->prog.gs->prog_data.gs->num_inputs;
+                STATIC_ASSERT(sizeof(key->used_outputs) ==
+                              sizeof(v3d->prog.gs->prog_data.gs->input_slots));
+                memcpy(key->used_outputs, v3d->prog.gs->prog_data.gs->input_slots,
+                       sizeof(key->used_outputs));
+        }
+
+        key->clamp_color = v3d->rasterizer->base.clamp_vertex_color;
+
+        key->per_vertex_point_size =
+                (prim_mode == PIPE_PRIM_POINTS &&
+                 v3d->rasterizer->base.point_size_per_vertex);
+
+        struct v3d_compiled_shader *vs =
+                v3d_get_compiled_shader(v3d, &key->base, sizeof(*key));
+        if (vs != v3d->prog.vs) {
+                v3d->prog.vs = vs;
+                v3d->dirty |= VC5_DIRTY_COMPILED_VS;
+        }
+
+        key->is_coord = true;
+
+        /* Coord shaders only output varyings used by transform feedback,
+         * unless they are linked to other shaders in the geometry side
+         * of the pipeline, since in that case any of the output varyings
+         * could be required in later geometry stages to compute
+         * gl_Position or TF outputs.
+         */
+        if (!v3d->prog.bind_gs) {
+                struct v3d_uncompiled_shader *shader_state =
+                        key->base.shader_state;
+                memcpy(key->used_outputs, shader_state->tf_outputs,
+                       sizeof(*key->used_outputs) *
+                       shader_state->num_tf_outputs);
+                if (shader_state->num_tf_outputs < key->num_used_outputs) {
+                        uint32_t tail_bytes =
+                                sizeof(*key->used_outputs) *
+                                (key->num_used_outputs -
+                                 shader_state->num_tf_outputs);
+                        memset(&key->used_outputs[shader_state->num_tf_outputs],
+                               0, tail_bytes);
+                }
+                key->num_used_outputs = shader_state->num_tf_outputs;
+        }
+
         struct v3d_compiled_shader *cs =
                 v3d_get_compiled_shader(v3d, &key->base, sizeof(*key));
         if (cs != v3d->prog.cs) {
@@ -670,6 +774,7 @@ void
 v3d_update_compiled_shaders(struct v3d_context *v3d, uint8_t prim_mode)
 {
         v3d_update_compiled_fs(v3d, prim_mode);
+        v3d_update_compiled_gs(v3d, prim_mode);
         v3d_update_compiled_vs(v3d, prim_mode);
 }
@@ -703,6 +808,12 @@ fs_cache_hash(const void *key)
 }

 static uint32_t
+gs_cache_hash(const void *key)
+{
+        return _mesa_hash_data(key, sizeof(struct v3d_gs_key));
+}
+
+static uint32_t
 vs_cache_hash(const void *key)
 {
         return _mesa_hash_data(key, sizeof(struct v3d_vs_key));
@@ -721,6 +832,12 @@ fs_cache_compare(const void *key1, const void *key2)
 }

 static bool
+gs_cache_compare(const void *key1, const void *key2)
+{
+        return memcmp(key1, key2, sizeof(struct v3d_gs_key)) == 0;
+}
+
+static bool
 vs_cache_compare(const void *key1, const void *key2)
 {
         return memcmp(key1, key2, sizeof(struct v3d_vs_key)) == 0;
@@ -772,6 +889,14 @@ v3d_fp_state_bind(struct pipe_context *pctx, void *hwcso)
 }

 static void
+v3d_gp_state_bind(struct pipe_context *pctx, void *hwcso)
+{
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->prog.bind_gs = hwcso;
+        v3d->dirty |= VC5_DIRTY_UNCOMPILED_GS;
+}
+
+static void
 v3d_vp_state_bind(struct pipe_context *pctx, void *hwcso)
 {
         struct v3d_context *v3d = v3d_context(pctx);
@@ -804,10 +929,14 @@ v3d_program_init(struct pipe_context *pctx)
         pctx->create_vs_state = v3d_shader_state_create;
         pctx->delete_vs_state = v3d_shader_state_delete;

+        pctx->create_gs_state = v3d_shader_state_create;
+        pctx->delete_gs_state = v3d_shader_state_delete;
+
         pctx->create_fs_state = v3d_shader_state_create;
         pctx->delete_fs_state = v3d_shader_state_delete;

         pctx->bind_fs_state = v3d_fp_state_bind;
+        pctx->bind_gs_state = v3d_gp_state_bind;
         pctx->bind_vs_state = v3d_vp_state_bind;

         if (v3d->screen->has_csd) {
@@ -818,6 +947,8 @@ v3d_program_init(struct pipe_context *pctx)

         v3d->prog.cache[MESA_SHADER_VERTEX] =
                 _mesa_hash_table_create(pctx, vs_cache_hash, vs_cache_compare);
+        v3d->prog.cache[MESA_SHADER_GEOMETRY] =
+                _mesa_hash_table_create(pctx, gs_cache_hash, gs_cache_compare);
         v3d->prog.cache[MESA_SHADER_FRAGMENT] =
                 _mesa_hash_table_create(pctx, fs_cache_hash, fs_cache_compare);
         v3d->prog.cache[MESA_SHADER_COMPUTE] =
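The new GS shader cache reuses the driver's byte-wise key handling: gs_cache_hash() and gs_cache_compare() hash and memcmp the whole struct v3d_gs_key, which is why v3d_update_compiled_gs() clears the key with memset() before filling it in. A small illustration of that requirement with a toy key type (not the real v3d_gs_key layout):

    #include <stdio.h>
    #include <string.h>

    /* Toy key with the same pitfall as a real shader key: padding bytes
     * between members are not written by member assignments, so two keys
     * built without memset() could compare unequal byte-for-byte even when
     * every member matches. */
    struct toy_key {
            unsigned char is_coord;   /* padding typically follows */
            unsigned num_used_outputs;
    };

    int main(void)
    {
            struct toy_key a, b;

            memset(&a, 0, sizeof(a));
            memset(&b, 0, sizeof(b));
            a.is_coord = b.is_coord = 1;
            a.num_used_outputs = b.num_used_outputs = 6;

            printf("keys equal: %d\n", memcmp(&a, &b, sizeof(a)) == 0);
            return 0;
    }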