 src/broadcom/common/v3d_limits.h         |   5
 src/broadcom/compiler/nir_to_vir.c       | 175
 src/broadcom/compiler/v3d_compiler.h     |  33
 src/broadcom/compiler/v3d_nir_lower_io.c | 357
 src/broadcom/compiler/vir.c              | 119
 src/gallium/drivers/v3d/v3d_context.h    |  30
 src/gallium/drivers/v3d/v3d_program.c    | 169
 7 files changed, 778 insertions, 110 deletions
diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h
index d65edddab74..e02582035f1 100644
--- a/src/broadcom/common/v3d_limits.h
+++ b/src/broadcom/common/v3d_limits.h
@@ -30,8 +30,11 @@
 #define V3D_CHANNELS 16

 #define V3D_MAX_FS_INPUTS 64
+#define V3D_MAX_GS_INPUTS 64
 #define V3D_MAX_VS_INPUTS 64
-#define V3D_MAX_ANY_STAGE_INPUTS MAX2(V3D_MAX_VS_INPUTS, V3D_MAX_FS_INPUTS)
+#define V3D_MAX_ANY_STAGE_INPUTS MAX3(V3D_MAX_VS_INPUTS, \
+                                      V3D_MAX_GS_INPUTS, \
+                                      V3D_MAX_FS_INPUTS)

 /* Not specifically a hardware limit, just coordination between compiler and
  * driver.
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 6b566c68e07..d7bef12fef9 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -1367,11 +1367,20 @@ emit_frag_end(struct v3d_compile *c)
         vir_emit_tlb_color_write(c, rt);
 }

+static inline void
+vir_VPM_WRITE_indirect(struct v3d_compile *c,
+                       struct qreg val,
+                       struct qreg vpm_index)
+{
+        assert(c->devinfo->ver >= 40);
+        vir_STVPMV(c, vpm_index, val);
+}
+
 static void
 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
 {
         if (c->devinfo->ver >= 40) {
-                vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val);
+                vir_VPM_WRITE_indirect(c, val, vir_uniform_ui(c, vpm_index));
         } else {
                 /* XXX: v3d33_vir_vpm_write_setup(c); */
                 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
@@ -1387,6 +1396,15 @@ emit_vert_end(struct v3d_compile *c)
                 vir_VPMWT(c);
 }

+static void
+emit_geom_end(struct v3d_compile *c)
+{
+        /* GFXH-1684: VPM writes need to be complete by the end of the shader.
+         */
+        if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
+                vir_VPMWT(c);
+}
+
 void
 v3d_optimize_nir(struct nir_shader *s)
 {
@@ -1474,7 +1492,7 @@ ntq_emit_vpm_read(struct v3d_compile *c,
 }

 static void
-ntq_setup_vpm_inputs(struct v3d_compile *c)
+ntq_setup_vs_inputs(struct v3d_compile *c)
 {
         /* Figure out how many components of each vertex attribute the shader
          * uses.  Each variable should have been split to individual
@@ -1565,24 +1583,69 @@ program_reads_point_coord(struct v3d_compile *c)
 }

 static void
-ntq_setup_fs_inputs(struct v3d_compile *c)
+get_sorted_input_variables(struct v3d_compile *c,
+                           unsigned *num_entries,
+                           nir_variable ***vars)
 {
-        unsigned num_entries = 0;
+        *num_entries = 0;
         nir_foreach_variable(var, &c->s->inputs)
-                num_entries++;
+                (*num_entries)++;

-        nir_variable *vars[num_entries];
+        *vars = ralloc_array(c, nir_variable *, *num_entries);

         unsigned i = 0;
         nir_foreach_variable(var, &c->s->inputs)
-                vars[i++] = var;
+                (*vars)[i++] = var;

         /* Sort the variables so that we emit the input setup in
          * driver_location order.  This is required for VPM reads, whose data
          * is fetched into the VPM in driver_location (TGSI register index)
          * order.
          */
-        qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
+        qsort(*vars, *num_entries, sizeof(**vars), driver_location_compare);
+}
+
+static void
+ntq_setup_gs_inputs(struct v3d_compile *c)
+{
+        nir_variable **vars;
+        unsigned num_entries;
+        get_sorted_input_variables(c, &num_entries, &vars);
+
+        for (unsigned i = 0; i < num_entries; i++) {
+                nir_variable *var = vars[i];
+
+                /* All GS inputs are arrays with as many entries as vertices
+                 * in the input primitive, but here we only care about the
+                 * per-vertex input type.
+                 */
+                const struct glsl_type *type = glsl_without_array(var->type);
+                unsigned array_len = MAX2(glsl_get_length(type), 1);
+                unsigned loc = var->data.driver_location;
+
+                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
+                                  (loc + array_len) * 4);
+
+                for (unsigned j = 0; j < array_len; j++) {
+                        unsigned num_elements = glsl_get_vector_elements(type);
+                        for (unsigned k = 0; k < num_elements; k++) {
+                                unsigned chan = var->data.location_frac + k;
+                                unsigned input_idx = c->num_inputs++;
+                                struct v3d_varying_slot slot =
+                                        v3d_slot_from_slot_and_component(var->data.location + j, chan);
+                                c->input_slots[input_idx] = slot;
+                        }
+                }
+        }
+}
+
+
+static void
+ntq_setup_fs_inputs(struct v3d_compile *c)
+{
+        nir_variable **vars;
+        unsigned num_entries;
+        get_sorted_input_variables(c, &num_entries, &vars);

         for (unsigned i = 0; i < num_entries; i++) {
                 nir_variable *var = vars[i];
@@ -1949,6 +2012,40 @@ ntq_emit_color_write(struct v3d_compile *c,
 }

 static void
+emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+        assert(instr->num_components == 1);
+
+        uint32_t base_offset = nir_intrinsic_base(instr);
+        struct qreg src_offset = ntq_get_src(c, instr->src[1], 0);
+        struct qreg offset =
+                vir_ADD(c, vir_uniform_ui(c, base_offset), src_offset);
+
+        vir_VPM_WRITE_indirect(c, ntq_get_src(c, instr->src[0], 0), offset);
+}
+
+static void
+ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+        /* XXX perf: Use stvpmv with uniform non-constant offsets and
+         * stvpmd with non-uniform offsets and enable
+         * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
+         */
+        if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+                ntq_emit_color_write(c, instr);
+        } else if (c->s->info.stage == MESA_SHADER_GEOMETRY)  {
+                emit_store_output_gs(c, instr);
+        } else {
+                assert(c->s->info.stage == MESA_SHADER_VERTEX);
+                assert(instr->num_components == 1);
+
+                vir_VPM_WRITE(c,
+                              ntq_get_src(c, instr->src[0], 0),
+                              nir_intrinsic_base(instr));
+        }
+}
+
+static void
 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 {
         switch (instr->intrinsic) {
@@ -2090,19 +2187,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 break;

         case nir_intrinsic_store_output:
-                /* XXX perf: Use stvpmv with uniform non-constant offsets and
-                 * stvpmd with non-uniform offsets and enable
-                 * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
-                 */
-                if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
-                        ntq_emit_color_write(c, instr);
-                } else {
-                        assert(instr->num_components == 1);
-
-                        vir_VPM_WRITE(c,
-                                      ntq_get_src(c, instr->src[0], 0),
-                                      nir_intrinsic_base(instr));
-                }
+                ntq_emit_store_output(c, instr);
                 break;

         case nir_intrinsic_image_deref_size:
@@ -2214,6 +2299,34 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
                 break;

+        case nir_intrinsic_load_per_vertex_input: {
+                /* col: vertex index, row = varying index */
+                struct qreg col = ntq_get_src(c, instr->src[0], 0);
+                uint32_t row_idx = nir_intrinsic_base(instr) * 4 +
+                                   nir_intrinsic_component(instr);
+                for (int i = 0; i < instr->num_components; i++) {
+                        struct qreg row = vir_uniform_ui(c, row_idx++);
+                        ntq_store_dest(c, &instr->dest, i,
+                                       vir_LDVPMG_IN(c, row, col));
+                }
+                break;
+        }
+
+        case nir_intrinsic_emit_vertex:
+        case nir_intrinsic_end_primitive:
+                unreachable("Should have been lowered in v3d_nir_lower_io");
+                break;
+
+        case nir_intrinsic_load_primitive_id: {
+                /* gl_PrimitiveIdIn is written by the GBG in the first word of
+                 * VPM output header. According to docs, we should read this
+                 * using ldvpm(v,d)_in (See Table 71).
+                 */
+                ntq_store_dest(c, &instr->dest, 0,
+                               vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
+                break;
+        }
+
         default:
                 fprintf(stderr, "Unknown intrinsic: ");
                 nir_print_instr(&instr->instr, stderr);
@@ -2636,10 +2749,21 @@ nir_to_vir(struct v3d_compile *c)
                 c->spill_size += V3D_CHANNELS * c->s->scratch_size;
         }

-        if (c->s->info.stage == MESA_SHADER_FRAGMENT)
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
+                ntq_setup_vs_inputs(c);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                ntq_setup_gs_inputs(c);
+                break;
+        case MESA_SHADER_FRAGMENT:
                 ntq_setup_fs_inputs(c);
-        else
-                ntq_setup_vpm_inputs(c);
+                break;
+        case MESA_SHADER_COMPUTE:
+                break;
+        default:
+                unreachable("unsupported shader stage");
+        }

         ntq_setup_outputs(c);
@@ -2785,6 +2909,9 @@ v3d_nir_to_vir(struct v3d_compile *c)
         case MESA_SHADER_FRAGMENT:
                 emit_frag_end(c);
                 break;
+        case MESA_SHADER_GEOMETRY:
+                emit_geom_end(c);
+                break;
         case MESA_SHADER_VERTEX:
                 emit_vert_end(c);
                 break;
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 29057bdf4df..9b08e4a270e 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -329,6 +329,7 @@ struct v3d_key {
                 bool clamp_r:1;
         } tex[V3D_MAX_TEXTURE_SAMPLERS];
         uint8_t ucp_enables;
+        bool is_last_geometry_stage;
 };

 struct v3d_fs_key {
@@ -371,6 +372,16 @@ struct v3d_fs_key {
         struct pipe_rt_blend_state blend;
 };

+struct v3d_gs_key {
+        struct v3d_key base;
+
+        struct v3d_varying_slot used_outputs[V3D_MAX_FS_INPUTS];
+        uint8_t num_used_outputs;
+
+        bool is_coord;
+        bool per_vertex_point_size;
+};
+
 struct v3d_vs_key {
         struct v3d_key base;
@@ -552,6 +563,7 @@ struct v3d_compile {
         int local_invocation_index_bits;

         uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];
+        uint8_t gs_input_sizes[V3D_MAX_GS_INPUTS];
         uint32_t vpm_output_size;

         /* Size in bytes of registers that have been spilled. This is how much
@@ -586,6 +598,7 @@ struct v3d_compile {
         struct pipe_shader_state *shader_state;
         struct v3d_key *key;
         struct v3d_fs_key *fs_key;
+        struct v3d_gs_key *gs_key;
         struct v3d_vs_key *vs_key;

         /* Live ranges of temps. */
@@ -687,6 +700,26 @@ struct v3d_vs_prog_data {
         uint8_t vcm_cache_size;
 };

+struct v3d_gs_prog_data {
+        struct v3d_prog_data base;
+
+        /* Whether the program reads gl_PrimitiveIDIn */
+        bool uses_pid;
+
+        /* Number of components read from each input varying. */
+        uint8_t input_sizes[V3D_MAX_GS_INPUTS / 4];
+
+        /* Number of inputs */
+        uint8_t num_inputs;
+        struct v3d_varying_slot input_slots[V3D_MAX_GS_INPUTS];
+
+        /* Total number of components written, for the shader state record. */
+        uint32_t vpm_output_size;
+
+        /* Output primitive type */
+        uint8_t out_prim_type;
+};
+
 struct v3d_fs_prog_data {
         struct v3d_prog_data base;
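The load_per_vertex_input handling added to nir_to_vir.c above addresses the VPM with a (row, column) pair: the column is the input vertex index and the row is derived from the NIR base and component, with four rows reserved per vec4 slot. A minimal standalone sketch of that row computation follows; the helper name and example values are illustrative only and not part of the patch:

    #include <stdio.h>

    /* Mirrors the row calculation used for nir_intrinsic_load_per_vertex_input
     * in ntq_emit_intrinsic(): each scalarized input component occupies one
     * VPM row, four rows per driver_location. */
    static unsigned
    gs_input_vpm_row(unsigned base, unsigned component)
    {
            return base * 4 + component;
    }

    int main(void)
    {
            /* e.g. component 2 of the input at driver_location 3 */
            printf("row = %u\n", gs_input_vpm_row(3, 2)); /* prints 14 */
            return 0;
    }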
diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c
index 3145c560a14..3c9279a2fee 100644
--- a/src/broadcom/compiler/v3d_nir_lower_io.c
+++ b/src/broadcom/compiler/v3d_nir_lower_io.c
@@ -45,22 +45,46 @@ struct v3d_nir_lower_io_state {
         int psiz_vpm_offset;
         int varyings_vpm_offset;

+        /* Geometry shader state */
+        struct {
+                /* VPM offset for the current vertex data output */
+                nir_variable *output_offset_var;
+                /* VPM offset for the current vertex header */
+                nir_variable *header_offset_var;
+                /* VPM header for the current vertex */
+                nir_variable *header_var;
+
+                /* Size of the complete VPM output header */
+                uint32_t output_header_size;
+                /* Size of the output data for a single vertex */
+                uint32_t output_vertex_data_size;
+        } gs;
+
         BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_ANY_STAGE_INPUTS)];

         nir_ssa_def *pos[4];
 };

 static void
-v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *chan)
+v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
+                            struct v3d_nir_lower_io_state *state);
+
+static void
+v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset,
+                     nir_ssa_def *chan)
 {
         nir_intrinsic_instr *intr =
-                nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
+                nir_intrinsic_instr_create(b->shader,
+                                           nir_intrinsic_store_output);
         nir_ssa_dest_init(&intr->instr, &intr->dest, 1,
                           intr->dest.ssa.bit_size, NULL);
         intr->num_components = 1;

         intr->src[0] = nir_src_for_ssa(chan);
-        intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
+        if (offset)
+                intr->src[1] = nir_src_for_ssa(offset);
+        else
+                intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));

         nir_intrinsic_set_base(intr, base);
         nir_intrinsic_set_write_mask(intr, 0x1);
@@ -91,8 +115,23 @@ v3d_varying_slot_vpm_offset(struct v3d_compile *c, nir_variable *var, int chan)
 {
         int component = var->data.location_frac + chan;

-        for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
-                struct v3d_varying_slot slot = c->vs_key->used_outputs[i];
+        uint32_t num_used_outputs = 0;
+        struct v3d_varying_slot *used_outputs = NULL;
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
+                num_used_outputs = c->vs_key->num_used_outputs;
+                used_outputs = c->vs_key->used_outputs;
+                break;
+        case MESA_SHADER_GEOMETRY:
+                num_used_outputs = c->gs_key->num_used_outputs;
+                used_outputs = c->gs_key->used_outputs;
+                break;
+        default:
+                unreachable("Unsupported shader stage");
+        }
+
+        for (int i = 0; i < num_used_outputs; i++) {
+                struct v3d_varying_slot slot = used_outputs[i];

                 if (v3d_slot_get_slot(slot) == var->data.location &&
                     v3d_slot_get_component(slot) == component) {
@@ -105,6 +144,9 @@ v3d_varying_slot_vpm_offset(struct v3d_compile *c, nir_variable *var, int chan)

 /* Lowers a store_output(gallium driver location) to a series of store_outputs
  * with a driver_location equal to the offset in the VPM.
+ *
+ * For geometry shaders we need to emit multiple vertices so the VPM offsets
+ * need to be computed in the shader code based on the current vertex index.
  */
 static void
 v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
@@ -113,6 +155,13 @@
 {
         b->cursor = nir_before_instr(&intr->instr);

+        /* If this is a geometry shader we need to emit our outputs
+         * to the current vertex offset in the VPM.
+         */
+        nir_ssa_def *offset_reg =
+                c->s->info.stage == MESA_SHADER_GEOMETRY ?
+                nir_load_var(b, state->gs.output_offset_var) : NULL;
+
         int start_comp = nir_intrinsic_component(intr);
         nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0],
                                            intr->num_components);
@@ -141,7 +190,7 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
         /* Just psiz to the position in the FF header right now. */
         if (var->data.location == VARYING_SLOT_PSIZ &&
             state->psiz_vpm_offset != -1) {
-                v3d_nir_store_output(b, state->psiz_vpm_offset, src);
+                v3d_nir_store_output(b, state->psiz_vpm_offset, offset_reg, src);
         }

         /* Scalarize outputs if it hasn't happened already, since we want to
@@ -161,12 +210,73 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
                 BITSET_SET(state->varyings_stored, vpm_offset);

                 v3d_nir_store_output(b, state->varyings_vpm_offset + vpm_offset,
-                                     nir_channel(b, src, i));
+                                     offset_reg, nir_channel(b, src, i));
         }

         nir_instr_remove(&intr->instr);
 }

+static inline void
+reset_gs_header(nir_builder *b, struct v3d_nir_lower_io_state *state)
+{
+        const uint8_t NEW_PRIMITIVE_OFFSET = 0;
+        const uint8_t VERTEX_DATA_LENGTH_OFFSET = 8;
+
+        uint32_t vertex_data_size = state->gs.output_vertex_data_size;
+        assert((vertex_data_size & 0xffffff00) == 0);
+
+        uint32_t header;
+        header  = 1 << NEW_PRIMITIVE_OFFSET;
+        header |= vertex_data_size << VERTEX_DATA_LENGTH_OFFSET;
+        nir_store_var(b, state->gs.header_var, nir_imm_int(b, header), 0x1);
+}
+
+static void
+v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b,
+                          nir_intrinsic_instr *instr,
+                          struct v3d_nir_lower_io_state *state)
+{
+        b->cursor = nir_before_instr(&instr->instr);
+
+        nir_ssa_def *header = nir_load_var(b, state->gs.header_var);
+        nir_ssa_def *header_offset = nir_load_var(b, state->gs.header_offset_var);
+        nir_ssa_def *output_offset = nir_load_var(b, state->gs.output_offset_var);
+
+        /* Emit fixed function outputs */
+        v3d_nir_emit_ff_vpm_outputs(c, b, state);
+
+        /* Emit vertex header */
+        v3d_nir_store_output(b, 0, header_offset, header);
+
+        /* Update VPM offset for next vertex output data and header */
+        output_offset =
+                nir_iadd(b, output_offset,
+                         nir_imm_int(b, state->gs.output_vertex_data_size));
+
+        header_offset = nir_iadd(b, header_offset, nir_imm_int(b, 1));
+
+        /* Reset the New Primitive bit */
+        header = nir_iand(b, header, nir_imm_int(b, 0xfffffffe));
+
+        nir_store_var(b, state->gs.output_offset_var, output_offset, 0x1);
+        nir_store_var(b, state->gs.header_offset_var, header_offset, 0x1);
+        nir_store_var(b, state->gs.header_var, header, 0x1);
+
+        nir_instr_remove(&instr->instr);
+}
+
+static void
+v3d_nir_lower_end_primitive(struct v3d_compile *c, nir_builder *b,
+                            nir_intrinsic_instr *instr,
+                            struct v3d_nir_lower_io_state *state)
+{
+        assert(state->gs.header_var);
+        b->cursor = nir_before_instr(&instr->instr);
+        reset_gs_header(b, state);
+
+        nir_instr_remove(&instr->instr);
+}
+
 static void
 v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
                        struct nir_instr *instr,
@@ -182,8 +292,18 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
                 break;

         case nir_intrinsic_store_output:
-                if (c->s->info.stage == MESA_SHADER_VERTEX)
+                if (c->s->info.stage == MESA_SHADER_VERTEX ||
+                    c->s->info.stage == MESA_SHADER_GEOMETRY) {
                         v3d_nir_lower_vpm_output(c, b, intr, state);
+                }
+                break;
+
+        case nir_intrinsic_emit_vertex:
+                v3d_nir_lower_emit_vertex(c, b, intr, state);
+                break;
+
+        case nir_intrinsic_end_primitive:
+                v3d_nir_lower_end_primitive(c, b, intr, state);
                 break;

         default:
@@ -226,12 +346,64 @@ v3d_nir_lower_io_update_output_var_base(struct v3d_compile *c,
 }

 static void
-v3d_nir_setup_vpm_layout(struct v3d_compile *c,
-                         struct v3d_nir_lower_io_state *state)
+v3d_nir_setup_vpm_layout_vs(struct v3d_compile *c,
+                            struct v3d_nir_lower_io_state *state)
 {
         uint32_t vpm_offset = 0;

-        if (c->vs_key->is_coord) {
+        state->pos_vpm_offset = -1;
+        state->vp_vpm_offset = -1;
+        state->zs_vpm_offset = -1;
+        state->rcp_wc_vpm_offset = -1;
+        state->psiz_vpm_offset = -1;
+
+        bool needs_ff_outputs = c->vs_key->base.is_last_geometry_stage;
+        if (needs_ff_outputs) {
+                if (c->vs_key->is_coord) {
+                        state->pos_vpm_offset = vpm_offset;
+                        vpm_offset += 4;
+                }
+
+                state->vp_vpm_offset = vpm_offset;
+                vpm_offset += 2;
+
+                if (!c->vs_key->is_coord) {
+                        state->zs_vpm_offset = vpm_offset++;
+                        state->rcp_wc_vpm_offset = vpm_offset++;
+                }
+
+                if (c->vs_key->per_vertex_point_size)
+                        state->psiz_vpm_offset = vpm_offset++;
+        }
+
+        state->varyings_vpm_offset = vpm_offset;
+
+        c->vpm_output_size = vpm_offset + c->vs_key->num_used_outputs;
+}
+
+static void
+v3d_nir_setup_vpm_layout_gs(struct v3d_compile *c,
+                            struct v3d_nir_lower_io_state *state)
+{
+        /* 1 header slot for number of output vertices */
+        uint32_t vpm_offset = 1;
+
+        /* 1 header slot per output vertex */
+        const uint32_t num_vertices = c->s->info.gs.vertices_out;
+        vpm_offset += num_vertices;
+
+        state->gs.output_header_size = vpm_offset;
+
+        /* Vertex data: here we only compute offsets into a generic vertex data
+         * elements. When it is time to actually write a particular vertex to
+         * the VPM, we will add the offset for that vertex into the VPM output
+         * to these offsets.
+         *
+         * If geometry shaders are present, they are always the last shader
+         * stage before rasterization, so we always emit fixed function outputs.
+         */
+        vpm_offset = 0;
+        if (c->gs_key->is_coord) {
                 state->pos_vpm_offset = vpm_offset;
                 vpm_offset += 4;
         } else {
@@ -241,7 +413,7 @@ v3d_nir_setup_vpm_layout(struct v3d_compile *c,
         state->vp_vpm_offset = vpm_offset;
         vpm_offset += 2;

-        if (!c->vs_key->is_coord) {
+        if (!c->gs_key->is_coord) {
                 state->zs_vpm_offset = vpm_offset++;
                 state->rcp_wc_vpm_offset = vpm_offset++;
         } else {
@@ -249,20 +421,34 @@ v3d_nir_setup_vpm_layout(struct v3d_compile *c,
                 state->rcp_wc_vpm_offset = -1;
         }

-        if (c->vs_key->per_vertex_point_size)
+        /* Mesa enables OES_geometry_shader_point_size automatically with
+         * OES_geometry_shader so we always need to handle point size
+         * writes if present.
+         */
+        if (c->gs_key->per_vertex_point_size)
                 state->psiz_vpm_offset = vpm_offset++;
-        else
-                state->psiz_vpm_offset = -1;

         state->varyings_vpm_offset = vpm_offset;

-        c->vpm_output_size = vpm_offset + c->vs_key->num_used_outputs;
+        state->gs.output_vertex_data_size =
+                state->varyings_vpm_offset + c->gs_key->num_used_outputs;
+
+        c->vpm_output_size =
+                state->gs.output_header_size +
+                state->gs.output_vertex_data_size * num_vertices;
 }

 static void
 v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
                             struct v3d_nir_lower_io_state *state)
 {
+        /* If this is a geometry shader we need to emit our fixed function
+         * outputs to the current vertex offset in the VPM.
+         */
+        nir_ssa_def *offset_reg =
+                c->s->info.stage == MESA_SHADER_GEOMETRY ?
+                nir_load_var(b, state->gs.output_offset_var) : NULL;
+
         for (int i = 0; i < 4; i++) {
                 if (!state->pos[i])
                         state->pos[i] = nir_ssa_undef(b, 1, 32);
@@ -273,23 +459,25 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
         if (state->pos_vpm_offset != -1) {
                 for (int i = 0; i < 4; i++) {
                         v3d_nir_store_output(b, state->pos_vpm_offset + i,
-                                             state->pos[i]);
+                                             offset_reg, state->pos[i]);
                 }
         }

-        for (int i = 0; i < 2; i++) {
-                nir_ssa_def *pos;
-                nir_ssa_def *scale;
-                pos = state->pos[i];
-                if (i == 0)
-                        scale = nir_load_viewport_x_scale(b);
-                else
-                        scale = nir_load_viewport_y_scale(b);
-                pos = nir_fmul(b, pos, scale);
-                pos = nir_fmul(b, pos, rcp_wc);
-                pos = nir_f2i32(b, nir_fround_even(b, pos));
-                v3d_nir_store_output(b, state->vp_vpm_offset + i,
-                                     pos);
+        if (state->vp_vpm_offset != -1) {
+                for (int i = 0; i < 2; i++) {
+                        nir_ssa_def *pos;
+                        nir_ssa_def *scale;
+                        pos = state->pos[i];
+                        if (i == 0)
+                                scale = nir_load_viewport_x_scale(b);
+                        else
+                                scale = nir_load_viewport_y_scale(b);
+                        pos = nir_fmul(b, pos, scale);
+                        pos = nir_fmul(b, pos, rcp_wc);
+                        pos = nir_f2i32(b, nir_fround_even(b, pos));
+                        v3d_nir_store_output(b, state->vp_vpm_offset + i,
+                                             offset_reg, pos);
+                }
         }

         if (state->zs_vpm_offset != -1) {
@@ -297,38 +485,118 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
                 z = nir_fmul(b, z, nir_load_viewport_z_scale(b));
                 z = nir_fmul(b, z, rcp_wc);
                 z = nir_fadd(b, z, nir_load_viewport_z_offset(b));
-                v3d_nir_store_output(b, state->zs_vpm_offset, z);
+                v3d_nir_store_output(b, state->zs_vpm_offset, offset_reg, z);
         }

-        if (state->rcp_wc_vpm_offset != -1)
-                v3d_nir_store_output(b, state->rcp_wc_vpm_offset, rcp_wc);
+        if (state->rcp_wc_vpm_offset != -1) {
+                v3d_nir_store_output(b, state->rcp_wc_vpm_offset,
+                                     offset_reg, rcp_wc);
+        }

-        /* Store 0 to varyings requested by the FS but not stored in the VS.
-         * This should be undefined behavior, but glsl-routing seems to rely
-         * on it.
+        /* Store 0 to varyings requested by the FS but not stored by the
+         * previous stage. This should be undefined behavior, but
+         * glsl-routing seems to rely on it.
          */
-        for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
+        uint32_t num_used_outputs;
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
+                num_used_outputs = c->vs_key->num_used_outputs;
+                break;
+        case MESA_SHADER_GEOMETRY:
+                num_used_outputs = c->gs_key->num_used_outputs;
+                break;
+        default:
+                unreachable("Unsupported shader stage");
+        }
+
+        for (int i = 0; i < num_used_outputs; i++) {
                 if (!BITSET_TEST(state->varyings_stored, i)) {
                         v3d_nir_store_output(b, state->varyings_vpm_offset + i,
-                                             nir_imm_int(b, 0));
+                                             offset_reg, nir_imm_int(b, 0));
                 }
         }
 }

+static void
+emit_gs_prolog(struct v3d_compile *c, nir_builder *b,
+               nir_function_impl *impl,
+               struct v3d_nir_lower_io_state *state)
+{
+        nir_block *first = nir_start_block(impl);
+        b->cursor = nir_before_block(first);
+
+        const struct glsl_type *uint_type = glsl_uint_type();
+
+        assert(!state->gs.output_offset_var);
+        state->gs.output_offset_var =
+                nir_local_variable_create(impl, uint_type, "output_offset");
+        nir_store_var(b, state->gs.output_offset_var,
+                      nir_imm_int(b, state->gs.output_header_size), 0x1);
+
+        assert(!state->gs.header_offset_var);
+        state->gs.header_offset_var =
+                nir_local_variable_create(impl, uint_type, "header_offset");
+        nir_store_var(b, state->gs.header_offset_var, nir_imm_int(b, 1), 0x1);
+
+        assert(!state->gs.header_var);
+        state->gs.header_var =
+                nir_local_variable_create(impl, uint_type, "header");
+        reset_gs_header(b, state);
+}
+
+static void
+emit_gs_vpm_output_header_prolog(struct v3d_compile *c, nir_builder *b,
+                                 struct v3d_nir_lower_io_state *state)
+{
+        const uint8_t VERTEX_COUNT_OFFSET = 16;
+
+        /* Our GS header has 1 generic header slot (at VPM offset 0) and then
+         * one slot per output vertex after it. This means we don't need to
+         * have a variable just to keep track of the number of vertices we
+         * emitted and instead we can just compute it here from the header
+         * offset variable by removing the one generic header slot that always
+         * goes at the begining of out header.
+         */
+        nir_ssa_def *header_offset =
+                nir_load_var(b, state->gs.header_offset_var);
+        nir_ssa_def *vertex_count =
+                nir_isub(b, header_offset, nir_imm_int(b, 1));
+        nir_ssa_def *header =
+                nir_ior(b, nir_imm_int(b, state->gs.output_header_size),
+                        nir_ishl(b, vertex_count,
+                                 nir_imm_int(b, VERTEX_COUNT_OFFSET)));
+
+        v3d_nir_store_output(b, 0, NULL, header);
+}
+
 void
 v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
 {
         struct v3d_nir_lower_io_state state = { 0 };

         /* Set up the layout of the VPM outputs. */
-        if (s->info.stage == MESA_SHADER_VERTEX)
-                v3d_nir_setup_vpm_layout(c, &state);
+        switch (s->info.stage) {
+        case MESA_SHADER_VERTEX:
+                v3d_nir_setup_vpm_layout_vs(c, &state);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                v3d_nir_setup_vpm_layout_gs(c, &state);
+                break;
+        case MESA_SHADER_FRAGMENT:
+        case MESA_SHADER_COMPUTE:
+                break;
+        default:
+                unreachable("Unsupported shader stage");
+        }

         nir_foreach_function(function, s) {
                 if (function->impl) {
                         nir_builder b;
                         nir_builder_init(&b, function->impl);
+
+                        if (c->s->info.stage == MESA_SHADER_GEOMETRY)
+                                emit_gs_prolog(c, &b, function->impl, &state);
+
                         nir_foreach_block(block, function->impl) {
                                 nir_foreach_instr_safe(instr, block)
                                         v3d_nir_lower_io_instr(c, &b, instr,
@@ -337,8 +605,11 @@ v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
                         nir_block *last = nir_impl_last_block(function->impl);
                         b.cursor = nir_after_block(last);
-                        if (s->info.stage == MESA_SHADER_VERTEX)
+                        if (s->info.stage == MESA_SHADER_VERTEX) {
                                 v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
+                        } else if (s->info.stage == MESA_SHADER_GEOMETRY) {
+                                emit_gs_vpm_output_header_prolog(c, &b, &state);
+                        }

                         nir_metadata_preserve(function->impl,
                                               nir_metadata_block_index |
@@ -346,6 +617,8 @@ v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
                 }
         }

-        if (s->info.stage == MESA_SHADER_VERTEX)
+        if (s->info.stage == MESA_SHADER_VERTEX ||
+            s->info.stage == MESA_SHADER_GEOMETRY) {
                 v3d_nir_lower_io_update_output_var_base(c, &state);
+        }
 }
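The lowering pass above encodes two kinds of header words: the per-vertex header written at each EmitVertex() (new-primitive bit at bit 0, vertex data length at bit 8, see reset_gs_header()) and the global output header written in the prolog (header size in the low bits, emitted vertex count at bit 16, see emit_gs_vpm_output_header_prolog()). Below is a minimal sketch of that packing and of the sizing done in v3d_nir_setup_vpm_layout_gs(); the concrete vertex and varying counts are invented for illustration and are not taken from the patch:

    #include <stdint.h>
    #include <stdio.h>

    /* Bit positions taken from reset_gs_header() and
     * emit_gs_vpm_output_header_prolog() above. */
    #define NEW_PRIMITIVE_OFFSET      0
    #define VERTEX_DATA_LENGTH_OFFSET 8
    #define VERTEX_COUNT_OFFSET       16

    int main(void)
    {
            /* Hypothetical render-mode GS: 4 output vertices, 6 used output
             * components, no per-vertex point size. The fixed-function block
             * is then vp (2) + z (1) + 1/wc (1) = 4 slots per vertex. */
            const uint32_t vertices_out = 4;
            const uint32_t num_used_outputs = 6;
            const uint32_t ff_slots = 4;

            const uint32_t output_header_size = 1 + vertices_out;
            const uint32_t vertex_data_size = ff_slots + num_used_outputs;
            const uint32_t vpm_output_size =
                    output_header_size + vertex_data_size * vertices_out;

            /* Per-vertex header, as reset at shader start and at EndPrimitive() */
            uint32_t vertex_header = (1u << NEW_PRIMITIVE_OFFSET) |
                                     (vertex_data_size << VERTEX_DATA_LENGTH_OFFSET);

            /* Global header written by the output prolog, assuming all four
             * vertices were emitted. */
            uint32_t global_header = output_header_size |
                                     (vertices_out << VERTEX_COUNT_OFFSET);

            printf("vpm_output_size = %u words\n", vpm_output_size); /* 45 */
            printf("vertex header   = 0x%08x\n", vertex_header);     /* 0x00000a01 */
            printf("global header   = 0x%08x\n", global_header);     /* 0x00040005 */
            return 0;
    }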
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 340cda903e9..dc966bc80ca 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -23,6 +23,7 @@

 #include "broadcom/common/v3d_device_info.h"
 #include "v3d_compiler.h"
+#include "util/u_prim.h"

 int
 vir_get_nsrc(struct qinst *inst)
@@ -661,6 +662,28 @@ v3d_vs_set_prog_data(struct v3d_compile *c,
 }

 static void
+v3d_gs_set_prog_data(struct v3d_compile *c,
+                     struct v3d_gs_prog_data *prog_data)
+{
+        prog_data->num_inputs = c->num_inputs;
+        memcpy(prog_data->input_slots, c->input_slots,
+               c->num_inputs * sizeof(*c->input_slots));
+
+        /* gl_PrimitiveIdIn is written by the GBG into the first word of the
+         * VPM output header automatically and the shader will overwrite
+         * it after reading it if necessary, so it doesn't add to the VPM
+         * size requirements.
+         */
+        prog_data->uses_pid = (c->s->info.system_values_read &
+                               (1ull << SYSTEM_VALUE_PRIMITIVE_ID));
+
+        /* Output segment size is in sectors (8 rows of 32 bits per channel) */
+        prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
+
+        prog_data->out_prim_type = c->s->info.gs.output_primitive;
+}
+
+static void
 v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
                             struct v3d_fs_prog_data *prog_data)
 {
@@ -714,13 +737,21 @@ v3d_set_prog_data(struct v3d_compile *c,

         v3d_set_prog_data_uniforms(c, prog_data);

-        if (c->s->info.stage == MESA_SHADER_COMPUTE) {
-                v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
-        } else if (c->s->info.stage == MESA_SHADER_VERTEX) {
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
                 v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
-        } else {
-                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
+                break;
+        case MESA_SHADER_FRAGMENT:
                 v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
+                break;
+        case MESA_SHADER_COMPUTE:
+                v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
+                break;
+        default:
+                unreachable("unsupported shader stage");
         }
 }
@@ -772,6 +803,37 @@ v3d_nir_lower_vs_early(struct v3d_compile *c)
 }

 static void
+v3d_nir_lower_gs_early(struct v3d_compile *c)
+{
+        /* Split our I/O vars and dead code eliminate the unused
+         * components.
+         */
+        NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
+                   nir_var_shader_in | nir_var_shader_out);
+        uint64_t used_outputs[4] = {0};
+        for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
+                int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
+                int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
+                used_outputs[comp] |= 1ull << slot;
+        }
+        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
+                   &c->s->outputs, used_outputs, NULL); /* demotes to globals */
+        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
+        v3d_optimize_nir(c->s);
+        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in);
+
+        /* This must go before nir_lower_io */
+        if (c->gs_key->per_vertex_point_size)
+                NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
+
+        NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+                   type_size_vec4,
+                   (nir_lower_io_options)0);
+        /* clean up nir_lower_io's deref_var remains */
+        NIR_PASS_V(c->s, nir_opt_dce);
+}
+
+static void
 v3d_fixup_fs_output_types(struct v3d_compile *c)
 {
         nir_foreach_variable(var, &c->s->outputs) {
@@ -819,6 +881,18 @@ v3d_nir_lower_fs_early(struct v3d_compile *c)
 }

 static void
+v3d_nir_lower_gs_late(struct v3d_compile *c)
+{
+        if (c->key->ucp_enables) {
+                NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables,
+                           false, NULL);
+        }
+
+        /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
+        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
+}
+
+static void
 v3d_nir_lower_vs_late(struct v3d_compile *c)
 {
         if (c->vs_key->clamp_color)
@@ -907,6 +981,10 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                 c->vs_key = (struct v3d_vs_key *)key;
                 prog_data = rzalloc_size(NULL, sizeof(struct v3d_vs_prog_data));
                 break;
+        case MESA_SHADER_GEOMETRY:
+                c->gs_key = (struct v3d_gs_key *)key;
+                prog_data = rzalloc_size(NULL, sizeof(struct v3d_gs_prog_data));
+                break;
         case MESA_SHADER_FRAGMENT:
                 c->fs_key = (struct v3d_fs_key *)key;
                 prog_data = rzalloc_size(NULL, sizeof(struct v3d_fs_prog_data));
@@ -919,20 +997,35 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                 unreachable("unsupported shader stage");
         }

-        if (c->s->info.stage == MESA_SHADER_VERTEX) {
+
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
                 v3d_nir_lower_vs_early(c);
-        } else if (c->s->info.stage != MESA_SHADER_COMPUTE) {
-                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                v3d_nir_lower_gs_early(c);
+                break;
+        case MESA_SHADER_FRAGMENT:
                 v3d_nir_lower_fs_early(c);
+                break;
+        default:
+                break;
         }

         v3d_lower_nir(c);

-        if (c->s->info.stage == MESA_SHADER_VERTEX) {
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
                 v3d_nir_lower_vs_late(c);
-        } else if (c->s->info.stage != MESA_SHADER_COMPUTE) {
-                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                v3d_nir_lower_gs_late(c);
+                break;
+        case MESA_SHADER_FRAGMENT:
                 v3d_nir_lower_fs_late(c);
+                break;
+        default:
+                break;
         }

         NIR_PASS_V(c->s, v3d_nir_lower_io, c);
@@ -1134,7 +1227,9 @@ const char *
 vir_get_stage_name(struct v3d_compile *c)
 {
         if (c->vs_key && c->vs_key->is_coord)
-                return "MESA_SHADER_COORD";
+                return "MESA_SHADER_VERTEX_BIN";
+        else if (c->gs_key && c->gs_key->is_coord)
+                return "MESA_SHADER_GEOMETRY_BIN";
         else
                 return gl_shader_stage_name(c->s->info.stage);
 }
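v3d_gs_set_prog_data() above converts the VPM output size from 32-bit words into sectors of 8 rows with align(x, 8) / 8. A small sketch of that conversion; the helper name and the example size are illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    /* Equivalent of the align(words, 8) / 8 step in v3d_gs_set_prog_data():
     * the shader state record takes the output segment size in sectors,
     * each sector covering 8 rows of 32 bits per channel. */
    static uint32_t
    vpm_words_to_sectors(uint32_t words)
    {
            return (words + 7) / 8;
    }

    int main(void)
    {
            /* e.g. the 45-word layout from the earlier sketch rounds up to
             * 6 sectors (48 words). */
            printf("%u sectors\n", vpm_words_to_sectors(45));
            return 0;
    }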
diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h
index ecedbaf9efb..bf85b42eb9d 100644
--- a/src/gallium/drivers/v3d/v3d_context.h
+++ b/src/gallium/drivers/v3d/v3d_context.h
@@ -59,7 +59,8 @@ void v3d_job_add_bo(struct v3d_job *job, struct v3d_bo *bo);
 #define VC5_DIRTY_ZSA           (1ull << 2)
 #define VC5_DIRTY_COMPTEX       (1ull << 3)
 #define VC5_DIRTY_VERTTEX       (1ull << 4)
-#define VC5_DIRTY_FRAGTEX       (1ull << 5)
+#define VC5_DIRTY_GEOMTEX       (1ull << 5)
+#define VC5_DIRTY_FRAGTEX       (1ull << 6)

 #define VC5_DIRTY_SHADER_IMAGE  (1ull << 9)
 #define VC5_DIRTY_BLEND_COLOR   (1ull << 10)
@@ -77,18 +78,22 @@ void v3d_job_add_bo(struct v3d_job *job, struct v3d_bo *bo);
 #define VC5_DIRTY_CLIP          (1ull << 22)
 #define VC5_DIRTY_UNCOMPILED_CS (1ull << 23)
 #define VC5_DIRTY_UNCOMPILED_VS (1ull << 24)
-#define VC5_DIRTY_UNCOMPILED_FS (1ull << 25)
+#define VC5_DIRTY_UNCOMPILED_GS (1ull << 25)
+#define VC5_DIRTY_UNCOMPILED_FS (1ull << 26)

 #define VC5_DIRTY_COMPILED_CS   (1ull << 29)
 #define VC5_DIRTY_COMPILED_VS   (1ull << 30)
-#define VC5_DIRTY_COMPILED_FS   (1ull << 31)
-
-#define VC5_DIRTY_FS_INPUTS     (1ull << 35)
-#define VC5_DIRTY_STREAMOUT     (1ull << 36)
-#define VC5_DIRTY_OQ            (1ull << 37)
-#define VC5_DIRTY_CENTROID_FLAGS (1ull << 38)
-#define VC5_DIRTY_NOPERSPECTIVE_FLAGS (1ull << 39)
-#define VC5_DIRTY_SSBO          (1ull << 40)
+#define VC5_DIRTY_COMPILED_GS_BIN (1ULL << 31)
+#define VC5_DIRTY_COMPILED_GS   (1ULL << 32)
+#define VC5_DIRTY_COMPILED_FS   (1ull << 33)
+
+#define VC5_DIRTY_FS_INPUTS     (1ull << 38)
+#define VC5_DIRTY_GS_INPUTS     (1ull << 39)
+#define VC5_DIRTY_STREAMOUT     (1ull << 40)
+#define VC5_DIRTY_OQ            (1ull << 41)
+#define VC5_DIRTY_CENTROID_FLAGS (1ull << 42)
+#define VC5_DIRTY_NOPERSPECTIVE_FLAGS (1ull << 43)
+#define VC5_DIRTY_SSBO          (1ull << 44)

 #define VC5_MAX_FS_INPUTS 64
@@ -206,6 +211,7 @@ struct v3d_compiled_shader {
         union {
                 struct v3d_prog_data *base;
                 struct v3d_vs_prog_data *vs;
+                struct v3d_gs_prog_data *gs;
                 struct v3d_fs_prog_data *fs;
                 struct v3d_compute_prog_data *compute;
         } prog_data;
@@ -219,8 +225,8 @@ struct v3d_compiled_shader {
 };

 struct v3d_program_stateobj {
-        struct v3d_uncompiled_shader *bind_vs, *bind_fs, *bind_compute;
-        struct v3d_compiled_shader *cs, *vs, *fs, *compute;
+        struct v3d_uncompiled_shader *bind_vs, *bind_gs, *bind_fs, *bind_compute;
+        struct v3d_compiled_shader *cs, *vs, *gs_bin, *gs, *fs, *compute;

         struct hash_table *cache[MESA_SHADER_STAGES];
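The renumbering above pushes several dirty bits past bit 31, which only works because v3d->dirty is a 64-bit mask and the flags are built with 1ull/1ULL shifts. A minimal sketch of why the suffix matters; the local names here are illustrative, not the driver's:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* 1ull << 32 is a well-defined 64-bit bit; a plain 1 << 32 would
             * overflow a 32-bit int (undefined behaviour), which is why the
             * VC5_DIRTY_* flags above bit 31 all use the ull/ULL suffix. */
            const uint64_t DIRTY_COMPILED_GS = 1ull << 32;
            const uint64_t DIRTY_SSBO        = 1ull << 44;

            uint64_t dirty = 0;
            dirty |= DIRTY_COMPILED_GS;
            printf("gs dirty: %d, ssbo dirty: %d\n",
                   (dirty & DIRTY_COMPILED_GS) != 0,
                   (dirty & DIRTY_SSBO) != 0);
            return 0;
    }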
diff --git a/src/gallium/drivers/v3d/v3d_program.c b/src/gallium/drivers/v3d/v3d_program.c
index 0f7762f119d..7bbdbe409e2 100644
--- a/src/gallium/drivers/v3d/v3d_program.c
+++ b/src/gallium/drivers/v3d/v3d_program.c
@@ -205,8 +205,12 @@ v3d_shader_precompile(struct v3d_context *v3d,
                 v3d_setup_shared_precompile_key(so, &key.base);
                 v3d_get_compiled_shader(v3d, &key.base, sizeof(key));
         } else {
+                /* FIXME: add geometry shaders */
+
                 struct v3d_vs_key key = {
                         .base.shader_state = so,
+                        /* Emit fixed function outputs */
+                        .base.is_last_geometry_stage = true,
                 };

                 v3d_setup_shared_precompile_key(so, &key.base);
@@ -271,8 +275,10 @@ v3d_uncompiled_shader_create(struct pipe_context *pctx,
         }

         nir_variable_mode lower_mode = nir_var_all & ~nir_var_uniform;
-        if (s->info.stage == MESA_SHADER_VERTEX)
+        if (s->info.stage == MESA_SHADER_VERTEX ||
+            s->info.stage == MESA_SHADER_GEOMETRY) {
                 lower_mode &= ~(nir_var_shader_in | nir_var_shader_out);
+        }
         NIR_PASS_V(s, nir_lower_io, lower_mode,
                    type_size,
                    (nir_lower_io_options)0);
@@ -609,55 +615,153 @@ v3d_update_compiled_fs(struct v3d_context *v3d, uint8_t prim_mode)
 }

 static void
-v3d_update_compiled_vs(struct v3d_context *v3d, uint8_t prim_mode)
+v3d_update_compiled_gs(struct v3d_context *v3d, uint8_t prim_mode)
 {
-        struct v3d_vs_key local_key;
-        struct v3d_vs_key *key = &local_key;
+        struct v3d_gs_key local_key;
+        struct v3d_gs_key *key = &local_key;

-        if (!(v3d->dirty & (VC5_DIRTY_PRIM_MODE |
+        if (!(v3d->dirty & (VC5_DIRTY_GEOMTEX |
                             VC5_DIRTY_RASTERIZER |
-                            VC5_DIRTY_VERTTEX |
-                            VC5_DIRTY_VTXSTATE |
-                            VC5_DIRTY_UNCOMPILED_VS |
+                            VC5_DIRTY_UNCOMPILED_GS |
+                            VC5_DIRTY_PRIM_MODE |
                             VC5_DIRTY_FS_INPUTS))) {
                 return;
         }

+        if (!v3d->prog.bind_gs) {
+                v3d->prog.gs = NULL;
+                v3d->prog.gs_bin = NULL;
+                return;
+        }
+
         memset(key, 0, sizeof(*key));
-        v3d_setup_shared_key(v3d, &key->base, &v3d->tex[PIPE_SHADER_VERTEX]);
-        key->base.shader_state = v3d->prog.bind_vs;
+        v3d_setup_shared_key(v3d, &key->base, &v3d->tex[PIPE_SHADER_GEOMETRY]);
+        key->base.shader_state = v3d->prog.bind_gs;
         key->base.ucp_enables = v3d->rasterizer->base.clip_plane_enable;
+        key->base.is_last_geometry_stage = true;
         key->num_used_outputs = v3d->prog.fs->prog_data.fs->num_inputs;
         STATIC_ASSERT(sizeof(key->used_outputs) ==
                       sizeof(v3d->prog.fs->prog_data.fs->input_slots));
         memcpy(key->used_outputs, v3d->prog.fs->prog_data.fs->input_slots,
                sizeof(key->used_outputs));

-        key->clamp_color = v3d->rasterizer->base.clamp_vertex_color;

         key->per_vertex_point_size =
                 (prim_mode == PIPE_PRIM_POINTS &&
                  v3d->rasterizer->base.point_size_per_vertex);

-        struct v3d_compiled_shader *vs =
+        struct v3d_compiled_shader *gs =
                 v3d_get_compiled_shader(v3d, &key->base, sizeof(*key));
-        if (vs != v3d->prog.vs) {
-                v3d->prog.vs = vs;
-                v3d->dirty |= VC5_DIRTY_COMPILED_VS;
+        if (gs != v3d->prog.gs) {
+                v3d->prog.gs = gs;
+                v3d->dirty |= VC5_DIRTY_COMPILED_GS;
         }

         key->is_coord = true;
-        /* Coord shaders only output varyings used by transform feedback. */
+
+        /* The last bin-mode shader in the geometry pipeline only outputs
+         * varyings used by transform feedback.
+         */
         struct v3d_uncompiled_shader *shader_state = key->base.shader_state;
         memcpy(key->used_outputs, shader_state->tf_outputs,
                sizeof(*key->used_outputs) * shader_state->num_tf_outputs);
         if (shader_state->num_tf_outputs < key->num_used_outputs) {
+                uint32_t size = sizeof(*key->used_outputs) *
+                                (key->num_used_outputs -
+                                 shader_state->num_tf_outputs);
                 memset(&key->used_outputs[shader_state->num_tf_outputs],
-                       0,
-                       sizeof(*key->used_outputs) * (key->num_used_outputs -
-                                                     shader_state->num_tf_outputs));
+                       0, size);
         }
         key->num_used_outputs = shader_state->num_tf_outputs;

+        struct v3d_compiled_shader *old_gs = v3d->prog.gs;
+        struct v3d_compiled_shader *gs_bin =
+                v3d_get_compiled_shader(v3d, &key->base, sizeof(*key));
+        if (gs_bin != old_gs) {
+                v3d->prog.gs_bin = gs_bin;
+                v3d->dirty |= VC5_DIRTY_COMPILED_GS_BIN;
+        }
+
+        if (old_gs && memcmp(v3d->prog.gs->prog_data.gs->input_slots,
+                             old_gs->prog_data.gs->input_slots,
+                             sizeof(v3d->prog.gs->prog_data.gs->input_slots))) {
+                v3d->dirty |= VC5_DIRTY_GS_INPUTS;
+        }
+}
+
+static void
+v3d_update_compiled_vs(struct v3d_context *v3d, uint8_t prim_mode)
+{
+        struct v3d_vs_key local_key;
+        struct v3d_vs_key *key = &local_key;
+
+        if (!(v3d->dirty & (VC5_DIRTY_VERTTEX |
+                            VC5_DIRTY_VTXSTATE |
+                            VC5_DIRTY_UNCOMPILED_VS |
+                            (v3d->prog.bind_gs ? 0 : VC5_DIRTY_RASTERIZER) |
+                            (v3d->prog.bind_gs ? 0 : VC5_DIRTY_PRIM_MODE) |
+                            (v3d->prog.bind_gs ? VC5_DIRTY_GS_INPUTS :
+                                                 VC5_DIRTY_FS_INPUTS)))) {
+                return;
+        }
+
+        memset(key, 0, sizeof(*key));
+        v3d_setup_shared_key(v3d, &key->base, &v3d->tex[PIPE_SHADER_VERTEX]);
+        key->base.shader_state = v3d->prog.bind_vs;
+        key->base.ucp_enables = v3d->rasterizer->base.clip_plane_enable;
+        key->base.is_last_geometry_stage = !v3d->prog.bind_gs;
+
+        if (!v3d->prog.bind_gs) {
+                key->num_used_outputs = v3d->prog.fs->prog_data.fs->num_inputs;
+                STATIC_ASSERT(sizeof(key->used_outputs) ==
+                              sizeof(v3d->prog.fs->prog_data.fs->input_slots));
+                memcpy(key->used_outputs, v3d->prog.fs->prog_data.fs->input_slots,
+                       sizeof(key->used_outputs));
+        } else {
+                key->num_used_outputs = v3d->prog.gs->prog_data.gs->num_inputs;
+                STATIC_ASSERT(sizeof(key->used_outputs) ==
+                              sizeof(v3d->prog.gs->prog_data.gs->input_slots));
+                memcpy(key->used_outputs, v3d->prog.gs->prog_data.gs->input_slots,
+                       sizeof(key->used_outputs));
+        }
+
+        key->clamp_color = v3d->rasterizer->base.clamp_vertex_color;
+
+        key->per_vertex_point_size =
+                (prim_mode == PIPE_PRIM_POINTS &&
+                 v3d->rasterizer->base.point_size_per_vertex);
+
+        struct v3d_compiled_shader *vs =
+                v3d_get_compiled_shader(v3d, &key->base, sizeof(*key));
+        if (vs != v3d->prog.vs) {
+                v3d->prog.vs = vs;
+                v3d->dirty |= VC5_DIRTY_COMPILED_VS;
+        }
+
+        key->is_coord = true;
+
+        /* Coord shaders only output varyings used by transform feedback,
+         * unless they are linked to other shaders in the geometry side
+         * of the pipeline, since in that case any of the output varyings
+         * could be required in later geometry stages to compute
+         * gl_Position or TF outputs.
+         */
+        if (!v3d->prog.bind_gs) {
+                struct v3d_uncompiled_shader *shader_state =
+                        key->base.shader_state;
+                memcpy(key->used_outputs, shader_state->tf_outputs,
+                       sizeof(*key->used_outputs) *
+                       shader_state->num_tf_outputs);
+                if (shader_state->num_tf_outputs < key->num_used_outputs) {
+                        uint32_t tail_bytes =
+                                sizeof(*key->used_outputs) *
+                                (key->num_used_outputs -
+                                 shader_state->num_tf_outputs);
+                        memset(&key->used_outputs[shader_state->num_tf_outputs],
+                               0, tail_bytes);
+                }
+                key->num_used_outputs = shader_state->num_tf_outputs;
+        }
+
         struct v3d_compiled_shader *cs =
                 v3d_get_compiled_shader(v3d, &key->base, sizeof(*key));
         if (cs != v3d->prog.cs) {
@@ -670,6 +774,7 @@ void
 v3d_update_compiled_shaders(struct v3d_context *v3d, uint8_t prim_mode)
 {
         v3d_update_compiled_fs(v3d, prim_mode);
+        v3d_update_compiled_gs(v3d, prim_mode);
         v3d_update_compiled_vs(v3d, prim_mode);
 }
@@ -703,6 +808,12 @@ fs_cache_hash(const void *key)
 }

 static uint32_t
+gs_cache_hash(const void *key)
+{
+        return _mesa_hash_data(key, sizeof(struct v3d_gs_key));
+}
+
+static uint32_t
 vs_cache_hash(const void *key)
 {
         return _mesa_hash_data(key, sizeof(struct v3d_vs_key));
@@ -721,6 +832,12 @@ fs_cache_compare(const void *key1, const void *key2)
 }

 static bool
+gs_cache_compare(const void *key1, const void *key2)
+{
+        return memcmp(key1, key2, sizeof(struct v3d_gs_key)) == 0;
+}
+
+static bool
 vs_cache_compare(const void *key1, const void *key2)
 {
         return memcmp(key1, key2, sizeof(struct v3d_vs_key)) == 0;
@@ -772,6 +889,14 @@ v3d_fp_state_bind(struct pipe_context *pctx, void *hwcso)
 }

 static void
+v3d_gp_state_bind(struct pipe_context *pctx, void *hwcso)
+{
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->prog.bind_gs = hwcso;
+        v3d->dirty |= VC5_DIRTY_UNCOMPILED_GS;
+}
+
+static void
 v3d_vp_state_bind(struct pipe_context *pctx, void *hwcso)
 {
         struct v3d_context *v3d = v3d_context(pctx);
@@ -804,10 +929,14 @@ v3d_program_init(struct pipe_context *pctx)
         pctx->create_vs_state = v3d_shader_state_create;
         pctx->delete_vs_state = v3d_shader_state_delete;

+        pctx->create_gs_state = v3d_shader_state_create;
+        pctx->delete_gs_state = v3d_shader_state_delete;
+
         pctx->create_fs_state = v3d_shader_state_create;
         pctx->delete_fs_state = v3d_shader_state_delete;

         pctx->bind_fs_state = v3d_fp_state_bind;
+        pctx->bind_gs_state = v3d_gp_state_bind;
         pctx->bind_vs_state = v3d_vp_state_bind;

         if (v3d->screen->has_csd) {
@@ -818,6 +947,8 @@ v3d_program_init(struct pipe_context *pctx)

         v3d->prog.cache[MESA_SHADER_VERTEX] =
                 _mesa_hash_table_create(pctx, vs_cache_hash, vs_cache_compare);
+        v3d->prog.cache[MESA_SHADER_GEOMETRY] =
+                _mesa_hash_table_create(pctx, gs_cache_hash, gs_cache_compare);
         v3d->prog.cache[MESA_SHADER_FRAGMENT] =
                 _mesa_hash_table_create(pctx, fs_cache_hash, fs_cache_compare);
         v3d->prog.cache[MESA_SHADER_COMPUTE] =
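The new GS shader cache reuses the driver's byte-wise key handling: gs_cache_hash() and gs_cache_compare() hash and memcmp the whole struct v3d_gs_key, which is why v3d_update_compiled_gs() clears the key with memset() before filling it in. A small illustration of that requirement with a toy key type (not the real v3d_gs_key layout):

    #include <stdio.h>
    #include <string.h>

    /* Toy key with the same pitfall as a real shader key: padding bytes
     * between members are not written by member assignments, so two keys
     * built without memset() could compare unequal byte-for-byte even when
     * every member matches. */
    struct toy_key {
            unsigned char is_coord;   /* padding typically follows */
            unsigned num_used_outputs;
    };

    int main(void)
    {
            struct toy_key a, b;

            memset(&a, 0, sizeof(a));
            memset(&b, 0, sizeof(b));
            a.is_coord = b.is_coord = 1;
            a.num_used_outputs = b.num_used_outputs = 6;

            printf("keys equal: %d\n", memcmp(&a, &b, sizeof(a)) == 0);
            return 0;
    }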