13 files changed, 517 insertions, 2 deletions
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 9b94c9edf23..5c98aeefc66 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1524,6 +1524,9 @@ typedef enum {
    NIR_INTRINSIC_SRC_ACCESS,
    NIR_INTRINSIC_DST_ACCESS,
 
+   /* Driver location for nir_load_primitive_location_ir3 */
+   NIR_INTRINSIC_DRIVER_LOCATION,
+
    NIR_INTRINSIC_NUM_INDEX_FLAGS,
 
 } nir_intrinsic_index_flag;
@@ -1632,6 +1635,7 @@ INTRINSIC_IDX_ACCESSORS(align_offset, ALIGN_OFFSET, unsigned)
 INTRINSIC_IDX_ACCESSORS(desc_type, DESC_TYPE, unsigned)
 INTRINSIC_IDX_ACCESSORS(type, TYPE, nir_alu_type)
 INTRINSIC_IDX_ACCESSORS(swizzle_mask, SWIZZLE_MASK, unsigned)
+INTRINSIC_IDX_ACCESSORS(driver_location, DRIVER_LOCATION, unsigned)
 
 static inline void
 nir_intrinsic_set_align(nir_intrinsic_instr *intrin,
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index ae62a85d39b..637576c092a 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -124,6 +124,8 @@ DESC_TYPE = "NIR_INTRINSIC_DESC_TYPE"
 TYPE = "NIR_INTRINSIC_TYPE"
 # The swizzle mask for quad_swizzle_amd & masked_swizzle_amd
 SWIZZLE_MASK = "NIR_INTRINSIC_SWIZZLE_MASK"
+# Driver location of attribute
+DRIVER_LOCATION = "NIR_INTRINSIC_DRIVER_LOCATION"
 
 #
 # Possible flags:
@@ -771,6 +773,12 @@ intrinsic("ssbo_atomic_xor_ir3",        src_comp=[1, 1, 1, 1],    dest_comp=1)
 intrinsic("ssbo_atomic_exchange_ir3",   src_comp=[1, 1, 1, 1],    dest_comp=1)
 intrinsic("ssbo_atomic_comp_swap_ir3",  src_comp=[1, 1, 1, 1, 1], dest_comp=1)
 
+# System values for freedreno geometry shaders.
+system_value("vs_primitive_stride_ir3", 1)
+system_value("vs_vertex_stride_ir3", 1)
+system_value("gs_header_ir3", 1)
+system_value("primitive_location_ir3", 1, indices=[DRIVER_LOCATION])
+
 # IR3-specific load/store intrinsics. These access a buffer used to pass data
 # between geometry stages - perhaps it's explicit access to the vertex cache.
 
diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c
index 48844b7ed79..496f9279676 100644
--- a/src/compiler/nir/nir_print.c
+++ b/src/compiler/nir/nir_print.c
@@ -800,6 +800,7 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
       [NIR_INTRINSIC_DESC_TYPE] = "desc_type",
       [NIR_INTRINSIC_TYPE] = "type",
       [NIR_INTRINSIC_SWIZZLE_MASK] = "swizzle_mask",
+      [NIR_INTRINSIC_DRIVER_LOCATION] = "driver_location",
    };
    for (unsigned idx = 1; idx < NIR_INTRINSIC_NUM_INDEX_FLAGS; idx++) {
       if (!info->index_map[idx])
diff --git a/src/compiler/shader_enums.c b/src/compiler/shader_enums.c
index 71796687afa..afaad50adf6 100644
--- a/src/compiler/shader_enums.c
+++ b/src/compiler/shader_enums.c
@@ -254,6 +254,7 @@ gl_system_value_name(gl_system_value sysval)
      ENUM(SYSTEM_VALUE_BARYCENTRIC_SAMPLE),
      ENUM(SYSTEM_VALUE_BARYCENTRIC_CENTROID),
      ENUM(SYSTEM_VALUE_BARYCENTRIC_SIZE),
+     ENUM(SYSTEM_VALUE_GS_HEADER_IR3),
    };
    STATIC_ASSERT(ARRAY_SIZE(names) == SYSTEM_VALUE_MAX);
    return NAME(sysval);
diff --git a/src/compiler/shader_enums.h b/src/compiler/shader_enums.h
index 0704719c229..f9b2b8c1d73 100644
--- a/src/compiler/shader_enums.h
+++ b/src/compiler/shader_enums.h
@@ -641,6 +641,13 @@ typedef enum
    SYSTEM_VALUE_BARYCENTRIC_CENTROID,
    SYSTEM_VALUE_BARYCENTRIC_SIZE,
 
+   /**
+    * IR3 specific geometry shader system value that packs invocation id,
+    * thread id and vertex id.  Having this as a nir level system value lets
+    * us do the unpacking in nir.
+    */
+   SYSTEM_VALUE_GS_HEADER_IR3,
+
    SYSTEM_VALUE_MAX             /**< Number of values */
 } gl_system_value;
 
diff --git a/src/freedreno/Makefile.sources b/src/freedreno/Makefile.sources
index cf3ac7bdba4..bb56869e1cc 100644
--- a/src/freedreno/Makefile.sources
+++ b/src/freedreno/Makefile.sources
@@ -38,6 +38,7 @@ ir3_SOURCES := \
 	ir3/ir3_nir_lower_load_barycentric_at_sample.c \
 	ir3/ir3_nir_lower_load_barycentric_at_offset.c \
 	ir3/ir3_nir_lower_io_offsets.c \
+	ir3/ir3_nir_lower_tess.c \
 	ir3/ir3_nir_lower_tg4_to_tex.c \
 	ir3/ir3_nir_move_varying_inputs.c \
 	ir3/ir3_print.c \
diff --git a/src/freedreno/ir3/ir3_context.h b/src/freedreno/ir3/ir3_context.h
index b0d3e98d00a..2a1f9071118 100644
--- a/src/freedreno/ir3/ir3_context.h
+++ b/src/freedreno/ir3/ir3_context.h
@@ -76,6 +76,10 @@ struct ir3_context {
 	/* For fragment shaders: */
 	struct ir3_instruction *samp_id, *samp_mask_in;
 
+	/* For geometry shaders: */
+	struct ir3_instruction *primitive_id;
+	struct ir3_instruction *gs_header;
+
 	/* Compute shader inputs: */
 	struct ir3_instruction *local_invocation_id, *work_group_id;
 
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 2f95b249c26..103821cd6b3 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -101,7 +101,8 @@ ir3_key_lowers_nir(const struct ir3_shader_key *key)
 	return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r |
 			key->vsaturate_s | key->vsaturate_t | key->vsaturate_r |
 			key->ucp_enables | key->color_two_side |
-			key->fclamp_color | key->vclamp_color;
+			key->fclamp_color | key->vclamp_color |
+			key->has_gs;
 }
 
 #define OPT(nir, pass, ...) ({                             \
@@ -186,6 +187,19 @@ ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
 			.lower_tg4_offsets = true,
 	};
 
+	if (key && key->has_gs) {
+		switch (shader->type) {
+		case MESA_SHADER_VERTEX:
+			NIR_PASS_V(s, ir3_nir_lower_vs_to_explicit_io, shader);
+			break;
+		case MESA_SHADER_GEOMETRY:
+			NIR_PASS_V(s, ir3_nir_lower_gs, shader);
+			break;
+		default:
+			break;
+		}
+	}
+
 	if (key) {
 		switch (shader->type) {
 		case MESA_SHADER_FRAGMENT:
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index a9b39e235b5..a602f40858b 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -41,6 +41,9 @@ bool ir3_nir_lower_load_barycentric_at_sample(nir_shader *shader);
 bool ir3_nir_lower_load_barycentric_at_offset(nir_shader *shader);
 bool ir3_nir_move_varying_inputs(nir_shader *shader);
 
+void ir3_nir_lower_vs_to_explicit_io(nir_shader *shader, struct ir3_shader *s);
+void ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s);
+
 const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
 bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
 void ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
diff --git a/src/freedreno/ir3/ir3_nir_lower_tess.c b/src/freedreno/ir3/ir3_nir_lower_tess.c
new file mode 100644
index 00000000000..b4982503f0d
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir_lower_tess.c
@@ -0,0 +1,455 @@
+/*
+ * Copyright © 2019 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ir3_nir.h"
+#include "ir3_compiler.h"
+#include "compiler/nir/nir_builder.h"
+
+struct state {	/* scratch state shared by the VS and GS lowering entry points in this file */
+	struct primitive_map {
+		unsigned loc[32];	/* per driver_location: running sum of the sizes that precede it */
+		unsigned size[32];	/* per driver_location: size recorded by build_primitive_map() */
+		unsigned stride;	/* sum of all recorded sizes */
+	} map;
+
+	nir_ssa_def *header;	/* result of nir_load_gs_header_ir3() emitted at the top of the entrypoint */
+
+	nir_variable *vertex_count_var;	/* incremented on every emit_vertex */
+	nir_variable *emitted_vertex_var;	/* incremented only inside the "count == local thread id" branch */
+	nir_variable *vertex_flags_var;	/* 4 initially and after end_primitive, reset to 0 after each emit_vertex */
+	nir_variable *vertex_flags_out;	/* ends up pointing at the temp backing the vertex_flags output */
+
+	nir_variable *output_vars[32];	/* function temps for the GS outputs, indexed by driver_location */
+};
+
+static nir_ssa_def *
+bitfield_extract(nir_builder *b, nir_ssa_def *v, uint32_t start, uint32_t mask)
+{
+	nir_ssa_def *shifted = nir_ushr(b, v, nir_imm_int(b, start));	/* v >> start */
+	return nir_iand(b, shifted, nir_imm_int(b, mask));	/* (v >> start) & mask */
+}
+
+static nir_ssa_def *
+build_invocation_id(nir_builder *b, struct state *state)
+{
+	return bitfield_extract(b, state->header, 11, 31);	/* 5-bit field starting at bit 11 of the header */
+}
+
+static nir_ssa_def *
+build_vertex_id(nir_builder *b, struct state *state)
+{
+	return bitfield_extract(b, state->header, 6, 31);	/* 5-bit field starting at bit 6 of the header */
+}
+
+static nir_ssa_def *
+build_local_primitive_id(nir_builder *b, struct state *state)
+{
+	return bitfield_extract(b, state->header, 0, 63);	/* low 6 bits of the header */
+}
+
+static nir_variable *
+get_var(struct exec_list *list, int driver_location)
+{
+	nir_foreach_variable(v, list) {	/* linear scan; the first matching variable wins */
+		if (v->data.driver_location == driver_location) {
+			return v;
+		}
+	}
+
+	return NULL;	/* no variable in the list uses this driver_location */
+}
+
+static nir_ssa_def *
+build_local_offset(nir_builder *b, struct state *state,
+		nir_ssa_def *vertex, uint32_t base, nir_ssa_def *offset)
+{	/* Builds the offset handed to the *_shared_ir3 load/store for attribute `base` of `vertex`. */
+	nir_ssa_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
+	nir_ssa_def *primitive_offset =
+		nir_imul(b, build_local_primitive_id(b, state), primitive_stride);	/* local primitive id * primitive stride */
+	nir_ssa_def *attr_offset;
+	nir_ssa_def *vertex_stride;
+
+	if (b->shader->info.stage == MESA_SHADER_VERTEX) {	/* VS: constants from the primitive map, scaled by 4 */
+		vertex_stride = nir_imm_int(b, state->map.stride * 4);
+		attr_offset = nir_imm_int(b, state->map.loc[base] * 4);
+	} else if (b->shader->info.stage == MESA_SHADER_GEOMETRY) {	/* GS: both values are loaded through intrinsics */
+		vertex_stride = nir_load_vs_vertex_stride_ir3(b);
+		attr_offset = nir_load_primitive_location_ir3(b, base);
+	} else {
+		unreachable("bad shader stage");
+	}
+
+	nir_ssa_def *vertex_offset = nir_imul(b, vertex, vertex_stride);
+
+	return nir_iadd(b, nir_iadd(b, primitive_offset, vertex_offset),
+			nir_iadd(b, attr_offset, offset));	/* primitive_offset + vertex_offset + attr_offset + offset */
+}
+
+static nir_intrinsic_instr *
+replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
+		nir_intrinsic_op op, nir_ssa_def *src0, nir_ssa_def *src1, nir_ssa_def *src2)
+{	/* Emits a new `op` intrinsic at the builder cursor, removes `intr`, and returns the new instruction. */
+	nir_intrinsic_instr *new_intr =
+		nir_intrinsic_instr_create(b->shader, op);
+
+	new_intr->src[0] = nir_src_for_ssa(src0);	/* src0 is always set; src1/src2 only when non-NULL */
+	if (src1)
+		new_intr->src[1] = nir_src_for_ssa(src1);
+	if (src2)
+		new_intr->src[2] = nir_src_for_ssa(src2);
+
+	new_intr->num_components = intr->num_components;
+
+	if (nir_intrinsic_infos[op].has_dest)
+		nir_ssa_dest_init(&new_intr->instr, &new_intr->dest,
+						  intr->num_components, 32, NULL);	/* destination bit size is fixed at 32 */
+
+	nir_builder_instr_insert(b, &new_intr->instr);
+
+	if (nir_intrinsic_infos[op].has_dest)	/* point users of the old result at the new one */
+		nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(&new_intr->dest.ssa));
+
+	nir_instr_remove(&intr->instr);
+
+	return new_intr;
+}
+
+static void
+build_primitive_map(nir_shader *shader, struct primitive_map *map, struct exec_list *list)
+{	/* Fills map->size/loc/stride from `list`; `shader` is unused; relies on *map being zeroed (both callers zero it). */
+	nir_foreach_variable(var, list) {
+		switch (var->data.location) {
+		case VARYING_SLOT_TESS_LEVEL_OUTER:
+		case VARYING_SLOT_TESS_LEVEL_INNER:
+			continue;	/* tess levels are left out of the map */
+		}
+
+		unsigned size = glsl_count_attribute_slots(var->type, false) * 4;	/* 4 units per attribute slot */
+
+		assert(var->data.driver_location < ARRAY_SIZE(map->size));
+		map->size[var->data.driver_location] =
+			MAX2(map->size[var->data.driver_location], size);	/* keep the largest variable sharing a location */
+	}
+
+	unsigned loc = 0;	/* running sum of the sizes seen so far */
+	for (uint32_t i = 0; i < ARRAY_SIZE(map->size); i++) {
+		if (map->size[i] == 0)
+				continue;
+		nir_variable *var = get_var(list, i);	/* non-NULL: a non-zero size means some variable has this driver_location */
+		map->loc[i] = loc;
+		loc += map->size[i];
+
+		if (var->data.patch)
+			map->size[i] = 0;
+		else
+			map->size[i] = map->size[i] / glsl_get_length(var->type);	/* NOTE(review): confirm glsl_get_length() cannot return 0 for the types reaching here */
+	}
+
+	map->stride = loc;	/* total of the sizes as accumulated before the per-location adjustment above */
+}
+
+static void
+lower_vs_block(nir_block *block, nir_builder *b, struct state *state)
+{	/* For every store_output in `block`, emits a store_shared_ir3 of the same value just before it. */
+	nir_foreach_instr_safe(instr, block) {
+		if (instr->type != nir_instr_type_intrinsic)
+			continue;
+
+		nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+		switch (intr->intrinsic) {
+		case nir_intrinsic_store_output: {
+			// src[] = { value, offset }.
+
+			b->cursor = nir_before_instr(&intr->instr);
+
+			nir_ssa_def *vertex_id = build_vertex_id(b, state);	/* vertex index is unpacked from the header */
+			nir_ssa_def *offset = build_local_offset(b, state, vertex_id, nir_intrinsic_base(intr),
+					intr->src[1].ssa);
+			nir_intrinsic_instr *store =
+				nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_shared_ir3);
+
+			nir_intrinsic_set_write_mask(store, MASK(intr->num_components));
+			store->src[0] = nir_src_for_ssa(intr->src[0].ssa);
+			store->src[1] = nir_src_for_ssa(offset);
+
+			store->num_components = intr->num_components;
+
+			nir_builder_instr_insert(b, &store->instr);	/* the original store_output is not removed here */
+			break;
+		}
+
+		default:
+			break;
+		}
+	}
+}
+
+static nir_ssa_def *
+local_thread_id(nir_builder *b)
+{
+	return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);	/* 10-bit field at bit 16; emits a fresh header load at the cursor */
+}
+
+void
+ir3_nir_lower_vs_to_explicit_io(nir_shader *shader, struct ir3_shader *s)
+{	/* Adds a store_shared_ir3 next to every VS store_output and records the output layout in `s`. */
+	struct state state = { };
+
+	build_primitive_map(shader, &state.map, &shader->outputs);
+	memcpy(s->output_loc, state.map.loc, sizeof(s->output_loc));	/* both arrays are unsigned[32] */
+
+	nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+	assert(impl);
+
+	nir_builder b;
+	nir_builder_init(&b, impl);
+	b.cursor = nir_before_cf_list(&impl->body);
+
+	state.header = nir_load_gs_header_ir3(&b);	/* loaded once, at the very top of the entrypoint */
+
+	nir_foreach_block_safe(block, impl)
+		lower_vs_block(block, &b, &state);
+
+	nir_metadata_preserve(impl, nir_metadata_block_index |
+			nir_metadata_dominance);
+
+	s->output_size = state.map.stride;	/* total size computed by build_primitive_map() */
+}
+
+static void
+lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
+{
+	nir_intrinsic_instr *outputs[32] = {};	/* pending store_output per base since the last emit_vertex in this block */
+
+	nir_foreach_instr_safe(instr, block) {
+		if (instr->type != nir_instr_type_intrinsic)
+			continue;
+
+		nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+		switch (intr->intrinsic) {
+		case nir_intrinsic_store_output: {
+			// src[] = { value, offset }.
+
+			uint32_t loc = nir_intrinsic_base(intr);
+			assert(loc < ARRAY_SIZE(outputs));	/* base indexes a fixed-size stack table */
+			outputs[loc] = intr;
+			break;
+		}
+
+		case nir_intrinsic_end_primitive: {
+			b->cursor = nir_before_instr(&intr->instr);
+			nir_store_var(b, state->vertex_flags_var, nir_imm_int(b, 4), 0x1);	/* same value the flags start with */
+			nir_instr_remove(&intr->instr);
+			break;
+		}
+
+		case nir_intrinsic_emit_vertex: {
+			/* Load the vertex count */
+			b->cursor = nir_before_instr(&intr->instr);
+			nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);
+
+			nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));	/* only the matching thread captures this vertex */
+
+			for (uint32_t i = 0; i < ARRAY_SIZE(outputs); i++) {
+				if (outputs[i]) {
+					nir_store_var(b, state->output_vars[i],
+							outputs[i]->src[0].ssa,
+							(1 << outputs[i]->num_components) - 1);
+
+					nir_instr_remove(&outputs[i]->instr);	/* the value now lives in the temp */
+				}
+				outputs[i] = NULL;
+			}
+
+			nir_instr_remove(&intr->instr);
+
+			nir_store_var(b, state->emitted_vertex_var,
+					nir_iadd(b, nir_load_var(b, state->emitted_vertex_var), nir_imm_int(b, 1)), 0x1);
+
+			nir_store_var(b, state->vertex_flags_out,
+					nir_load_var(b, state->vertex_flags_var), 0x1);
+
+			nir_pop_if(b, NULL);
+
+			/* Increment the vertex count by 1 */
+			nir_store_var(b, state->vertex_count_var,
+					nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */
+			nir_store_var(b, state->vertex_flags_var, nir_imm_int(b, 0), 0x1);
+
+			break;
+		}
+
+		case nir_intrinsic_load_per_vertex_input: {
+			// src[] = { vertex, offset }.
+
+			b->cursor = nir_before_instr(&intr->instr);
+
+			nir_ssa_def *offset = build_local_offset(b, state,
+					intr->src[0].ssa, // this is typically gl_InvocationID
+					nir_intrinsic_base(intr),
+					intr->src[1].ssa);
+
+			replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL, NULL);
+			break;
+		}
+
+		case nir_intrinsic_load_invocation_id: {
+			b->cursor = nir_before_instr(&intr->instr);
+
+			nir_ssa_def *iid = build_invocation_id(b, state);	/* unpacked from the header instead of loaded */
+			nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(iid));
+			nir_instr_remove(&intr->instr);
+			break;
+		}
+
+		default:
+			break;
+		}
+	}
+}
+
+static void
+emit_store_outputs(nir_builder *b, struct state *state)
+{
+	/* Write every GS output temp back through a store_output at the cursor;
+	 * this covers the internally added vertex_flags output as well. */
+	for (uint32_t slot = 0; slot < ARRAY_SIZE(state->output_vars); slot++) {
+		nir_variable *temp = state->output_vars[slot];
+		if (temp == NULL)
+			continue;
+		nir_ssa_def *value = nir_load_var(b, temp);
+		nir_ssa_def *zero_offset = nir_imm_int(b, 0);
+		nir_intrinsic_instr *store =
+			nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
+		store->num_components = value->num_components;
+		store->src[0] = nir_src_for_ssa(value);
+		store->src[1] = nir_src_for_ssa(zero_offset);
+		nir_intrinsic_set_base(store, slot);	/* base is the slot index */
+		nir_builder_instr_insert(b, &store->instr);
+	}
+}
+
+static void
+clean_up_split_vars(nir_shader *shader, struct exec_list *list)
+{	/* Drops variables that cover only part of the components used at their driver_location; `shader` is unused. */
+	uint32_t components[32] = {};	/* per driver_location: union of the component masks of all variables there */
+
+	nir_foreach_variable(var, list) {
+		uint32_t mask =
+			((1 << glsl_get_components(glsl_without_array(var->type))) - 1) << var->data.location_frac;
+		components[var->data.driver_location] |= mask;	/* NOTE(review): driver_location is not range-checked against 32 here */
+	}
+
+	nir_foreach_variable_safe(var, list) {
+		uint32_t mask =
+			((1 << glsl_get_components(glsl_without_array(var->type))) - 1) << var->data.location_frac;
+		bool subset =
+			(components[var->data.driver_location] | mask) != mask;	/* true when the union has bits this variable lacks */
+		if (subset)
+			exec_node_remove(&var->node);	/* NOTE(review): if no single variable covers the union, all of them are removed — confirm intended */
+	}
+}
+
+void
+ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s)
+{	/* Rewrites GS emit/end-primitive, per-vertex input and invocation-id intrinsics; `s` is unused here. */
+	struct state state = { };
+
+	if (shader_debug_enabled(shader->info.stage)) {
+		fprintf(stderr, "NIR (before gs lowering):\n");
+		nir_print_shader(shader, stderr);
+	}
+
+	clean_up_split_vars(shader, &shader->inputs);
+	clean_up_split_vars(shader, &shader->outputs);
+
+	build_primitive_map(shader, &state.map, &shader->inputs);
+
+	uint32_t loc = 0;	/* first driver_location past every existing output */
+	nir_foreach_variable(var, &shader->outputs) {
+		uint32_t end = var->data.driver_location + glsl_count_attribute_slots(var->type, false);
+		loc = MAX2(loc, end);
+	}
+	assert(loc < ARRAY_SIZE(state.output_vars));	/* loc (and every smaller driver_location) indexes output_vars[] below */
+
+	state.vertex_flags_out = nir_variable_create(shader, nir_var_shader_out,
+			glsl_uint_type(), "vertex_flags");
+	state.vertex_flags_out->data.driver_location = loc;
+	state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
+
+	nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+	assert(impl);
+
+	nir_builder b;
+	nir_builder_init(&b, impl);
+	b.cursor = nir_before_cf_list(&impl->body);
+	state.header = nir_load_gs_header_ir3(&b);	/* loaded once, at the very top of the entrypoint */
+
+	nir_foreach_variable(var, &shader->outputs) {
+		state.output_vars[var->data.driver_location] =
+			nir_local_variable_create(impl, var->type,
+					ralloc_asprintf(var, "%s:gs-temp", var->name));
+	}
+
+	state.vertex_count_var =
+		nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
+	state.emitted_vertex_var =
+		nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");
+	state.vertex_flags_var =
+		nir_local_variable_create(impl, glsl_uint_type(), "vertex_flags");
+	state.vertex_flags_out = state.output_vars[state.vertex_flags_out->data.driver_location];	/* switch from the output variable to its temp */
+
+	/* initialize to 0 */
+	b.cursor = nir_before_cf_list(&impl->body);
+	nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
+	nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
+	nir_store_var(&b, state.vertex_flags_var, nir_imm_int(&b, 4), 0x1);	/* flags start at 4, not 0 */
+
+	nir_foreach_block_safe(block, impl)
+		lower_gs_block(block, &b, &state);
+
+	set_foreach(impl->end_block->predecessors, block_entry) {	/* every block that leads to the end of the function */
+		struct nir_block *block = (void *)block_entry->key;
+		b.cursor = nir_after_block_before_jump(block);
+
+		nir_intrinsic_instr *discard_if =
+			nir_intrinsic_instr_create(b.shader, nir_intrinsic_discard_if);
+
+		nir_ssa_def *cond = nir_ieq(&b, nir_load_var(&b, state.emitted_vertex_var), nir_imm_int(&b, 0));	/* nothing was captured */
+
+		discard_if->src[0] = nir_src_for_ssa(cond);
+
+		nir_builder_instr_insert(&b, &discard_if->instr);
+
+		emit_store_outputs(&b, &state);
+	}
+
+	nir_metadata_preserve(impl, 0);
+
+	if (shader_debug_enabled(shader->info.stage)) {
+		fprintf(stderr, "NIR (after gs lowering):\n");
+		nir_print_shader(shader, stderr);
+	}
+}
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
index aae7baeb2e0..10980bd38be 100644
--- a/src/freedreno/ir3/ir3_shader.c
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -350,7 +350,14 @@ output_name(struct ir3_shader_variant *so, int i)
 	if (so->type == MESA_SHADER_FRAGMENT) {
 		return gl_frag_result_name(so->outputs[i].slot);
 	} else {
-		return gl_varying_slot_name(so->outputs[i].slot);
+		switch (so->outputs[i].slot) {
+		case VARYING_SLOT_GS_HEADER_IR3:
+			return "GS_HEADER";
+		case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
+			return "GS_VERTEX_FLAGS";
+		default:
+			return gl_varying_slot_name(so->outputs[i].slot);
+		}
 	}
 }
 
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index fa6d5b7d387..ce258865658 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -554,6 +554,11 @@ struct ir3_shader {
 
 	struct ir3_shader_variant *variants;
 	mtx_t variants_lock;
+
+	uint32_t output_size; /* Size in dwords of all outputs for VS, size of entire patch for HS. */
+
+	/* Map from driver_location to dword offset (same units as output_size) in per-primitive storage */
+	unsigned output_loc[32];
 };
 
 void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id);
@@ -693,6 +698,10 @@ ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
 	return regid(63, 0);
 }
 
+#define VARYING_SLOT_GS_HEADER_IR3			(VARYING_SLOT_MAX + 0)
+#define VARYING_SLOT_GS_VERTEX_FLAGS_IR3	(VARYING_SLOT_MAX + 1)
+
+
 static inline uint32_t
 ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
 {
diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build
index be03ffb88c3..6e1434057e7 100644
--- a/src/freedreno/ir3/meson.build
+++ b/src/freedreno/ir3/meson.build
@@ -66,6 +66,7 @@ libfreedreno_ir3_files = files(
   'ir3_nir_lower_load_barycentric_at_sample.c',
   'ir3_nir_lower_load_barycentric_at_offset.c',
   'ir3_nir_lower_io_offsets.c',
+  'ir3_nir_lower_tess.c',
   'ir3_nir_lower_tg4_to_tex.c',
   'ir3_nir_move_varying_inputs.c',
   'ir3_print.c',