5 files changed, 81 insertions, 11 deletions
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index 987c01a7919..2f94eaeda4f 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -280,6 +280,7 @@ llvm_fetch_gs_outputs(struct draw_geometry_shader *shader,
    int max_prims_per_invocation = 0;
    char *output_ptr = (char*)shader->gs_output;
    int i, j, prim_idx;
+   unsigned next_prim_boundary = shader->primitive_boundary;
 
    for (i = 0; i < shader->vector_length; ++i) {
       int prims = shader->llvm_emitted_primitives[i];
@@ -290,19 +291,42 @@ llvm_fetch_gs_outputs(struct draw_geometry_shader *shader,
       total_verts += shader->llvm_emitted_vertices[i];
    }
 
-
    output_ptr += shader->emitted_vertices * shader->vertex_size;
    for (i = 0; i < shader->vector_length - 1; ++i) {
       int current_verts = shader->llvm_emitted_vertices[i];
-
-      if (current_verts != shader->max_output_vertices) {
-         memcpy(output_ptr + (vertex_count + current_verts) * shader->vertex_size,
-                output_ptr + (vertex_count + shader->max_output_vertices) * shader->vertex_size,
-                shader->vertex_size * (total_verts - vertex_count));
+      int next_verts = shader->llvm_emitted_vertices[i + 1];
+#if 0
+      int j; 
+      for (j = 0; j < current_verts; ++j) {
+         struct vertex_header *vh = (struct vertex_header *)
+            (output_ptr + shader->vertex_size * (i * next_prim_boundary + j));
+         debug_printf("--- %d) [%f, %f, %f, %f]\n", j + vertex_count,
+                      vh->data[0][0], vh->data[0][1], vh->data[0][2], vh->data[0][3]);
+         
+      }
+#endif
+      debug_assert(current_verts <= shader->max_output_vertices);
+      debug_assert(next_verts <= shader->max_output_vertices);
+      if (next_verts) {
+         memmove(output_ptr + (vertex_count + current_verts) * shader->vertex_size,
+                 output_ptr + ((i + 1) * next_prim_boundary) * shader->vertex_size,
+                 shader->vertex_size * next_verts);
       }
       vertex_count += current_verts;
    }
 
+#if 0
+   {
+      int i;
+      for (i = 0; i < total_verts; ++i) {
+         struct vertex_header *vh = (struct vertex_header *)(output_ptr + shader->vertex_size * i);
+         debug_printf("%d) [%f, %f, %f, %f]\n", i,
+                      vh->data[0][0], vh->data[0][1], vh->data[0][2], vh->data[0][3]);
+         
+      }
+   }
+#endif
+
    prim_idx = 0;
    for (i = 0; i < shader->vector_length; ++i) {
       int num_prims = shader->llvm_emitted_primitives[i];
@@ -513,10 +537,12 @@ int draw_geometry_shader_run(struct draw_geometry_shader *shader,
 
    output_verts->vertex_size = vertex_size;
    output_verts->stride = output_verts->vertex_size;
+   /* We allocate exactly one extra vertex slot per primitive so that the GS
+    * can emit overflowing vertices into an area where they do no harm. */
    output_verts->verts =
       (struct vertex_header *)MALLOC(output_verts->vertex_size *
                                      max_out_prims *
-                                     shader->max_output_vertices);
+                                     shader->primitive_boundary);
 
 #if 0
    debug_printf("%s count = %d (in prims # = %d)\n",
@@ -724,6 +750,16 @@ draw_create_geometry_shader(struct draw_context *draw,
                TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES)
          gs->max_output_vertices = gs->info.properties[i].data[0];
    }
+   /* Primitive boundary is bigger than max_output_vertices by one, because
+    * the specification says that the geometry shader should exit if the
+    * number of emitted vertices is greater than or equal to
+    * max_output_vertices, and we can't do that because we're running in SoA
+    * mode, which means that our storing routines will keep getting called
+    * on channels that have overflowed.
+    * So we need some scratch area where we can keep writing the overflowing
+    * vertices without overwriting anything important or crashing.
+    */
+   gs->primitive_boundary = gs->max_output_vertices + 1;
 
    for (i = 0; i < gs->info.num_outputs; i++) {
       if (gs->info.output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
diff --git a/src/gallium/auxiliary/draw/draw_gs.h b/src/gallium/auxiliary/draw/draw_gs.h
index 7c841396aa2..ca744cebfc9 100644
--- a/src/gallium/auxiliary/draw/draw_gs.h
+++ b/src/gallium/auxiliary/draw/draw_gs.h
@@ -68,6 +68,7 @@ struct draw_geometry_shader {
    unsigned position_output;
 
    unsigned max_output_vertices;
+   unsigned primitive_boundary;
    unsigned input_primitive;
    unsigned output_primitive;
 
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 0e349783198..33fe40d5e7f 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1287,8 +1287,8 @@ draw_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base,
    LLVMValueRef clipmask = lp_build_const_int_vec(gallivm,
                                                   lp_int_type(gs_type), 0);
    LLVMValueRef indices[LP_MAX_VECTOR_LENGTH];
-   LLVMValueRef max_output_vertices =
-      lp_build_const_int32(gallivm, variant->shader->base.max_output_vertices);
+   LLVMValueRef next_prim_offset =
+      lp_build_const_int32(gallivm, variant->shader->base.primitive_boundary);
    LLVMValueRef io = variant->io_ptr;
    unsigned i;
    const struct tgsi_shader_info *gs_info = &variant->shader->base.info;
@@ -1297,7 +1297,7 @@ draw_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base,
       LLVMValueRef ind = lp_build_const_int32(gallivm, i);
       LLVMValueRef currently_emitted =
          LLVMBuildExtractElement(builder, emitted_vertices_vec, ind, "");
-      indices[i] = LLVMBuildMul(builder, ind, max_output_vertices, "");
+      indices[i] = LLVMBuildMul(builder, ind, next_prim_offset, "");
       indices[i] = LLVMBuildAdd(builder, indices[i], currently_emitted, "");
    }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index f1b1d79eab7..0fbb8aabbb0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -396,6 +396,7 @@ struct lp_build_tgsi_soa_context
    LLVMValueRef emitted_prims_vec_ptr;
    LLVMValueRef total_emitted_vertices_vec_ptr;
    LLVMValueRef emitted_vertices_vec_ptr;
+   LLVMValueRef max_output_vertices_vec;
 
    LLVMValueRef consts_ptr;
    const LLVMValueRef *pos;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 9822f72c6a3..28eb57b4b51 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -828,7 +828,6 @@ emit_fetch_gs_input(
       vertex_index = lp_build_const_int32(gallivm, reg->Dimension.Index);
    }
 
-
    res = bld->gs_iface->fetch_input(bld->gs_iface, bld_base,
                                     vertex_index, attrib_index,
                                     swizzle_index);
@@ -2257,6 +2256,20 @@ clear_uint_vec_ptr_from_mask(struct lp_build_tgsi_context * bld_base,
    LLVMBuildStore(builder, current_vec, ptr);
 }
 
+static LLVMValueRef
+clamp_mask_to_max_output_vertices(struct lp_build_tgsi_soa_context * bld,
+                                  LLVMValueRef current_mask_vec,
+                                  LLVMValueRef total_emitted_vertices_vec)
+{  /* Restricts current_mask_vec by a (total emitted < max_output_vertices) test. */
+   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+   struct lp_build_context *uint_bld = &bld->bld_base.uint_bld;
+   LLVMValueRef max_mask = lp_build_cmp(uint_bld, PIPE_FUNC_LESS,
+                                        total_emitted_vertices_vec,
+                                        bld->max_output_vertices_vec);
+   /* NOTE(review): presumably the compare yields a per-element mask -- confirm. */
+   return LLVMBuildAnd(builder, current_mask_vec, max_mask, "");
+}
+
 static void
 emit_vertex(
    const struct lp_build_tgsi_action * action,
@@ -2270,6 +2283,8 @@ emit_vertex(
       LLVMValueRef masked_ones = mask_to_one_vec(bld_base);
       LLVMValueRef total_emitted_vertices_vec =
          LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, "");
+      masked_ones = clamp_mask_to_max_output_vertices(bld, masked_ones,
+                                                      total_emitted_vertices_vec);
       gather_outputs(bld);
       bld->gs_iface->emit_vertex(bld->gs_iface, &bld->bld_base,
                                  bld->outputs,
@@ -2812,12 +2827,29 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    bld.bld_base.op_actions[TGSI_OPCODE_SVIEWINFO].emit = sviewinfo_emit;
 
    if (gs_iface) {
+      /* There is no canonical default for this because it should always
+       * be set. Apps using ext_geometry_shader4 quite often forgot to set
+       * it, though, so we fall back to MAX_VERTEX_VARYING from that spec.
+       * We could debug_assert when it is not set instead, but that would
+       * be a lot uglier. */
+      uint max_output_vertices = 32;
+      uint i = 0;
       /* inputs are always indirect with gs */
       bld.indirect_files |= (1 << TGSI_FILE_INPUT);
       bld.gs_iface = gs_iface;
       bld.bld_base.emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_gs_input;
       bld.bld_base.op_actions[TGSI_OPCODE_EMIT].emit = emit_vertex;
       bld.bld_base.op_actions[TGSI_OPCODE_ENDPRIM].emit = end_primitive;
+
+      for (i = 0; i < info->num_properties; ++i) {
+         if (info->properties[i].name ==
+             TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
+            max_output_vertices = info->properties[i].data[0];
+         }
+      }
+      bld.max_output_vertices_vec =
+         lp_build_const_int_vec(gallivm, bld.bld_base.uint_bld.type,
+                                max_output_vertices);
    }
 
    lp_exec_mask_init(&bld.exec_mask, &bld.bld_base.base);