15 files changed, 417 insertions, 10 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index cd6a8f48b5a..e50f9c3f95f 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -93,6 +93,7 @@ i965_C_SOURCES := \
 	gen6_sampler_state.c \
 	gen6_scissor_state.c \
 	gen6_sf_state.c \
+        gen6_sol.c \
 	gen6_urb.c \
 	gen6_viewport_state.c \
 	gen6_vs_state.c \
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 5e9cb1f8b2f..d8cad54667a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -178,6 +178,26 @@ brwCreateContext(int api,
    
    ctx->Const.MaxTextureMaxAnisotropy = 16.0;
 
+   /* Hardware only supports a limited number of transform feedback buffers.
+    * So we need to override the Mesa default (which is based only on software
+    * limits).
+    */
+   ctx->Const.MaxTransformFeedbackSeparateAttribs = BRW_MAX_SOL_BUFFERS;
+
+   /* On Gen6, in the worst case, we use up one binding table entry per
+    * transform feedback component (see comments above the definition of
+    * BRW_MAX_SOL_BINDINGS, in brw_context.h), so we need to advertise a value
+    * for MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS equal to
+    * BRW_MAX_SOL_BINDINGS.
+    *
+    * In "separate components" mode, we need to divide this value by
+    * BRW_MAX_SOL_BUFFERS, so that the total number of binding table entries
+    * used up by all buffers will not exceed BRW_MAX_SOL_BINDINGS.
+    */
+   ctx->Const.MaxTransformFeedbackInterleavedComponents = BRW_MAX_SOL_BINDINGS;
+   ctx->Const.MaxTransformFeedbackSeparateComponents =
+      BRW_MAX_SOL_BINDINGS / BRW_MAX_SOL_BUFFERS;
+
    /* if conformance mode is set, swrast can handle any size AA point */
    ctx->Const.MaxPointSizeAA = 255.0;
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 70a45c77260..febd4fe4365 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -368,6 +368,12 @@ struct brw_clip_prog_data {
 struct brw_gs_prog_data {
    GLuint urb_read_length;
    GLuint total_grf;
+
+   /**
+    * Gen6 transform feedback: Amount by which the streaming vertex buffer
+    * indices should be incremented each time the GS is invoked.
+    */
+   unsigned svbi_postincrement_value;
 };
 
 struct brw_vs_prog_data {
@@ -407,6 +413,34 @@ struct brw_vs_ouput_sizes {
 #define BRW_MAX_DRAW_BUFFERS 8
 
 /**
+ * Max number of binding table entries used for stream output.
+ *
+ * From the OpenGL 3.0 spec, table 6.44 (Transform Feedback State), the
+ * minimum value of MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS is 64.
+ *
+ * On Gen6, the size of transform feedback data is limited not by the number
+ * of components but by the number of binding table entries we set aside.  We
+ * use one binding table entry for a float, one entry for a vector, and one
+ * entry per matrix column.  Since the only way we can communicate our
+ * transform feedback capabilities to the client is via
+ * MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS, we need to plan for the
+ * worst case, in which all the varyings are floats, so we use up one binding
+ * table entry per component.  Therefore we need to set aside at least 64
+ * binding table entries for use by transform feedback.
+ *
+ * Note: since we don't currently pack varyings, it is currently impossible
+ * for the client to actually use up all of these binding table entries--if
+ * all of their varyings were floats, they would run out of varying slots and
+ * fail to link.  But that's a bug, so it seems prudent to go ahead and
+ * allocate the number of binding table entries we will need once the bug is
+ * fixed.
+ */
+#define BRW_MAX_SOL_BINDINGS 64
+
+/** Maximum number of actual buffers used for stream output */
+#define BRW_MAX_SOL_BUFFERS 4
+
+/**
  * Helpers to create Surface Binding Table indexes for draw buffers,
  * textures, and constant buffers.
  *
@@ -436,6 +470,11 @@ struct brw_vs_ouput_sizes {
  *    |   . |     .                   |
  *    |   : |     :                   |
  *    |  25 | Texture 15              |
+ *    +-----|-------------------------+
+ *    |  26 | SOL Binding 0           |
+ *    |   . |     .                   |
+ *    |   : |     :                   |
+ *    |  89 | SOL Binding 63          |
  *    +-------------------------------+
  *
  * Note that nothing actually uses the SURF_INDEX_DRAW macro, so it has to be
@@ -446,9 +485,10 @@ struct brw_vs_ouput_sizes {
 #define SURF_INDEX_VERT_CONST_BUFFER (BRW_MAX_DRAW_BUFFERS + 0)
 #define SURF_INDEX_FRAG_CONST_BUFFER (BRW_MAX_DRAW_BUFFERS + 1)
 #define SURF_INDEX_TEXTURE(t)        (BRW_MAX_DRAW_BUFFERS + 2 + (t))
+#define SURF_INDEX_SOL_BINDING(t)    (SURF_INDEX_TEXTURE(BRW_MAX_TEX_UNIT) + (t))
 
 /** Maximum size of the binding table. */
-#define BRW_MAX_SURFACES (BRW_MAX_DRAW_BUFFERS + BRW_MAX_TEX_UNIT + 2)
+#define BRW_MAX_SURFACES SURF_INDEX_SOL_BINDING(BRW_MAX_SOL_BINDINGS)
 
 enum brw_cache_id {
    BRW_BLEND_STATE,
@@ -1026,6 +1066,11 @@ brw_compute_barycentric_interp_modes(bool shade_model_flat,
 
 /* brw_wm_surface_state.c */
 void brw_init_surface_formats(struct brw_context *brw);
+void
+brw_update_sol_surface(struct brw_context *brw,
+                       struct gl_buffer_object *buffer_obj,
+                       uint32_t *out_offset, unsigned num_vector_components,
+                       unsigned stride_dwords, unsigned offset_dwords);
 
 /* gen6_clip_state.c */
 bool
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 18546023531..4edfaf7d5e4 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1424,6 +1424,12 @@ enum brw_wm_barycentric_interp_mode {
 #define URB_WRITE_PRIM_START		0x2
 #define URB_WRITE_PRIM_TYPE_SHIFT	2
 
+
+/* Maximum number of entries that can be addressed using a binding table
+ * pointer of type SURFTYPE_BUFFER
+ */
+#define BRW_MAX_NUM_BUFFER_ENTRIES	(1 << 27)
+
 #include "intel_chipset.h"
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 596be02158c..1529ec622a7 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -912,6 +912,13 @@ void brw_ff_sync(struct brw_compile *p,
 		   GLuint response_length,
 		   bool eot);
 
+void brw_svb_write(struct brw_compile *p,
+                   struct brw_reg dest,
+                   GLuint msg_reg_nr,
+                   struct brw_reg src0,
+                   GLuint binding_table_index,
+                   bool   send_commit_msg);
+
 void brw_fb_WRITE(struct brw_compile *p,
 		  int dispatch_width,
 		   GLuint msg_reg_nr,
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index d48753c546f..f6726fcfca5 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -2390,3 +2390,42 @@ void brw_ff_sync(struct brw_compile *p,
 			   response_length,
 			   eot);
 }
+
+/**
+ * Emit the SEND instruction necessary to generate stream output data on Gen6
+ * (for transform feedback).
+ *
+ * If send_commit_msg is true, this is the last piece of stream output data
+ * from this thread, so send the data as a committed write.  According to the
+ * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
+ *
+ *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
+ *   writes are complete by sending the final write as a committed write."
+ */
+void
+brw_svb_write(struct brw_compile *p,
+              struct brw_reg dest,
+              GLuint msg_reg_nr,
+              struct brw_reg src0,
+              GLuint binding_table_index,
+              bool   send_commit_msg)
+{
+   struct brw_instruction *insn;
+
+   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
+
+   insn = next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src0);
+   brw_set_src1(p, insn, brw_imm_d(0));
+   brw_set_dp_write_message(p, insn,
+                            binding_table_index,
+                            0, /* msg_control: ignored */
+                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
+                            1, /* msg_length */
+                            true, /* header_present */
+                            0, /* last_render_target: ignored */
+                            send_commit_msg, /* response_length */
+                            0, /* end_of_thread */
+                            send_commit_msg); /* send_commit_msg */
+}
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index f5d5898e04b..1e605efd6e4 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -183,7 +183,31 @@ static void populate_key( struct brw_context *brw,
    } else if (intel->gen == 6) {
       /* On Gen6, GS is used for transform feedback. */
       /* _NEW_TRANSFORM_FEEDBACK */
-      key->need_gs_prog = ctx->TransformFeedback.CurrentObject->Active;
+      if (ctx->TransformFeedback.CurrentObject->Active) {
+         const struct gl_shader_program *shaderprog =
+            ctx->Shader.CurrentVertexProgram;
+         const struct gl_transform_feedback_info *linked_xfb_info =
+            &shaderprog->LinkedTransformFeedback;
+         int i;
+
+         /* Make sure that the VUE slots won't overflow the unsigned chars in
+          * key->transform_feedback_bindings[].
+          */
+         STATIC_ASSERT(BRW_VERT_RESULT_MAX <= 256);
+
+         /* Make sure that we don't need more binding table entries than we've
+          * set aside for use in transform feedback.  (We shouldn't, since we
+          * set aside enough binding table entries to have one per component).
+          */
+         assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
+
+         key->need_gs_prog = true;
+         key->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
+         for (i = 0; i < key->num_transform_feedback_bindings; ++i) {
+            key->transform_feedback_bindings[i] =
+               linked_xfb_info->Outputs[i].OutputRegister;
+         }
+      }
    } else {
       /* Pre-gen6, GS is used to transform QUADLIST, QUADSTRIP, and LINELOOP
        * into simpler primitives.
diff --git a/src/mesa/drivers/dri/i965/brw_gs.h b/src/mesa/drivers/dri/i965/brw_gs.h
index ecab3ef37fa..33d8d7ab5a7 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.h
+++ b/src/mesa/drivers/dri/i965/brw_gs.h
@@ -50,6 +50,18 @@ struct brw_gs_prog_key {
    GLuint pv_first:1;
    GLuint need_gs_prog:1;
    GLuint userclip_active:1;
+
+   /**
+    * Number of varyings that are output to transform feedback.
+    */
+   GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */
+
+   /**
+    * Map from the index of a transform feedback binding table entry to the
+    * gl_vert_result that should be streamed out through that binding table
+    * entry.
+    */
+   unsigned char transform_feedback_bindings[BRW_MAX_SOL_BINDINGS];
 };
 
 struct brw_gs_compile {
@@ -59,6 +71,14 @@ struct brw_gs_compile {
    
    struct {
       struct brw_reg R0;
+
+      /**
+       * Register holding streamed vertex buffer pointers -- see the Sandy
+       * Bridge PRM, volume 2 part 1, section 4.4.2 (GS Thread Payload
+       * [DevSNB]).  These pointers are delivered in GRF 1.
+       */
+      struct brw_reg SVBI;
+
       struct brw_reg vertex[MAX_GS_VERTS];
       struct brw_reg header;
       struct brw_reg temp;
diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c
index 322f9bd81c1..3062c3312b2 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c
@@ -42,8 +42,16 @@
 #include "brw_eu.h"
 #include "brw_gs.h"
 
+/**
+ * Allocate registers for GS.
+ *
+ * If svbi_payload_enable is true, then the thread will be spawned with the
+ * "SVBI Payload Enable" bit set, so GRF 1 needs to be set aside to hold the
+ * streamed vertex buffer indices.
+ */
 static void brw_gs_alloc_regs( struct brw_gs_compile *c,
-			       GLuint nr_verts )
+			       GLuint nr_verts,
+                               bool svbi_payload_enable )
 {
    GLuint i = 0,j;
 
@@ -51,6 +59,10 @@ static void brw_gs_alloc_regs( struct brw_gs_compile *c,
     */
    c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
 
+   /* Streamed vertex buffer indices */
+   if (svbi_payload_enable)
+      c->reg.SVBI = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
+
    /* Payload vertices plus space for more generated vertices:
     */
    for (j = 0; j < nr_verts; j++) {
@@ -212,7 +224,7 @@ void brw_gs_quads( struct brw_gs_compile *c, struct brw_gs_prog_key *key )
 {
    struct intel_context *intel = &c->func.brw->intel;
 
-   brw_gs_alloc_regs(c, 4);
+   brw_gs_alloc_regs(c, 4, false);
    brw_gs_initialize_header(c);
    /* Use polygons for correct edgeflag behaviour. Note that vertex 3
     * is the PV for quads, but vertex 0 for polygons:
@@ -250,7 +262,7 @@ void brw_gs_quad_strip( struct brw_gs_compile *c, struct brw_gs_prog_key *key )
 {
    struct intel_context *intel = &c->func.brw->intel;
 
-   brw_gs_alloc_regs(c, 4);
+   brw_gs_alloc_regs(c, 4, false);
    brw_gs_initialize_header(c);
    
    if (intel->needs_ff_sync)
@@ -286,7 +298,7 @@ void brw_gs_lines( struct brw_gs_compile *c )
 {
    struct intel_context *intel = &c->func.brw->intel;
 
-   brw_gs_alloc_regs(c, 2);
+   brw_gs_alloc_regs(c, 2, false);
    brw_gs_initialize_header(c);
 
    if (intel->needs_ff_sync)
@@ -310,10 +322,81 @@ gen6_sol_program(struct brw_gs_compile *c, struct brw_gs_prog_key *key,
 	         unsigned num_verts, bool check_edge_flags)
 {
    struct brw_compile *p = &c->func;
+   c->prog_data.svbi_postincrement_value = num_verts;
 
-   brw_gs_alloc_regs(c, num_verts);
+   brw_gs_alloc_regs(c, num_verts, true);
    brw_gs_initialize_header(c);
 
+   if (key->num_transform_feedback_bindings > 0) {
+      unsigned vertex, binding;
+      /* Note: since we use the binding table to keep track of buffer offsets
+       * and stride, the GS doesn't need to keep track of a separate pointer
+       * into each buffer; it uses a single pointer which increments by 1 for
+       * each vertex.  So we use SVBI0 for this pointer, regardless of whether
+       * transform feedback is in interleaved or separate attribs mode.
+       */
+      brw_MOV(p, get_element_ud(c->reg.header, 5),
+              get_element_ud(c->reg.SVBI, 0));
+      /* For each vertex, generate code to output each varying using the
+       * appropriate binding table entry.
+       */
+      for (vertex = 0; vertex < num_verts; ++vertex) {
+         for (binding = 0; binding < key->num_transform_feedback_bindings;
+              ++binding) {
+            unsigned char vert_result =
+               key->transform_feedback_bindings[binding];
+            unsigned char slot = c->vue_map.vert_result_to_slot[vert_result];
+            /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
+             *
+             *   "Prior to End of Thread with a URB_WRITE, the kernel must
+             *   ensure that all writes are complete by sending the final
+             *   write as a committed write."
+             */
+            bool final_write =
+               binding == key->num_transform_feedback_bindings - 1 &&
+               vertex == num_verts - 1;
+            struct brw_reg vertex_slot = c->reg.vertex[vertex];
+            vertex_slot.nr += slot / 2;
+            vertex_slot.subnr = (slot % 2) * 16;
+            brw_MOV(p, stride(c->reg.header, 4, 4, 1),
+                    retype(vertex_slot, BRW_REGISTER_TYPE_UD));
+            brw_svb_write(p,
+                          final_write ? c->reg.temp : brw_null_reg(), /* dest */
+                          1, /* msg_reg_nr */
+                          c->reg.header, /* src0 */
+                          SURF_INDEX_SOL_BINDING(binding), /* binding_table_index */
+                          final_write); /* send_commit_msg */
+         }
+
+         /* If there are more vertices to output, increment the pointer so
+          * that we will start outputting to the next location in the
+          * transform feedback buffers.
+          */
+         if (vertex != num_verts - 1) {
+            brw_ADD(p, get_element_ud(c->reg.header, 5),
+                    get_element_ud(c->reg.header, 5), brw_imm_ud(1));
+         }
+      }
+
+      /* Now, reinitialize the header register from R0 to restore the parts of
+       * the register that we overwrote while streaming out transform feedback
+       * data.
+       */
+      brw_gs_initialize_header(c);
+
+      /* Finally, wait for the write commit to occur so that we can proceed to
+       * other things safely.
+       *
+       * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
+       *
+       *   The write commit does not modify the destination register, but
+       *   merely clears the dependency associated with the destination
+       *   register. Thus, a simple “mov” instruction using the register as a
+       *   source is sufficient to wait for the write commit to occur.
+       */
+      brw_MOV(p, c->reg.temp, c->reg.temp);
+   }
+
    brw_gs_ff_sync(c, 1);
 
    brw_gs_overwrite_header_dw2_from_r0(c);
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index e76901a3136..7b1398134bf 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -116,7 +116,7 @@ static void upload_gen6_binding_table_pointers(struct brw_context *brw)
 	     GEN6_BINDING_TABLE_MODIFY_PS |
 	     (4 - 2));
    OUT_BATCH(brw->bind.bo_offset); /* vs */
-   OUT_BATCH(0); /* gs */
+   OUT_BATCH(brw->bind.bo_offset); /* gs */
    OUT_BATCH(brw->bind.bo_offset); /* wm/ps */
    ADVANCE_BATCH();
 }
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 59fe81aec26..a3a470fee6b 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -92,6 +92,7 @@ extern const struct brw_tracked_state gen6_gs_state;
 extern const struct brw_tracked_state gen6_renderbuffer_surfaces;
 extern const struct brw_tracked_state gen6_sampler_state;
 extern const struct brw_tracked_state gen6_scissor_state;
+extern const struct brw_tracked_state gen6_sol_surface;
 extern const struct brw_tracked_state gen6_sf_state;
 extern const struct brw_tracked_state gen6_sf_vp;
 extern const struct brw_tracked_state gen6_urb;
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index bd32815d08c..463689224df 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -145,6 +145,7 @@ static const struct brw_tracked_state *gen6_atoms[] =
    &brw_wm_pull_constants,
    &gen6_renderbuffer_surfaces,
    &brw_texture_surfaces,
+   &gen6_sol_surface,
    &brw_binding_table,
 
    &brw_samplers,
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 7a959522a09..3801c096dda 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -38,6 +38,7 @@
 #include "intel_batchbuffer.h"
 #include "intel_tex.h"
 #include "intel_fbo.h"
+#include "intel_buffer_objects.h"
 
 #include "brw_context.h"
 #include "brw_state.h"
@@ -715,6 +716,90 @@ brw_create_constant_surface(struct brw_context *brw,
 			   I915_GEM_DOMAIN_SAMPLER, 0);
 }
 
+/**
+ * Set up a binding table entry for use by stream output logic (transform
+ * feedback).
+ *
+ * buffer_size_minus_1 must me less than BRW_MAX_NUM_BUFFER_ENTRIES.
+ */
+void
+brw_update_sol_surface(struct brw_context *brw,
+                       struct gl_buffer_object *buffer_obj,
+                       uint32_t *out_offset, unsigned num_vector_components,
+                       unsigned stride_dwords, unsigned offset_dwords)
+{
+   drm_intel_bo *bo = intel_buffer_object(buffer_obj)->buffer;
+   uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 6 * 4, 32,
+                                    out_offset);
+   uint32_t pitch_minus_1 = 4*stride_dwords - 1;
+   uint32_t offset_bytes = 4 * offset_dwords;
+   size_t size_dwords = buffer_obj->Size / 4;
+   uint32_t buffer_size_minus_1, width, height, depth, surface_format;
+
+   /* FIXME: can we rely on core Mesa to ensure that the buffer isn't
+    * too big to map using a single binding table entry?
+    */
+   assert((size_dwords - offset_dwords) / stride_dwords
+          <= BRW_MAX_NUM_BUFFER_ENTRIES);
+
+   if (size_dwords > offset_dwords + num_vector_components) {
+      /* There is room for at least 1 transform feedback output in the buffer.
+       * Compute the number of additional transform feedback outputs the
+       * buffer has room for.
+       */
+      buffer_size_minus_1 =
+         (size_dwords - offset_dwords - num_vector_components) / stride_dwords;
+   } else {
+      /* There isn't even room for a single transform feedback output in the
+       * buffer.  We can't configure the binding table entry to prevent output
+       * entirely; we'll have to rely on the geometry shader to detect
+       * overflow.  But to minimize the damage in case of a bug, set up the
+       * binding table entry to just allow a single output.
+       */
+      buffer_size_minus_1 = 0;
+   }
+   width = buffer_size_minus_1 & 0x7f;
+   height = (buffer_size_minus_1 & 0xfff80) >> 7;
+   depth = (buffer_size_minus_1 & 0x7f00000) >> 20;
+
+   switch (num_vector_components) {
+   case 1:
+      surface_format = BRW_SURFACEFORMAT_R32_FLOAT;
+      break;
+   case 2:
+      surface_format = BRW_SURFACEFORMAT_R32G32_FLOAT;
+      break;
+   case 3:
+      surface_format = BRW_SURFACEFORMAT_R32G32B32_FLOAT;
+      break;
+   case 4:
+      surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+      break;
+   default:
+      assert(!"Invalid vector size for transform feedback output");
+      surface_format = BRW_SURFACEFORMAT_R32_FLOAT;
+      break;
+   }
+
+   surf[0] = BRW_SURFACE_BUFFER << BRW_SURFACE_TYPE_SHIFT |
+      BRW_SURFACE_MIPMAPLAYOUT_BELOW << BRW_SURFACE_MIPLAYOUT_SHIFT |
+      surface_format << BRW_SURFACE_FORMAT_SHIFT |
+      BRW_SURFACE_RC_READ_WRITE;
+   surf[1] = bo->offset + offset_bytes; /* reloc */
+   surf[2] = (width << BRW_SURFACE_WIDTH_SHIFT |
+	      height << BRW_SURFACE_HEIGHT_SHIFT);
+   surf[3] = (depth << BRW_SURFACE_DEPTH_SHIFT |
+              pitch_minus_1 << BRW_SURFACE_PITCH_SHIFT);
+   surf[4] = 0;
+   surf[5] = 0;
+
+   /* Emit relocation to surface contents. */
+   drm_intel_bo_emit_reloc(brw->intel.batch.bo,
+			   *out_offset + 4,
+			   bo, offset_bytes,
+			   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
+}
+
 /* Creates a new WM constant buffer reflecting the current fragment program's
  * constants, if needed by the fragment program.
  *
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c
index 42962a64d36..fdad5d42dcd 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c
@@ -50,13 +50,17 @@ upload_gs_state(struct brw_context *brw)
       OUT_BATCH(brw->gs.prog_offset);
       OUT_BATCH(GEN6_GS_SPF_MODE | GEN6_GS_VECTOR_MASK_ENABLE);
       OUT_BATCH(0); /* no scratch space */
-      OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
+      OUT_BATCH((2 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
 	        (brw->gs.prog_data->urb_read_length << GEN6_GS_URB_READ_LENGTH_SHIFT));
       OUT_BATCH(((brw->max_gs_threads - 1) << GEN6_GS_MAX_THREADS_SHIFT) |
 	        GEN6_GS_STATISTICS_ENABLE |
 		GEN6_GS_SO_STATISTICS_ENABLE |
 		GEN6_GS_RENDERING_ENABLE);
-      OUT_BATCH(GEN6_GS_ENABLE);
+      OUT_BATCH(GEN6_GS_SVBI_PAYLOAD_ENABLE |
+                GEN6_GS_SVBI_POSTINCREMENT_ENABLE |
+                (brw->gs.prog_data->svbi_postincrement_value <<
+                 GEN6_GS_SVBI_POSTINCREMENT_VALUE_SHIFT) |
+                GEN6_GS_ENABLE);
       ADVANCE_BATCH();
    } else {
       BEGIN_BATCH(7);
diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
new file mode 100644
index 00000000000..491b39cce12
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/gen6_sol.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** \file gen6_sol.c
+ *
+ * Code to initialize the binding table entries used by transform feedback.
+ */
+
+#include "brw_context.h"
+#include "brw_defines.h"
+
+static void
+gen6_update_sol_surfaces(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->intel.ctx;
+   /* _NEW_TRANSFORM_FEEDBACK */
+   struct gl_transform_feedback_object *xfb_obj =
+      ctx->TransformFeedback.CurrentObject;
+   /* BRW_NEW_VERTEX_PROGRAM */
+   const struct gl_shader_program *shaderprog =
+      ctx->Shader.CurrentVertexProgram;
+   const struct gl_transform_feedback_info *linked_xfb_info =
+      &shaderprog->LinkedTransformFeedback;
+   int i;
+
+   for (i = 0; i < BRW_MAX_SOL_BINDINGS; ++i) {
+      const int surf_index = SURF_INDEX_SOL_BINDING(i);
+      if (xfb_obj->Active && i < linked_xfb_info->NumOutputs) {
+         unsigned buffer = linked_xfb_info->Outputs[i].OutputBuffer;
+         unsigned buffer_offset =
+            xfb_obj->Offset[buffer] / 4 +
+            linked_xfb_info->Outputs[i].DstOffset;
+         brw_update_sol_surface(
+            brw, xfb_obj->Buffers[buffer], &brw->bind.surf_offset[surf_index],
+            linked_xfb_info->Outputs[i].NumComponents,
+            linked_xfb_info->BufferStride[buffer], buffer_offset);
+      } else {
+         brw->bind.surf_offset[surf_index] = 0;
+      }
+   }
+}
+
+const struct brw_tracked_state gen6_sol_surface = {
+   .dirty = {
+      .mesa = _NEW_TRANSFORM_FEEDBACK,
+      .brw = (BRW_NEW_BATCH |
+              BRW_NEW_VERTEX_PROGRAM),
+      .cache = 0
+   },
+   .emit = gen6_update_sol_surfaces,
+};