12 files changed, 468 insertions, 33 deletions
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index 1cee0e06c02..44a0ba0f561 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -48,6 +48,7 @@
 #define NV50_NEW_CONSTBUF     (1 << 18)
 #define NV50_NEW_TEXTURES     (1 << 19)
 #define NV50_NEW_SAMPLERS     (1 << 20)
+#define NV50_NEW_STRMOUT      (1 << 21)
 #define NV50_NEW_CONTEXT      (1 << 31)
 
 #define NV50_BIND_FB          0
@@ -56,9 +57,10 @@
 #define NV50_BIND_INDEX       3
 #define NV50_BIND_TEXTURES    4
 #define NV50_BIND_CB(s, i)   (5 + 16 * (s) + (i))
-#define NV50_BIND_SCREEN     53
-#define NV50_BIND_TLS        54
-#define NV50_BIND_COUNT      55
+#define NV50_BIND_SO         53
+#define NV50_BIND_SCREEN     54
+#define NV50_BIND_TLS        55
+#define NV50_BIND_COUNT      56
 #define NV50_BIND_2D          0
 #define NV50_BIND_M2MF        0
 #define NV50_BIND_FENCE       1
@@ -92,11 +94,13 @@ struct nv50_context {
       boolean point_sprite;
       boolean rt_serialize;
       boolean flushed;
+      boolean rasterizer_discard;
       uint8_t tls_required;
       uint8_t num_vtxbufs;
       uint8_t num_vtxelts;
       uint8_t num_textures[3];
       uint8_t num_samplers[3];
+      uint8_t prim_size;
       uint16_t scissor;
    } state;
 
@@ -126,6 +130,10 @@ struct nv50_context {
    struct nv50_tsc_entry *samplers[3][PIPE_MAX_SAMPLERS];
    unsigned num_samplers[3];
 
+   uint8_t num_so_targets;
+   uint8_t so_targets_dirty;
+   struct pipe_stream_output_target *so_target[4];
+
    struct pipe_framebuffer_state framebuffer;
    struct pipe_blend_color blend_colour;
    struct pipe_stencil_ref stencil_ref;
@@ -168,6 +176,14 @@ extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *);
 
 /* nv50_query.c */
 void nv50_init_query_functions(struct nv50_context *);
+void nv50_query_pushbuf_submit(struct nouveau_pushbuf *,
+                               struct pipe_query *, unsigned result_offset);
+void nv84_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *);
+void nva0_so_target_save_offset(struct pipe_context *,
+                                struct pipe_stream_output_target *,
+                                unsigned index, boolean seralize);
+
+#define NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
 
 /* nv50_shader_state.c */
 void nv50_vertprog_validate(struct nv50_context *);
@@ -177,6 +193,7 @@ void nv50_fp_linkage_validate(struct nv50_context *);
 void nv50_gp_linkage_validate(struct nv50_context *);
 void nv50_constbufs_validate(struct nv50_context *);
 void nv50_validate_derived_rs(struct nv50_context *);
+void nv50_stream_output_validate(struct nv50_context *);
 
 /* nv50_state.c */
 extern void nv50_init_state_functions(struct nv50_context *);
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 1b2e2934b79..48e8db333f9 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -235,6 +235,59 @@ nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
    }
 }
 
+static struct nv50_stream_output_state *
+nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
+                                  const struct pipe_stream_output_info *pso)
+{
+   struct nv50_stream_output_state *so;
+   unsigned b, i, c;
+   unsigned base[4];
+
+   so = MALLOC_STRUCT(nv50_stream_output_state);
+   if (!so)
+      return NULL;
+   memset(so->map, 0xff, sizeof(so->map));
+
+   for (b = 0; b < 4; ++b)
+      so->num_attribs[b] = 0;
+   for (i = 0; i < pso->num_outputs; ++i) {
+      unsigned end =  pso->output[i].dst_offset + pso->output[i].num_components;
+      b = pso->output[i].output_buffer;
+      assert(b < 4);
+      so->num_attribs[b] = MAX2(so->num_attribs[b], end);
+   }
+
+   so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;
+
+   so->stride[0] = pso->stride[0] * 4;
+   base[0] = 0;
+   for (b = 1; b < 4; ++b) {
+      assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
+      so->stride[b] = so->num_attribs[b] * 4;
+      if (so->num_attribs[b])
+         so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
+      base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
+   }
+   if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
+      assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
+      so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
+   }
+
+   so->map_size = base[3] + so->num_attribs[3];
+
+   for (i = 0; i < pso->num_outputs; ++i) {
+      const unsigned s = pso->output[i].start_component;
+      const unsigned p = pso->output[i].dst_offset;
+      const unsigned r = pso->output[i].register_index;
+      b = pso->output[i].output_buffer;
+
+      for (c = 0; c < pso->output[i].num_components; ++c)
+         so->map[base[b] + p + c] = info->out[r].slot[s + c];
+   }
+
+   return so;
+}
+
 boolean
 nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
 {
@@ -293,6 +346,10 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
          prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
    }
 
+   if (prog->pipe.stream_output.num_outputs)
+      prog->so = nv50_program_create_strmout_state(info,
+                                                   &prog->pipe.stream_output);
+
 out:
    FREE(info);
    return !ret;
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 92361ad9946..f56268b5439 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -42,6 +42,15 @@ struct nv50_varying {
    ubyte si; /* semantic index */
 };
 
+struct nv50_stream_output_state
+{
+   uint32_t ctrl;
+   uint16_t stride[4];
+   uint8_t num_attribs[4];
+   uint8_t map_size;
+   uint8_t map[128];
+};
+
 struct nv50_program {
    struct pipe_shader_state pipe;
 
@@ -88,6 +97,8 @@ struct nv50_program {
    void *fixups; /* relocation records */
 
    struct nouveau_heap *mem;
+
+   struct nv50_stream_output_state *so;
 };
 
 boolean nv50_program_translate(struct nv50_program *, uint16_t chipset);
diff --git a/src/gallium/drivers/nv50/nv50_push.c b/src/gallium/drivers/nv50/nv50_push.c
index 04e32b7e8b9..3abe189e7b5 100644
--- a/src/gallium/drivers/nv50/nv50_push.c
+++ b/src/gallium/drivers/nv50/nv50_push.c
@@ -210,7 +210,8 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
 {
    struct push_context ctx;
    unsigned i, index_size;
-   unsigned inst = info->instance_count;
+   unsigned inst_count = info->instance_count;
+   unsigned vert_count = info->count;
    boolean apply_bias = info->indexed && info->index_bias;
 
    ctx.push = nv50->base.pushbuf;
@@ -242,6 +243,17 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
       ctx.primitive_restart = info->primitive_restart;
       ctx.restart_index = info->restart_index;
    } else {
+      if (unlikely(info->count_from_stream_output)) {
+         struct pipe_context *pipe = &nv50->base.pipe;
+         struct nv50_so_target *targ;
+         targ = nv50_so_target(info->count_from_stream_output);
+         if (!targ->pq) {
+            NOUVEAU_ERR("draw_stream_output not supported on pre-NVA0 cards\n");
+            return;
+         }
+         pipe->get_query_result(pipe, targ->pq, TRUE, (void *)&vert_count);
+         vert_count /= targ->stride;
+      }
       ctx.idxbuf = NULL;
       index_size = 0;
       ctx.primitive_restart = FALSE;
@@ -262,21 +274,21 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
    }
    nv50->state.prim_restart = info->primitive_restart;
 
-   while (inst--) {
+   while (inst_count--) {
       BEGIN_NV04(ctx.push, NV50_3D(VERTEX_BEGIN_GL), 1);
       PUSH_DATA (ctx.push, ctx.prim);
       switch (index_size) {
       case 0:
-         emit_vertices_seq(&ctx, info->start, info->count);
+         emit_vertices_seq(&ctx, info->start, vert_count);
          break;
       case 1:
-         emit_vertices_i08(&ctx, info->start, info->count);
+         emit_vertices_i08(&ctx, info->start, vert_count);
          break;
       case 2:
-         emit_vertices_i16(&ctx, info->start, info->count);
+         emit_vertices_i16(&ctx, info->start, vert_count);
          break;
       case 4:
-         emit_vertices_i32(&ctx, info->start, info->count);
+         emit_vertices_i32(&ctx, info->start, vert_count);
          break;
       default:
          assert(0);
diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c
index 5275e74964a..8e62c5f11bc 100644
--- a/src/gallium/drivers/nv50/nv50_query.c
+++ b/src/gallium/drivers/nv50/nv50_query.c
@@ -36,7 +36,8 @@
 
 struct nv50_query {
    uint32_t *data;
-   uint32_t type;
+   uint16_t type;
+   uint16_t index;
    uint32_t sequence;
    struct nouveau_bo *bo;
    uint32_t base;
@@ -170,21 +171,15 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
       BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
       PUSH_DATA (push, 1);
       break;
-   case PIPE_QUERY_PRIMITIVES_GENERATED: /* store before & after instead ? */
-      PUSH_SPACE(push, 2);
-      BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
-      PUSH_DATA (push, NV50_3D_COUNTER_RESET_GENERATED_PRIMITIVES);
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+      nv50_query_get(push, q, 0x10, 0x06805002);
       break;
    case PIPE_QUERY_PRIMITIVES_EMITTED:
-      PUSH_SPACE(push, 2);
-      BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
-      PUSH_DATA (push, NV50_3D_COUNTER_RESET_TRANSFORM_FEEDBACK);
+      nv50_query_get(push, q, 0x10, 0x05805002);
       break;
    case PIPE_QUERY_SO_STATISTICS:
-      PUSH_SPACE(push, 3);
-      BEGIN_NI04(push, NV50_3D(COUNTER_RESET), 2);
-      PUSH_DATA (push, NV50_3D_COUNTER_RESET_TRANSFORM_FEEDBACK);
-      PUSH_DATA (push, NV50_3D_COUNTER_RESET_GENERATED_PRIMITIVES);
+      nv50_query_get(push, q, 0x20, 0x05805002);
+      nv50_query_get(push, q, 0x30, 0x06805002);
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
    case PIPE_QUERY_TIME_ELAPSED:
@@ -227,6 +222,9 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
    case PIPE_QUERY_GPU_FINISHED:
       nv50_query_get(push, q, 0, 0x1000f010);
       break;
+   case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
+      nv50_query_get(push, q, 0, 0x0d005002 | (q->index << 5));
+      break;
    default:
       assert(0);
       break;
@@ -247,6 +245,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    struct nv50_context *nv50 = nv50_context(pipe);
    struct nv50_query *q = nv50_query(pq);
    uint64_t *res64 = (uint64_t *)result;
+   uint32_t *res32 = (uint32_t *)result;
    boolean *res8 = (boolean *)result;
    uint64_t *data64 = (uint64_t *)q->data;
 
@@ -275,11 +274,11 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
    case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
-      res64[0] = data64[0];
+      res64[0] = data64[0] - data64[2];
       break;
    case PIPE_QUERY_SO_STATISTICS:
-      res64[0] = data64[0];
-      res64[1] = data64[1];
+      res64[0] = data64[0] - data64[4];
+      res64[1] = data64[2] - data64[6];
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT: /* u32 sequence, u32 0, u64 time */
       res64[0] = 1000000000;
@@ -288,6 +287,9 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    case PIPE_QUERY_TIME_ELAPSED:
       res64[0] = data64[1] - data64[3];
       break;
+   case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
+      res32[0] = q->data[1];
+      break;
    default:
       return FALSE;
    }
@@ -295,6 +297,21 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    return TRUE;
 }
 
+void
+nv84_query_fifo_wait(struct nouveau_pushbuf *push, struct pipe_query *pq)
+{
+   struct nv50_query *q = nv50_query(pq);
+   unsigned offset = q->offset;
+
+   PUSH_SPACE(push, 5);
+   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+   BEGIN_NV04(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
+   PUSH_DATAh(push, q->bo->offset + offset);
+   PUSH_DATA (push, q->bo->offset + offset);
+   PUSH_DATA (push, q->sequence);
+   PUSH_DATA (push, NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
+}
+
 static void
 nv50_render_condition(struct pipe_context *pipe,
                       struct pipe_query *pq, uint mode)
@@ -325,6 +342,38 @@ nv50_render_condition(struct pipe_context *pipe,
 }
 
 void
+nv50_query_pushbuf_submit(struct nouveau_pushbuf *push,
+                          struct pipe_query *pq, unsigned result_offset)
+{
+   struct nv50_query *q = nv50_query(pq);
+
+   /* XXX: does this exist ? */
+#define NV50_IB_ENTRY_1_NO_PREFETCH (0 << (31 - 8))
+
+   nouveau_pushbuf_space(push, 0, 0, 1);
+   nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 |
+                        NV50_IB_ENTRY_1_NO_PREFETCH);
+}
+
+void
+nva0_so_target_save_offset(struct pipe_context *pipe,
+                           struct pipe_stream_output_target *ptarg,
+                           unsigned index, boolean serialize)
+{
+   struct nv50_so_target *targ = nv50_so_target(ptarg);
+
+   if (serialize) {
+      struct nouveau_pushbuf *push = nv50_context(pipe)->base.pushbuf;
+      PUSH_SPACE(push, 2);
+      BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
+      PUSH_DATA (push, 0);
+   }
+
+   nv50_query(targ->pq)->index = index;
+   nv50_query_end(pipe, targ->pq);
+}
+
+void
 nv50_init_query_functions(struct nv50_context *nv50)
 {
    struct pipe_context *pipe = &nv50->base.pipe;
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index d72b6e91d2a..b341ade695e 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -73,6 +73,8 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
 static int
 nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 {
+   const uint16_t class_3d = nouveau_screen(pscreen)->class_3d;
+
    switch (param) {
    case PIPE_CAP_MAX_COMBINED_SAMPLERS:
       return 64;
@@ -95,7 +97,6 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_ANISOTROPIC_FILTER:
    case PIPE_CAP_SCALED_RESOLVE:
       return 1;
-   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return nv50_screen(pscreen)->tesla->oclass >= NVA0_3D_CLASS;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
@@ -121,11 +122,12 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_OCCLUSION_QUERY:
       return 1;
    case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
-      return 0;
+      return 4;
    case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
-      return 128;
    case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
-      return 32;
+      return 64;
+   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+      return (class_3d >= NVA0_3D_CLASS) ? 1 : 0;
    case PIPE_CAP_BLEND_EQUATION_SEPARATE:
    case PIPE_CAP_INDEP_BLEND_ENABLE:
       return 1;
diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c
index aef3f129c81..d070f07bbbc 100644
--- a/src/gallium/drivers/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nv50/nv50_shader_state.c
@@ -207,6 +207,8 @@ nv50_gmtyprog_validate(struct nv50_context *nv50)
       PUSH_DATA (push, gp->gp.vert_count);
       BEGIN_NV04(push, NV50_3D(GP_START_ID), 1);
       PUSH_DATA (push, gp->code_base);
+
+      nv50->state.prim_size = gp->gp.prim_type; /* enum matches vertex count */
    }
    nv50_program_update_context_state(nv50, gp, 2);
 
@@ -278,6 +280,12 @@ nv50_validate_derived_rs(struct nv50_context *nv50)
 
    nv50_sprite_coords_validate(nv50);
 
+   if (nv50->state.rasterizer_discard != nv50->rast->pipe.rasterizer_discard) {
+      nv50->state.rasterizer_discard = nv50->rast->pipe.rasterizer_discard;
+      BEGIN_NV04(push, NV50_3D(RASTERIZE_ENABLE), 1);
+      PUSH_DATA (push, !nv50->rast->pipe.rasterizer_discard);
+   }
+
    if (nv50->dirty & NV50_NEW_FRAGPROG)
       return;
    psize = nv50->state.semantic_psize & ~NV50_3D_SEMANTIC_PTSZ_PTSZ_EN__MASK;
@@ -343,6 +351,7 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
    uint32_t colors = fp->fp.colors;
    uint32_t lin[4];
    uint8_t map[64];
+   uint8_t so_map[64];
 
    if (!(nv50->dirty & (NV50_NEW_VERTPROG |
                         NV50_NEW_FRAGPROG |
@@ -411,6 +420,30 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
    if (nv50->rast->pipe.clamp_vertex_color)
       colors |= NV50_3D_SEMANTIC_COLOR_CLMP_EN;
 
+   if (unlikely(vp->so)) {
+      /* Slot i in STRMOUT_MAP specifies the offset where slot i in RESULT_MAP
+       * gets written.
+       *
+       * TODO:
+       * Inverting vp->so->map (output -> offset) would probably speed this up.
+       */
+      memset(so_map, 0, sizeof(so_map));
+      for (i = 0; i < vp->so->map_size; ++i) {
+         if (vp->so->map[i] == 0xff)
+            continue;
+         for (c = 0; c < m; ++c)
+            if (map[c] == vp->so->map[i] && !so_map[c])
+               break;
+         if (c == m) {
+            c = m;
+            map[m++] = vp->so->map[i];
+         }
+         so_map[c] = 0x80 | i;
+      }
+      for (c = m; c & 3; ++c)
+         so_map[c] = 0;
+   }
+
    n = (m + 3) / 4;
    assert(m <= 64);
 
@@ -451,6 +484,11 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
 
    BEGIN_NV04(push, NV50_3D(GP_ENABLE), 1);
    PUSH_DATA (push, nv50->gmtyprog ? 1 : 0);
+
+   if (vp->so) {
+      BEGIN_NV04(push, NV50_3D(STRMOUT_MAP(0)), n);
+      PUSH_DATAp(push, so_map, n);
+   }
 }
 
 static int
@@ -509,3 +547,75 @@ nv50_gp_linkage_validate(struct nv50_context *nv50)
    BEGIN_NV04(push, NV50_3D(VP_RESULT_MAP(0)), n);
    PUSH_DATAp(push, map, n);
 }
+
+void
+nv50_stream_output_validate(struct nv50_context *nv50)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_stream_output_state *so;
+   uint32_t ctrl;
+   unsigned i;
+   unsigned prims = ~0;
+
+   so = nv50->gmtyprog ? nv50->gmtyprog->so : nv50->vertprog->so;
+
+   if (!so || !nv50->num_so_targets) {
+      BEGIN_NV04(push, NV50_3D(STRMOUT_ENABLE), 1);
+      PUSH_DATA (push, 0);
+      if (nv50->screen->base.class_3d < NVA0_3D_CLASS) {
+         BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1);
+         PUSH_DATA (push, 0);
+      }
+      BEGIN_NV04(push, NV50_3D(STRMOUT_PARAMS_LATCH), 1);
+      PUSH_DATA (push, 1);
+      return;
+   }
+
+   ctrl = so->ctrl;
+   if (nv50->screen->base.class_3d >= NVA0_3D_CLASS)
+      ctrl |= NVA0_3D_STRMOUT_BUFFERS_CTRL_LIMIT_MODE_OFFSET;
+
+   BEGIN_NV04(push, NV50_3D(STRMOUT_BUFFERS_CTRL), 1);
+   PUSH_DATA (push, ctrl);
+
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_SO);
+
+   for (i = 0; i < nv50->num_so_targets; ++i) {
+      struct nv50_so_target *targ = nv50_so_target(nv50->so_target[i]);
+      struct nv04_resource *buf = nv04_resource(targ->pipe.buffer);
+
+      const unsigned n = nv50->screen->base.class_3d >= NVA0_3D_CLASS ? 4 : 3;
+
+      if (n == 4 && !targ->clean)
+         nv84_query_fifo_wait(push, targ->pq);
+      BEGIN_NV04(push, NV50_3D(STRMOUT_ADDRESS_HIGH(i)), n);
+      PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset);
+      PUSH_DATA (push, buf->address + targ->pipe.buffer_offset);
+      PUSH_DATA (push, so->num_attribs[i]);
+      if (n == 4) {
+         PUSH_DATA(push, targ->pipe.buffer_size);
+
+         BEGIN_NV04(push, NVA0_3D(STRMOUT_OFFSET(i)), 1);
+         if (!targ->clean) {
+            assert(targ->pq);
+            nv50_query_pushbuf_submit(push, targ->pq, 0x4);
+         } else {
+            PUSH_DATA(push, 0);
+            targ->clean = FALSE;
+         }
+      } else {
+         const unsigned limit = targ->pipe.buffer_size /
+            (so->stride[i] * nv50->state.prim_size);
+         prims = MIN2(prims, limit);
+      }
+      BCTX_REFN(nv50->bufctx_3d, SO, buf, WR);
+   }
+   if (prims != ~0) {
+      BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1);
+      PUSH_DATA (push, prims);
+   }
+   BEGIN_NV04(push, NV50_3D(STRMOUT_PARAMS_LATCH), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_3D(STRMOUT_ENABLE), 1);
+   PUSH_DATA (push, 1);
+}
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index 5b783da7ad7..5e32b2717fd 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -680,6 +680,9 @@ nv50_sp_state_create(struct pipe_context *pipe,
    prog->type = type;
    prog->pipe.tokens = tgsi_dup_tokens(cso->tokens);
 
+   if (cso->stream_output.num_outputs)
+      prog->pipe.stream_output = cso->stream_output;
+
    return (void *)prog;
 }
 
@@ -909,6 +912,90 @@ nv50_vertex_state_bind(struct pipe_context *pipe, void *hwcso)
    nv50->dirty |= NV50_NEW_VERTEX;
 }
 
+static struct pipe_stream_output_target *
+nv50_so_target_create(struct pipe_context *pipe,
+                      struct pipe_resource *res,
+                      unsigned offset, unsigned size)
+{
+   struct nv50_so_target *targ = MALLOC_STRUCT(nv50_so_target);
+   if (!targ)
+      return NULL;
+
+   if (nouveau_context(pipe)->screen->class_3d >= NVA0_3D_CLASS) {
+      targ->pq = pipe->create_query(pipe,
+                                    NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET);
+      if (!targ->pq) {
+         FREE(targ);
+         return NULL;
+      }
+   } else {
+      targ->pq = NULL;
+   }
+   targ->clean = TRUE;
+
+   targ->pipe.buffer_size = size;
+   targ->pipe.buffer_offset = offset;
+   targ->pipe.context = pipe;
+   targ->pipe.buffer = NULL;
+   pipe_resource_reference(&targ->pipe.buffer, res);
+   pipe_reference_init(&targ->pipe.reference, 1);
+
+   return &targ->pipe;
+}
+
+static void
+nv50_so_target_destroy(struct pipe_context *pipe,
+                       struct pipe_stream_output_target *ptarg)
+{
+   struct nv50_so_target *targ = nv50_so_target(ptarg);
+   if (targ->pq)
+      pipe->destroy_query(pipe, targ->pq);
+   FREE(targ);
+}
+
+static void
+nv50_set_stream_output_targets(struct pipe_context *pipe,
+                               unsigned num_targets,
+                               struct pipe_stream_output_target **targets,
+                               unsigned append_mask)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   unsigned i;
+   boolean serialize = TRUE;
+   const boolean can_resume = nv50->screen->base.class_3d >= NVA0_3D_CLASS;
+
+   assert(num_targets <= 4);
+
+   for (i = 0; i < num_targets; ++i) {
+      const boolean changed = nv50->so_target[i] != targets[i];
+      if (!changed && (append_mask & (1 << i)))
+         continue;
+      nv50->so_targets_dirty |= 1 << i;
+
+      if (can_resume && changed && nv50->so_target[i]) {
+         nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize);
+         serialize = FALSE;
+      }
+
+      if (targets[i] && !(append_mask & (1 << i)))
+         nv50_so_target(targets[i])->clean = TRUE;
+
+      pipe_so_target_reference(&nv50->so_target[i], targets[i]);
+   }
+   for (; i < nv50->num_so_targets; ++i) {
+      if (can_resume && nv50->so_target[i]) {
+         nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize);
+         serialize = FALSE;
+      }
+      pipe_so_target_reference(&nv50->so_target[i], NULL);
+      nv50->so_targets_dirty |= 1 << i;
+   }
+   nv50->num_so_targets = num_targets;
+
+   if (nv50->so_targets_dirty)
+      nv50->dirty |= NV50_NEW_STRMOUT;
+}
+
 void
 nv50_init_state_functions(struct nv50_context *nv50)
 {
@@ -965,6 +1052,10 @@ nv50_init_state_functions(struct nv50_context *nv50)
    pipe->set_vertex_buffers = nv50_set_vertex_buffers;
    pipe->set_index_buffer = nv50_set_index_buffer;
 
+   pipe->create_stream_output_target = nv50_so_target_create;
+   pipe->stream_output_target_destroy = nv50_so_target_destroy;
+   pipe->set_stream_output_targets = nv50_set_stream_output_targets;
+
    pipe->redefine_user_buffer = u_default_redefine_user_buffer;
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index c19acf6c426..a95e96d3c51 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -360,6 +360,8 @@ static struct state_validate {
     { nv50_constbufs_validate,     NV50_NEW_CONSTBUF },
     { nv50_validate_textures,      NV50_NEW_TEXTURES },
     { nv50_validate_samplers,      NV50_NEW_SAMPLERS },
+    { nv50_stream_output_validate, NV50_NEW_STRMOUT |
+                                   NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
     { nv50_vertex_arrays_validate, NV50_NEW_VERTEX | NV50_NEW_ARRAYS }
 };
 #define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0]))
diff --git a/src/gallium/drivers/nv50/nv50_stateobj.h b/src/gallium/drivers/nv50/nv50_stateobj.h
index 188406da600..8a9260c937e 100644
--- a/src/gallium/drivers/nv50/nv50_stateobj.h
+++ b/src/gallium/drivers/nv50/nv50_stateobj.h
@@ -51,4 +51,17 @@ struct nv50_vertex_stateobj {
    struct nv50_vertex_element element[0];
 };
 
+struct nv50_so_target {
+   struct pipe_stream_output_target pipe;
+   struct pipe_query *pq;
+   unsigned stride;
+   boolean clean;
+};
+
+static INLINE struct nv50_so_target *
+nv50_so_target(struct pipe_stream_output_target *ptarg)
+{
+   return (struct nv50_so_target *)ptarg;
+}
+
 #endif
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index bc01e69decf..323677eaf80 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -405,6 +405,25 @@ nv50_prim_gl(unsigned prim)
    }
 }
 
+/* For pre-nva0 transform feedback. */
+static const uint8_t nv50_pipe_prim_to_prim_size[PIPE_PRIM_MAX + 1] =
+{
+   [PIPE_PRIM_POINTS] = 1,
+   [PIPE_PRIM_LINES] = 2,
+   [PIPE_PRIM_LINE_LOOP] = 2,
+   [PIPE_PRIM_LINE_STRIP] = 2,
+   [PIPE_PRIM_TRIANGLES] = 3,
+   [PIPE_PRIM_TRIANGLE_STRIP] = 3,
+   [PIPE_PRIM_TRIANGLE_FAN] = 3,
+   [PIPE_PRIM_QUADS] = 3,
+   [PIPE_PRIM_QUAD_STRIP] = 3,
+   [PIPE_PRIM_POLYGON] = 3,
+   [PIPE_PRIM_LINES_ADJACENCY] = 2,
+   [PIPE_PRIM_LINE_STRIP_ADJACENCY] = 2,
+   [PIPE_PRIM_TRIANGLES_ADJACENCY] = 3,
+   [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = 3
+};
+
 static void
 nv50_draw_arrays(struct nv50_context *nv50,
                  unsigned mode, unsigned start, unsigned count,
@@ -624,6 +643,51 @@ nv50_draw_elements(struct nv50_context *nv50, boolean shorten,
 }
 
 static void
+nva0_draw_stream_output(struct nv50_context *nv50,
+                        const struct pipe_draw_info *info)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_so_target *so = nv50_so_target(info->count_from_stream_output);
+   struct nv04_resource *res = nv04_resource(so->pipe.buffer);
+   unsigned num_instances = info->instance_count;
+   unsigned mode = nv50_prim_gl(info->mode);
+
+   if (unlikely(nv50->screen->base.class_3d < NVA0_3D_CLASS)) {
+      /* A proper implementation without waiting doesn't seem possible,
+       * so don't bother.
+       */
+      NOUVEAU_ERR("draw_stream_output not supported on pre-NVA0 cards\n");
+      return;
+   }
+
+   if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
+      res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+      PUSH_SPACE(push, 4);
+      BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1);
+      PUSH_DATA (push, 0);
+   }
+
+   assert(num_instances);
+   do {
+      PUSH_SPACE(push, 8);
+      BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
+      PUSH_DATA (push, mode);
+      BEGIN_NV04(push, NVA0_3D(DRAW_TFB_BASE), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NVA0_3D(DRAW_TFB_STRIDE), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NVA0_3D(DRAW_TFB_BYTES), 1);
+      nv50_query_pushbuf_submit(push, so->pq, 0x4);
+      BEGIN_NV04(push, NV50_3D(VERTEX_END_GL), 1);
+      PUSH_DATA (push, 0);
+
+      mode |= NV50_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
+   } while (--num_instances);
+}
+
+static void
 nv50_draw_vbo_kick_notify(struct nouveau_pushbuf *chan)
 {
    struct nv50_screen *screen = chan->user_priv;
@@ -655,6 +719,9 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    if (nv50->vbo_user && !(nv50->dirty & (NV50_NEW_VERTEX | NV50_NEW_ARRAYS)))
       nv50_update_user_vbufs(nv50);
 
+   if (unlikely(nv50->num_so_targets && !nv50->gmtyprog))
+      nv50->state.prim_size = nv50_pipe_prim_to_prim_size[info->mode];
+
    nv50_state_validate(nv50, ~0, 8); /* 8 as minimum, we use flush_notify */
 
    push->kick_notify = nv50_draw_vbo_kick_notify;
@@ -679,11 +746,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       nv50->base.vbo_dirty = FALSE;
    }
 
-   if (!info->indexed) {
-      nv50_draw_arrays(nv50,
-                       info->mode, info->start, info->count,
-                       info->instance_count);
-   } else {
+   if (info->indexed) {
       boolean shorten = info->max_index <= 65535;
 
       assert(nv50->idxbuf.buffer);
@@ -713,6 +776,13 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       nv50_draw_elements(nv50, shorten,
                          info->mode, info->start, info->count,
                          info->instance_count, info->index_bias);
+   } else
+   if (unlikely(info->count_from_stream_output)) {
+      nva0_draw_stream_output(nv50, info);
+   } else {
+      nv50_draw_arrays(nv50,
+                       info->mode, info->start, info->count,
+                       info->instance_count);
    }
    push->kick_notify = nv50_default_kick_notify;
 
diff --git a/src/gallium/drivers/nv50/nv50_winsys.h b/src/gallium/drivers/nv50/nv50_winsys.h
index b36898dabe6..145ee70cb9f 100644
--- a/src/gallium/drivers/nv50/nv50_winsys.h
+++ b/src/gallium/drivers/nv50/nv50_winsys.h
@@ -49,6 +49,7 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 
 #define SUBC_3D(m) 3, (m)
 #define NV50_3D(n) SUBC_3D(NV50_3D_##n)
+#define NVA0_3D(n) SUBC_3D(NVA0_3D_##n)
 
 #define SUBC_2D(m) 4, (m)
 #define NV50_2D(n) SUBC_2D(NV50_2D_##n)