summaryrefslogtreecommitdiffstats
path: root/src/gallium
diff options
context:
space:
mode:
authorMarek Olšák <[email protected]>2011-11-08 21:58:27 +0100
committerMarek Olšák <[email protected]>2011-12-17 14:56:21 +0100
commit543b2331d7b45a29ccd3530daa2389e87e65d89b (patch)
treee560fb84b532207280fbf0f4d6e7b17278c1b467 /src/gallium
parent047363a07b0e13f8724791ed834b0b68e762adc7 (diff)
r600g: implement transform feedback
r600: DONE. r700: MOSTLY (done but locks up). Evergreen: MOSTLY (done but doesn't work for an unknown reason). The kernel support will come soon.
Diffstat (limited to 'src/gallium')
-rw-r--r--src/gallium/drivers/r600/eg_asm.c29
-rw-r--r--src/gallium/drivers/r600/evergreen_hw_context.c35
-rw-r--r--src/gallium/drivers/r600/evergreen_state.c7
-rw-r--r--src/gallium/drivers/r600/evergreend.h45
-rw-r--r--src/gallium/drivers/r600/r600.h21
-rw-r--r--src/gallium/drivers/r600/r600_asm.c115
-rw-r--r--src/gallium/drivers/r600/r600_asm.h2
-rw-r--r--src/gallium/drivers/r600/r600_blit.c2
-rw-r--r--src/gallium/drivers/r600/r600_hw_context.c335
-rw-r--r--src/gallium/drivers/r600/r600_hw_context_priv.h7
-rw-r--r--src/gallium/drivers/r600/r600_pipe.c15
-rw-r--r--src/gallium/drivers/r600/r600_pipe.h15
-rw-r--r--src/gallium/drivers/r600/r600_query.c16
-rw-r--r--src/gallium/drivers/r600/r600_shader.c101
-rw-r--r--src/gallium/drivers/r600/r600_sq.h4
-rw-r--r--src/gallium/drivers/r600/r600_state.c9
-rw-r--r--src/gallium/drivers/r600/r600_state_common.c77
-rw-r--r--src/gallium/drivers/r600/r600d.h51
18 files changed, 873 insertions, 13 deletions
diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c
index f6b8631124f..877e162d841 100644
--- a/src/gallium/drivers/r600/eg_asm.c
+++ b/src/gallium/drivers/r600/eg_asm.c
@@ -73,6 +73,35 @@ int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
bc->bytecode[id] |= S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
id++;
break;
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
+ bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
+ S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
+ S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
+ S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
+ bc->bytecode[id] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
+ S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
+ cf->output.inst |
+ S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask) |
+ S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size);
+ if (bc->chip_class == EVERGREEN) /* no EOP on cayman */
+ bc->bytecode[id] |= S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
+ id++;
+ break;
case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
index 96e8d181189..bd1d969eca3 100644
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -1241,3 +1241,38 @@ void evergreen_context_flush_dest_caches(struct r600_context *ctx)
ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
}
+
+void evergreen_flush_vgt_streamout(struct r600_context *ctx)
+{
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = (R_0084FC_CP_STRMOUT_CNTL - EVERGREEN_CONFIG_REG_OFFSET) >> 2;
+ ctx->pm4[ctx->pm4_cdwords++] = 0;
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0);
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
+ ctx->pm4[ctx->pm4_cdwords++] = R_0084FC_CP_STRMOUT_CNTL >> 2; /* register */
+ ctx->pm4[ctx->pm4_cdwords++] = 0;
+ ctx->pm4[ctx->pm4_cdwords++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* reference value */
+ ctx->pm4[ctx->pm4_cdwords++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* mask */
+ ctx->pm4[ctx->pm4_cdwords++] = 4; /* poll interval */
+}
+
+void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit)
+{
+ if (buffer_enable_bit) {
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = (R_028B94_VGT_STRMOUT_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
+ ctx->pm4[ctx->pm4_cdwords++] = S_028B94_STREAMOUT_0_EN(1);
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = (R_028B98_VGT_STRMOUT_BUFFER_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
+ ctx->pm4[ctx->pm4_cdwords++] = S_028B98_STREAM_0_BUFFER_EN(buffer_enable_bit);
+ } else {
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = (R_028B94_VGT_STRMOUT_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
+ ctx->pm4[ctx->pm4_cdwords++] = S_028B94_STREAMOUT_0_EN(0);
+ }
+}
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index d0c02d552a4..6f5d6f7a643 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -927,8 +927,8 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
state->fill_back != PIPE_POLYGON_MODE_FILL);
r600_pipe_state_add_reg(rstate, R_028814_PA_SU_SC_MODE_CNTL,
S_028814_PROVOKING_VTX_LAST(prov_vtx) |
- S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
- S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
+ S_028814_CULL_FRONT(state->rasterizer_discard || (state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
+ S_028814_CULL_BACK(state->rasterizer_discard || (state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
S_028814_FACE(!state->front_ccw) |
S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) |
S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) |
@@ -1688,6 +1688,9 @@ void evergreen_init_state_functions(struct r600_pipe_context *rctx)
rctx->context.sampler_view_destroy = r600_sampler_view_destroy;
rctx->context.redefine_user_buffer = u_default_redefine_user_buffer;
rctx->context.texture_barrier = evergreen_texture_barrier;
+ rctx->context.create_stream_output_target = r600_create_so_target;
+ rctx->context.stream_output_target_destroy = r600_so_target_destroy;
+ rctx->context.set_stream_output_targets = r600_set_so_targets;
}
static void cayman_init_config(struct r600_pipe_context *rctx)
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index 6baa2a759e6..68b77b48828 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -50,6 +50,8 @@
#define EVENT_TYPE_PS_PARTIAL_FLUSH 0x10
#define EVENT_TYPE_ZPASS_DONE 0x15
#define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT 0x16
+#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH 0x1f
+
#define EVENT_TYPE(x) ((x) << 0)
#define EVENT_INDEX(x) ((x) << 8)
/* 0 - any non-TS event
@@ -82,6 +84,7 @@
#define PKT3_MEM_SEMAPHORE 0x39
#define PKT3_MPEG_INDEX 0x3A
#define PKT3_WAIT_REG_MEM 0x3C
+#define WAIT_REG_MEM_EQUAL 3
#define PKT3_MEM_WRITE 0x3D
#define PKT3_INDIRECT_BUFFER 0x32
#define PKT3_CP_INTERRUPT 0x40
@@ -118,6 +121,12 @@
#define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate))
/* Registers */
+#define R_0084FC_CP_STRMOUT_CNTL 0x000084FC
+#define S_0084FC_OFFSET_UPDATE_DONE(x) (((x) & 0x1) << 0)
+#define R_008960_VGT_STRMOUT_BUFFER_FILLED_SIZE_0 0x008960 /* read-only */
+#define R_008964_VGT_STRMOUT_BUFFER_FILLED_SIZE_1 0x008964 /* read-only */
+#define R_008968_VGT_STRMOUT_BUFFER_FILLED_SIZE_2 0x008968 /* read-only */
+#define R_00896C_VGT_STRMOUT_BUFFER_FILLED_SIZE_3 0x00896C /* read-only */
#define R_008C00_SQ_CONFIG 0x00008C00
#define S_008C00_VC_ENABLE(x) (((x) & 0x1) << 0)
#define G_008C00_VC_ENABLE(x) (((x) >> 0) & 0x1)
@@ -1723,6 +1732,33 @@
#define R_028AC0_DB_SRESULTS_COMPARE_STATE0 0x00028AC0
#define R_028AC4_DB_SRESULTS_COMPARE_STATE1 0x00028AC4
#define R_028AC8_DB_PRELOAD_CONTROL 0x00028AC8
+#define R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 0x028AD0
+#define R_028AD4_VGT_STRMOUT_VTX_STRIDE_0 0x028AD4
+#define R_028AD8_VGT_STRMOUT_BUFFER_BASE_0 0x028AD8
+#define R_028ADC_VGT_STRMOUT_BUFFER_OFFSET_0 0x028ADC
+#define R_028AE0_VGT_STRMOUT_BUFFER_SIZE_1 0x028AE0
+#define R_028AE4_VGT_STRMOUT_VTX_STRIDE_1 0x028AE4
+#define R_028AE8_VGT_STRMOUT_BUFFER_BASE_1 0x028AE8
+#define R_028AEC_VGT_STRMOUT_BUFFER_OFFSET_1 0x028AEC
+#define R_028AF0_VGT_STRMOUT_BUFFER_SIZE_2 0x028AF0
+#define R_028AF4_VGT_STRMOUT_VTX_STRIDE_2 0x028AF4
+#define R_028AF8_VGT_STRMOUT_BUFFER_BASE_2 0x028AF8
+#define R_028AFC_VGT_STRMOUT_BUFFER_OFFSET_2 0x028AFC
+#define R_028B00_VGT_STRMOUT_BUFFER_SIZE_3 0x028B00
+#define R_028B04_VGT_STRMOUT_VTX_STRIDE_3 0x028B04
+#define R_028B08_VGT_STRMOUT_BUFFER_BASE_3 0x028B08
+#define R_028B0C_VGT_STRMOUT_BUFFER_OFFSET_3 0x028B0C
+#define R_028B10_VGT_STRMOUT_BASE_OFFSET_0 0x028B10
+#define R_028B14_VGT_STRMOUT_BASE_OFFSET_1 0x028B14
+#define R_028B18_VGT_STRMOUT_BASE_OFFSET_2 0x028B18
+#define R_028B1C_VGT_STRMOUT_BASE_OFFSET_3 0x028B1C
+#define R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET 0x028B28
+#define R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE 0x028B2C
+#define R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE 0x028B30
+#define R_028B44_VGT_STRMOUT_BASE_OFFSET_HI_0 0x028B44
+#define R_028B48_VGT_STRMOUT_BASE_OFFSET_HI_1 0x028B48
+#define R_028B4C_VGT_STRMOUT_BASE_OFFSET_HI_2 0x028B4C
+#define R_028B50_VGT_STRMOUT_BASE_OFFSET_HI_3 0x028B50
#define R_028B54_VGT_SHADER_STAGES_EN 0x00028B54
#define R_028B70_DB_ALPHA_TO_MASK 0x00028B70
#define R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL 0x00028B78
@@ -1750,7 +1786,16 @@
#define G_028B8C_OFFSET(x) (((x) >> 0) & 0xFFFFFFFF)
#define C_028B8C_OFFSET 0x00000000
#define R_028B94_VGT_STRMOUT_CONFIG 0x00028B94
+#define S_028B94_STREAMOUT_0_EN(x) (((x) & 0x1) << 0)
+#define S_028B94_STREAMOUT_1_EN(x) (((x) & 0x1) << 1)
+#define S_028B94_STREAMOUT_2_EN(x) (((x) & 0x1) << 2)
+#define S_028B94_STREAMOUT_3_EN(x) (((x) & 0x1) << 3)
+#define S_028B94_RAST_STREAM(x) (((x) & 0x7) << 4)
#define R_028B98_VGT_STRMOUT_BUFFER_CONFIG 0x00028B98
+#define S_028B98_STREAM_0_BUFFER_EN(x) (((x) & 0xf) << 0)
+#define S_028B98_STREAM_1_BUFFER_EN(x) (((x) & 0xf) << 4)
+#define S_028B98_STREAM_2_BUFFER_EN(x) (((x) & 0xf) << 8)
+#define S_028B98_STREAM_3_BUFFER_EN(x) (((x) & 0xf) << 12)
#define R_028C00_PA_SC_LINE_CNTL 0x00028C00
#define R_028C04_PA_SC_AA_CONFIG 0x00028C04
#define R_028C08_PA_SU_VTX_CNTL 0x00028C08
diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h
index e697406aaaf..fbd12fbe1b7 100644
--- a/src/gallium/drivers/r600/r600.h
+++ b/src/gallium/drivers/r600/r600.h
@@ -167,6 +167,7 @@ struct r600_query {
union {
uint64_t u64;
boolean b;
+ struct pipe_query_data_so_statistics so;
} result;
/* The kind of query */
unsigned type;
@@ -187,6 +188,15 @@ struct r600_query {
struct list_head list;
};
+struct r600_so_target {
+ struct pipe_stream_output_target b;
+
+ /* The buffer where BUFFER_FILLED_SIZE is stored. */
+ struct r600_resource *filled_size;
+ unsigned stride;
+ unsigned so_index;
+};
+
#define R600_CONTEXT_DRAW_PENDING (1 << 0)
#define R600_CONTEXT_DST_CACHES_DIRTY (1 << 1)
#define R600_CONTEXT_CHECK_EVENT_FLUSH (1 << 2)
@@ -218,6 +228,7 @@ struct r600_context {
/* The list of active queries. Only one query of each type can be active. */
struct list_head active_query_list;
unsigned num_cs_dw_queries_suspend;
+ unsigned num_cs_dw_streamout_end;
unsigned backend_mask;
unsigned max_db; /* for OQ */
@@ -229,6 +240,12 @@ struct r600_context {
struct r600_range fs_resources;
int num_ps_resources, num_vs_resources, num_fs_resources;
boolean have_depth_texture, have_depth_fb;
+
+ unsigned num_so_targets;
+ struct r600_so_target *so_targets[PIPE_MAX_SO_BUFFERS];
+ boolean streamout_start;
+ unsigned streamout_append_bitmask;
+ unsigned *vs_shader_so_strides;
};
struct r600_draw {
@@ -268,6 +285,10 @@ void r600_context_emit_fence(struct r600_context *ctx, struct r600_resource *fen
void r600_context_flush_all(struct r600_context *ctx, unsigned flush_flags);
void r600_context_flush_dest_caches(struct r600_context *ctx);
+void r600_context_streamout_begin(struct r600_context *ctx);
+void r600_context_streamout_end(struct r600_context *ctx);
+void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t);
+
int evergreen_context_init(struct r600_context *ctx, struct r600_screen *screen);
void evergreen_context_draw(struct r600_context *ctx, const struct r600_draw *draw);
void evergreen_context_flush_dest_caches(struct r600_context *ctx);
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 1ab16f28821..e617b167905 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -1672,6 +1672,21 @@ static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode
cf->output.inst |
S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
break;
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
+ bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
+ S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
+ S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
+ S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
+ bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
+ S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
+ cf->output.inst |
+ S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program) |
+ S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) |
+ S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask);
+ break;
case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
case V_SQ_CF_WORD1_SQ_CF_INST_POP:
@@ -1730,6 +1745,22 @@ int r600_bytecode_build(struct r600_bytecode *bc)
case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
@@ -1760,6 +1791,10 @@ int r600_bytecode_build(struct r600_bytecode *bc)
case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
case V_SQ_CF_WORD1_SQ_CF_INST_POP:
@@ -1849,6 +1884,22 @@ int r600_bytecode_build(struct r600_bytecode *bc)
break;
case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
@@ -1923,6 +1974,10 @@ int r600_bytecode_build(struct r600_bytecode *bc)
break;
case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
@@ -2057,6 +2112,44 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
break;
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
+ case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
+ fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i_BUF%i ", id, bc->bytecode[id],
+ (EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+ EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) / 4,
+ (EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+ EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) % 4);
+ fprintf(stderr, "GPR:%X ", cf->output.gpr);
+ fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size);
+ fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base);
+ fprintf(stderr, "TYPE:%X\n", cf->output.type);
+ id++;
+ fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i_BUF%i ", id, bc->bytecode[id],
+ (EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+ EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) / 4,
+ (EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+ EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) % 4);
+ fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size);
+ fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask);
+ fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
+ fprintf(stderr, "INST:%d ", cf->output.inst);
+ fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
+ fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
+ break;
case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
@@ -2125,6 +2218,28 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
break;
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
+ case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
+ fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i ", id, bc->bytecode[id],
+ R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+ R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0));
+ fprintf(stderr, "GPR:%X ", cf->output.gpr);
+ fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size);
+ fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base);
+ fprintf(stderr, "TYPE:%X\n", cf->output.type);
+ id++;
+ fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i ", id, bc->bytecode[id],
+ R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+ R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0));
+ fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size);
+ fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask);
+ fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
+ fprintf(stderr, "INST:%d ", cf->output.inst);
+ fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
+ fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
+ break;
case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
case V_SQ_CF_WORD1_SQ_CF_INST_POP:
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index 0fd44677c34..d0ff75d6e2f 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -107,6 +107,8 @@ struct r600_bytecode_vtx {
struct r600_bytecode_output {
unsigned array_base;
+ unsigned array_size;
+ unsigned comp_mask;
unsigned type;
unsigned end_of_program;
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 9326dc657ff..313ed128f06 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -65,6 +65,8 @@ static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op
util_blitter_save_vertex_buffers(rctx->blitter,
rctx->vbuf_mgr->nr_vertex_buffers,
rctx->vbuf_mgr->vertex_buffer);
+ util_blitter_save_so_targets(rctx->blitter, rctx->ctx.num_so_targets,
+ (struct pipe_stream_output_target**)rctx->ctx.so_targets);
if (op & R600_SAVE_FRAMEBUFFER)
util_blitter_save_framebuffer(rctx->blitter, &rctx->framebuffer);
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index 52e0be7d5e2..1dba96642aa 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -943,6 +943,9 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
/* Count in queries_suspend. */
num_dw += ctx->num_cs_dw_queries_suspend;
+ /* Count in streamout_end at the end of CS. */
+ num_dw += ctx->num_cs_dw_streamout_end;
+
/* Count in render_condition(NULL) at the end of CS. */
if (ctx->predicate_drawing) {
num_dw += 3;
@@ -1471,6 +1474,12 @@ void r600_context_draw(struct r600_context *ctx, const struct r600_draw *draw)
r600_context_block_resource_emit_dirty(ctx, dirty_block);
}
+ /* Enable stream out if needed. */
+ if (ctx->streamout_start) {
+ r600_context_streamout_begin(ctx);
+ ctx->streamout_start = FALSE;
+ }
+
/* draw packet */
pm4 = &ctx->pm4[ctx->pm4_cdwords];
@@ -1503,6 +1512,7 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags)
{
struct r600_block *enable_block = NULL;
bool queries_suspended = false;
+ bool streamout_suspended = false;
if (ctx->pm4_cdwords == ctx->init_dwords)
return;
@@ -1513,6 +1523,11 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags)
queries_suspended = true;
}
+ if (ctx->num_cs_dw_streamout_end) {
+ r600_context_streamout_end(ctx);
+ streamout_suspended = true;
+ }
+
if (ctx->screen->chip_class >= EVERGREEN)
evergreen_context_flush_dest_caches(ctx);
else
@@ -1542,6 +1557,11 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags)
r600_init_cs(ctx);
+ if (streamout_suspended) {
+ ctx->streamout_start = TRUE;
+ ctx->streamout_append_bitmask = ~0;
+ }
+
/* resume queries */
if (queries_suspended) {
r600_context_queries_resume(ctx);
@@ -1636,6 +1656,44 @@ static boolean r600_query_result(struct r600_context *ctx, struct r600_query *qu
results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
}
break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ /* SAMPLE_STREAMOUTSTATS stores this structure:
+ * {
+ * u64 NumPrimitivesWritten;
+ * u64 PrimitiveStorageNeeded;
+ * }
+ * We only need NumPrimitivesWritten here. */
+ while (results_base != query->results_end) {
+ query->result.u64 +=
+ r600_query_read_result(map + results_base, 2, 6, true);
+ results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+ }
+ break;
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ /* Here we read PrimitiveStorageNeeded. */
+ while (results_base != query->results_end) {
+ query->result.u64 +=
+ r600_query_read_result(map + results_base, 0, 4, true);
+ results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+ }
+ break;
+ case PIPE_QUERY_SO_STATISTICS:
+ while (results_base != query->results_end) {
+ query->result.so.num_primitives_written +=
+ r600_query_read_result(map + results_base, 2, 6, true);
+ query->result.so.primitives_storage_needed +=
+ r600_query_read_result(map + results_base, 0, 4, true);
+ results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+ }
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ while (results_base != query->results_end) {
+ query->result.b = query->result.b ||
+ r600_query_read_result(map + results_base, 2, 6, true) !=
+ r600_query_read_result(map + results_base, 0, 4, true);
+ results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+ }
+ break;
default:
assert(0);
}
@@ -1679,6 +1737,15 @@ void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
break;
case PIPE_QUERY_TIME_ELAPSED:
break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ case PIPE_QUERY_SO_STATISTICS:
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ results = ctx->ws->buffer_map(query->buffer->buf, ctx->cs, PIPE_TRANSFER_WRITE);
+ results = (u32*)((char*)results + query->results_end);
+ memset(results, 0, query->result_size);
+ ctx->ws->buffer_unmap(query->buffer->buf);
+ break;
default:
assert(0);
}
@@ -1692,6 +1759,15 @@ void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
ctx->pm4[ctx->pm4_cdwords++] = query->results_end;
ctx->pm4[ctx->pm4_cdwords++] = 0;
break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ case PIPE_QUERY_SO_STATISTICS:
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
+ ctx->pm4[ctx->pm4_cdwords++] = query->results_end;
+ ctx->pm4[ctx->pm4_cdwords++] = 0;
+ break;
case PIPE_QUERY_TIME_ELAPSED:
ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
@@ -1720,6 +1796,15 @@ void r600_query_end(struct r600_context *ctx, struct r600_query *query)
ctx->pm4[ctx->pm4_cdwords++] = query->results_end + 8;
ctx->pm4[ctx->pm4_cdwords++] = 0;
break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ case PIPE_QUERY_SO_STATISTICS:
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
+ ctx->pm4[ctx->pm4_cdwords++] = query->results_end + query->result_size/2;
+ ctx->pm4[ctx->pm4_cdwords++] = 0;
+ break;
case PIPE_QUERY_TIME_ELAPSED:
ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
@@ -1798,6 +1883,14 @@ struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned
query->result_size = 16;
query->num_cs_dw = 8;
break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ case PIPE_QUERY_SO_STATISTICS:
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
+ query->result_size = 32;
+ query->num_cs_dw = 6;
+ break;
default:
assert(0);
FREE(query);
@@ -1832,20 +1925,28 @@ boolean r600_context_query_result(struct r600_context *ctx,
{
boolean *result_b = (boolean*)vresult;
uint64_t *result_u64 = (uint64_t*)vresult;
+ struct pipe_query_data_so_statistics *result_so =
+ (struct pipe_query_data_so_statistics*)vresult;
if (!r600_query_result(ctx, query, wait))
return FALSE;
switch (query->type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
*result_u64 = query->result.u64;
break;
case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
*result_b = query->result.b;
break;
case PIPE_QUERY_TIME_ELAPSED:
*result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
break;
+ case PIPE_QUERY_SO_STATISTICS:
+ *result_so = query->result.so;
+ break;
default:
assert(0);
}
@@ -1872,3 +1973,237 @@ void r600_context_queries_resume(struct r600_context *ctx)
r600_query_begin(ctx, query);
}
}
+
+static void r600_flush_vgt_streamout(struct r600_context *ctx)
+{
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = (R_008490_CP_STRMOUT_CNTL - R600_CONFIG_REG_OFFSET) >> 2;
+ ctx->pm4[ctx->pm4_cdwords++] = 0;
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0);
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
+ ctx->pm4[ctx->pm4_cdwords++] = R_008490_CP_STRMOUT_CNTL >> 2; /* register */
+ ctx->pm4[ctx->pm4_cdwords++] = 0;
+ ctx->pm4[ctx->pm4_cdwords++] = S_008490_OFFSET_UPDATE_DONE(1); /* reference value */
+ ctx->pm4[ctx->pm4_cdwords++] = S_008490_OFFSET_UPDATE_DONE(1); /* mask */
+ ctx->pm4[ctx->pm4_cdwords++] = 4; /* poll interval */
+}
+
+static void r600_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit)
+{
+ if (buffer_enable_bit) {
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = (R_028AB0_VGT_STRMOUT_EN - R600_CONTEXT_REG_OFFSET) >> 2;
+ ctx->pm4[ctx->pm4_cdwords++] = S_028AB0_STREAMOUT(1);
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = (R_028B20_VGT_STRMOUT_BUFFER_EN - R600_CONTEXT_REG_OFFSET) >> 2;
+ ctx->pm4[ctx->pm4_cdwords++] = buffer_enable_bit;
+ } else {
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = (R_028AB0_VGT_STRMOUT_EN - R600_CONTEXT_REG_OFFSET) >> 2;
+ ctx->pm4[ctx->pm4_cdwords++] = S_028AB0_STREAMOUT(0);
+ }
+}
+
+void r600_context_streamout_begin(struct r600_context *ctx)
+{
+ struct r600_so_target **t = ctx->so_targets;
+ unsigned *strides = ctx->vs_shader_so_strides;
+ unsigned buffer_en, i, update_flags = 0;
+
+ buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) |
+ (ctx->num_so_targets >= 2 && t[1] ? 2 : 0) |
+ (ctx->num_so_targets >= 3 && t[2] ? 4 : 0) |
+ (ctx->num_so_targets >= 4 && t[3] ? 8 : 0);
+
+ ctx->num_cs_dw_streamout_end =
+ 12 + /* flush_vgt_streamout */
+ util_bitcount(buffer_en) * 8 +
+ 8;
+
+ r600_need_cs_space(ctx,
+ 12 + /* flush_vgt_streamout */
+ 6 + /* enables */
+ util_bitcount(buffer_en & ctx->streamout_append_bitmask) * 8 +
+ util_bitcount(buffer_en & ~ctx->streamout_append_bitmask) * 6 +
+ (ctx->screen->family > CHIP_R600 && ctx->screen->family < CHIP_RV770 ? 2 : 0) +
+ ctx->num_cs_dw_streamout_end, TRUE);
+
+ if (ctx->screen->chip_class >= EVERGREEN) {
+ evergreen_flush_vgt_streamout(ctx);
+ evergreen_set_streamout_enable(ctx, buffer_en);
+ } else {
+ r600_flush_vgt_streamout(ctx);
+ r600_set_streamout_enable(ctx, buffer_en);
+ }
+
+ for (i = 0; i < ctx->num_so_targets; i++) {
+ if (t[i]) {
+ t[i]->stride = strides[i];
+ t[i]->so_index = i;
+
+ update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i);
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 3, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = (R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 +
+ 16*i - R600_CONTEXT_REG_OFFSET) >> 2;
+ ctx->pm4[ctx->pm4_cdwords++] = (t[i]->b.buffer_offset +
+ t[i]->b.buffer_size) >> 2; /* BUFFER_SIZE (in DW) */
+ ctx->pm4[ctx->pm4_cdwords++] = strides[i] >> 2; /* VTX_STRIDE (in DW) */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* BUFFER_BASE */
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+ ctx->pm4[ctx->pm4_cdwords++] =
+ r600_context_bo_reloc(ctx, r600_resource(t[i]->b.buffer),
+ RADEON_USAGE_WRITE);
+
+ if (ctx->streamout_append_bitmask & (1 << i)) {
+ /* Append. */
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = STRMOUT_SELECT_BUFFER(i) |
+ STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM); /* control */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address lo */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address hi */
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+ ctx->pm4[ctx->pm4_cdwords++] =
+ r600_context_bo_reloc(ctx, t[i]->filled_size,
+ RADEON_USAGE_READ);
+ } else {
+ /* Start from the beginning. */
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = STRMOUT_SELECT_BUFFER(i) |
+ STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET); /* control */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+ ctx->pm4[ctx->pm4_cdwords++] = t[i]->b.buffer_offset >> 2; /* buffer offset in DW */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+ }
+ }
+ }
+
+ if (ctx->screen->family > CHIP_R600 && ctx->screen->family < CHIP_RV770) {
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SURFACE_BASE_UPDATE, 0, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = update_flags;
+ }
+}
+
+void r600_context_streamout_end(struct r600_context *ctx)
+{
+ struct r600_so_target **t = ctx->so_targets;
+ unsigned i, flush_flags = 0;
+
+ if (ctx->screen->chip_class >= EVERGREEN) {
+ evergreen_flush_vgt_streamout(ctx);
+ } else {
+ r600_flush_vgt_streamout(ctx);
+ }
+
+ for (i = 0; i < ctx->num_so_targets; i++) {
+ if (t[i]) {
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = STRMOUT_SELECT_BUFFER(i) |
+ STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
+ STRMOUT_STORE_BUFFER_FILLED_SIZE; /* control */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* dst address lo */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* dst address hi */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+ ctx->pm4[ctx->pm4_cdwords++] =
+ r600_context_bo_reloc(ctx, t[i]->filled_size,
+ RADEON_USAGE_WRITE);
+
+ flush_flags |= S_0085F0_SO0_DEST_BASE_ENA(1) << i;
+ }
+ }
+
+ if (ctx->screen->chip_class >= EVERGREEN) {
+ evergreen_set_streamout_enable(ctx, 0);
+ } else {
+ r600_set_streamout_enable(ctx, 0);
+ }
+
+ if (ctx->screen->family < CHIP_RV770) {
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | EVENT_INDEX(0);
+ } else {
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SURFACE_SYNC, 3, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = flush_flags; /* CP_COHER_CNTL */
+ ctx->pm4[ctx->pm4_cdwords++] = 0xffffffff; /* CP_COHER_SIZE */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* CP_COHER_BASE */
+ ctx->pm4[ctx->pm4_cdwords++] = 0x0000000A; /* POLL_INTERVAL */
+ }
+
+ ctx->num_cs_dw_streamout_end = 0;
+
+ /* XXX print some debug info */
+ for (i = 0; i < ctx->num_so_targets; i++) {
+ if (!t[i])
+ continue;
+
+ uint32_t *ptr = ctx->ws->buffer_map(t[i]->filled_size->buf, ctx->cs, RADEON_USAGE_READ);
+ printf("FILLED_SIZE%i: %u\n", i, *ptr);
+ ctx->ws->buffer_unmap(t[i]->filled_size->buf);
+ }
+}
+
+void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
+{
+ r600_need_cs_space(ctx, 14 + 21, TRUE);
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - R600_CONTEXT_REG_OFFSET) >> 2;
+ ctx->pm4[ctx->pm4_cdwords++] = 0;
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - R600_CONTEXT_REG_OFFSET) >> 2;
+ ctx->pm4[ctx->pm4_cdwords++] = t->stride >> 2;
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_COPY_DW, 4, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address lo */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address hi */
+ ctx->pm4[ctx->pm4_cdwords++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = r600_context_bo_reloc(ctx, t->filled_size,
+ RADEON_USAGE_READ);
+
+#if 0 /* I have not found this useful yet. */
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_COPY_DW, 4, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
+ ctx->pm4[ctx->pm4_cdwords++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+ ctx->pm4[ctx->pm4_cdwords++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = (R_0085F0_CP_COHER_CNTL - R600_CONFIG_REG_OFFSET) >> 2;
+ ctx->pm4[ctx->pm4_cdwords++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = (R_0085F8_CP_COHER_BASE - R600_CONFIG_REG_OFFSET) >> 2;
+ ctx->pm4[ctx->pm4_cdwords++] = t->b.buffer_offset >> 2;
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = r600_context_bo_reloc(ctx, (struct r600_resource*)t->b.buffer,
+ RADEON_USAGE_WRITE);
+
+ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
+ ctx->pm4[ctx->pm4_cdwords++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
+ ctx->pm4[ctx->pm4_cdwords++] = R_0085FC_CP_COHER_STATUS >> 2; /* register */
+ ctx->pm4[ctx->pm4_cdwords++] = 0;
+ ctx->pm4[ctx->pm4_cdwords++] = 0; /* reference value */
+ ctx->pm4[ctx->pm4_cdwords++] = 0xffffffff; /* mask */
+ ctx->pm4[ctx->pm4_cdwords++] = 4; /* poll interval */
+#endif
+}
diff --git a/src/gallium/drivers/r600/r600_hw_context_priv.h b/src/gallium/drivers/r600/r600_hw_context_priv.h
index bea613551a7..206de7e819f 100644
--- a/src/gallium/drivers/r600/r600_hw_context_priv.h
+++ b/src/gallium/drivers/r600/r600_hw_context_priv.h
@@ -76,6 +76,13 @@ void r600_context_reg(struct r600_context *ctx,
void r600_init_cs(struct r600_context *ctx);
int r600_resource_init(struct r600_context *ctx, struct r600_range *range, unsigned offset, unsigned nblocks, unsigned stride, struct r600_reg *reg, int nreg, unsigned offset_base);
+/*
+ * evergreen_hw_context.c
+ */
+void evergreen_flush_vgt_streamout(struct r600_context *ctx);
+void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit);
+
+
static INLINE unsigned r600_context_bo_reloc(struct r600_context *ctx, struct r600_resource *rbo,
enum radeon_bo_usage usage)
{
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 97c6808260d..22fefaa2c49 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -378,6 +378,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_PRIMITIVE_RESTART:
case PIPE_CAP_CONDITIONAL_RENDER:
case PIPE_CAP_TEXTURE_BARRIER:
+ case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
return 1;
/* Supported except the original R600. */
@@ -391,17 +392,21 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
return family >= CHIP_CEDAR ? 1 : 0;
/* Unsupported features. */
- case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
- case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_ATTRIBS:
- case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
- case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
- case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
case PIPE_CAP_TGSI_INSTANCEID:
case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
case PIPE_CAP_SCALED_RESOLVE:
return 0;
+ /* Stream output. */
+ case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+ return debug_get_bool_option("R600_STREAMOUT", FALSE) ? 4 : 0;
+ case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_ATTRIBS:
+ return 16;
+ case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+ case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+ return 16*4;
+
/* Texturing. */
case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index a127eed6f38..46443af0a07 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -133,6 +133,8 @@ struct r600_pipe_shader {
struct r600_vertex_element vertex_elements;
struct tgsi_token *tokens;
unsigned sprite_coord_enable;
+ struct pipe_stream_output_info so;
+ unsigned so_strides[4];
};
struct r600_pipe_sampler_state {
@@ -348,6 +350,19 @@ void r600_delete_ps_shader(struct pipe_context *ctx, void *state);
void r600_delete_vs_shader(struct pipe_context *ctx, void *state);
void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index,
struct pipe_resource *buffer);
+struct pipe_stream_output_target *
+r600_create_so_target(struct pipe_context *ctx,
+ struct pipe_resource *buffer,
+ unsigned buffer_offset,
+ unsigned buffer_size);
+void r600_so_target_destroy(struct pipe_context *ctx,
+ struct pipe_stream_output_target *target);
+void r600_set_so_targets(struct pipe_context *ctx,
+ unsigned num_targets,
+ struct pipe_stream_output_target **targets,
+ unsigned append_bitmask);
+
+
void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info);
/*
diff --git a/src/gallium/drivers/r600/r600_query.c b/src/gallium/drivers/r600/r600_query.c
index ec0d91f0874..ee2d04b3684 100644
--- a/src/gallium/drivers/r600/r600_query.c
+++ b/src/gallium/drivers/r600/r600_query.c
@@ -100,7 +100,21 @@ static void r600_render_condition(struct pipe_context *ctx,
}
rctx->ctx.predicate_drawing = true;
- r600_query_predication(&rctx->ctx, rquery, PREDICATION_OP_ZPASS, wait_flag);
+
+ switch (rquery->type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ r600_query_predication(&rctx->ctx, rquery, PREDICATION_OP_ZPASS, wait_flag);
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ case PIPE_QUERY_SO_STATISTICS:
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ r600_query_predication(&rctx->ctx, rquery, PREDICATION_OP_PRIMCOUNT, wait_flag);
+ break;
+ default:
+ assert(0);
+ }
}
void r600_init_query_functions(struct r600_pipe_context *rctx)
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 61f88f4efd0..ad4aded95cb 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -119,6 +119,19 @@ int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *s
if (dump_shaders) {
fprintf(stderr, "--------------------------------------------------------------\n");
tgsi_dump(shader->tokens, 0);
+
+ if (shader->so.num_outputs) {
+ unsigned i;
+ fprintf(stderr, "STREAMOUT\n");
+ for (i = 0; i < shader->so.num_outputs; i++) {
+ fprintf(stderr, " %i: MEM_STREAM0_BUF%i OUT[%i].%s%s%s%s\n", i,
+ shader->so.output[i].output_buffer, shader->so.output[i].register_index,
+ shader->so.output[i].register_mask & 1 ? "x" : "_",
+ (shader->so.output[i].register_mask >> 1) & 1 ? "y" : "_",
+ (shader->so.output[i].register_mask >> 2) & 1 ? "z" : "_",
+ (shader->so.output[i].register_mask >> 3) & 1 ? "w" : "_");
+ }
+ }
}
r = r600_shader_from_tgsi(rctx, shader);
if (r) {
@@ -681,6 +694,7 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
{
struct r600_shader *shader = &pipeshader->shader;
struct tgsi_token *tokens = pipeshader->tokens;
+ struct pipe_stream_output_info so = pipeshader->so;
struct tgsi_full_immediate *immediate;
struct tgsi_full_property *property;
struct r600_shader_ctx ctx;
@@ -847,6 +861,93 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
}
}
+ /* Add stream outputs. */
+ if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
+ unsigned buffer_offset[PIPE_MAX_SO_BUFFERS] = {0};
+
+ for (i = 0; i < so.num_outputs; i++) {
+ struct r600_bytecode_output output;
+ unsigned comps;
+
+ if (so.output[i].output_buffer >= 4) {
+ R600_ERR("exceeded the max number of stream output buffers, got: %d\n",
+ so.output[i].output_buffer);
+ r = -EINVAL;
+ goto out_err;
+ }
+
+ switch (so.output[i].register_mask) {
+ case TGSI_WRITEMASK_XYZW:
+ comps = 4;
+ break;
+ case TGSI_WRITEMASK_XYZ:
+ comps = 3;
+ break;
+ case TGSI_WRITEMASK_XY:
+ comps = 2;
+ break;
+ case TGSI_WRITEMASK_X:
+ comps = 1;
+ break;
+ default:
+ R600_ERR("streamout: invalid register_mask, got: %x\n",
+ so.output[i].register_mask);
+ r = -EINVAL;
+ goto out_err;
+ }
+
+ memset(&output, 0, sizeof(struct r600_bytecode_output));
+ output.gpr = shader->output[so.output[i].register_index].gpr;
+ output.elem_size = 0;
+ output.array_base = buffer_offset[so.output[i].output_buffer];
+ output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
+ output.burst_count = 1;
+ output.barrier = 1;
+ output.array_size = 0;
+ output.comp_mask = so.output[i].register_mask;
+ if (ctx.bc->chip_class >= EVERGREEN) {
+ switch (so.output[i].output_buffer) {
+ case 0:
+ output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0;
+ break;
+ case 1:
+ output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1;
+ break;
+ case 2:
+ output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2;
+ break;
+ case 3:
+ output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3;
+ break;
+ }
+ } else {
+ switch (so.output[i].output_buffer) {
+ case 0:
+ output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0;
+ break;
+ case 1:
+ output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1;
+ break;
+ case 2:
+ output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2;
+ break;
+ case 3:
+ output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3;
+ break;
+ }
+ }
+ r = r600_bytecode_add_output(ctx.bc, &output);
+ if (r)
+ goto out_err;
+
+ buffer_offset[so.output[i].output_buffer] += comps;
+ }
+
+ for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+ pipeshader->so_strides[i] = buffer_offset[i] * 4;
+ }
+ }
+
/* export output */
j = 0;
for (i = 0, pos0 = 0; i < noutput; i++) {
diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h
index 56ed35e8b32..b9c41260f56 100644
--- a/src/gallium/drivers/r600/r600_sq.h
+++ b/src/gallium/drivers/r600/r600_sq.h
@@ -114,6 +114,10 @@
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS 0x00000001
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM 0x00000002
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_SX 0x00000003
+#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE 0x00000000
+#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND 0x00000001
+#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ 0x00000002
+#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND 0x00000003
#define S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((x) & 0x7F) << 15)
#define G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((x) >> 15) & 0x7F)
#define C_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR 0xFFC07FFF
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 8410cfe352f..7f44035d788 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -933,7 +933,7 @@ static void *r600_create_dsa_state(struct pipe_context *ctx,
}
static void *r600_create_rs_state(struct pipe_context *ctx,
- const struct pipe_rasterizer_state *state)
+ const struct pipe_rasterizer_state *state)
{
struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
struct r600_pipe_rasterizer *rs = CALLOC_STRUCT(r600_pipe_rasterizer);
@@ -978,8 +978,8 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
state->fill_back != PIPE_POLYGON_MODE_FILL);
r600_pipe_state_add_reg(rstate, R_028814_PA_SU_SC_MODE_CNTL,
S_028814_PROVOKING_VTX_LAST(prov_vtx) |
- S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
- S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
+ S_028814_CULL_FRONT(state->rasterizer_discard || (state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
+ S_028814_CULL_BACK(state->rasterizer_discard || (state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
S_028814_FACE(!state->front_ccw) |
S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) |
S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) |
@@ -1758,6 +1758,9 @@ void r600_init_state_functions(struct r600_pipe_context *rctx)
rctx->context.sampler_view_destroy = r600_sampler_view_destroy;
rctx->context.redefine_user_buffer = u_default_redefine_user_buffer;
rctx->context.texture_barrier = r600_texture_barrier;
+ rctx->context.create_stream_output_target = r600_create_so_target;
+ rctx->context.stream_output_target_destroy = r600_so_target_destroy;
+ rctx->context.set_stream_output_targets = r600_set_so_targets;
}
void r600_adjust_gprs(struct r600_pipe_context *rctx)
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index d6ffda47c2c..9f6f5142c09 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -247,10 +247,11 @@ void *r600_create_vertex_elements(struct pipe_context *ctx,
void *r600_create_shader_state(struct pipe_context *ctx,
const struct pipe_shader_state *state)
{
- struct r600_pipe_shader *shader = CALLOC_STRUCT(r600_pipe_shader);
+ struct r600_pipe_shader *shader = CALLOC_STRUCT(r600_pipe_shader);
int r;
shader->tokens = tgsi_dup_tokens(state->tokens);
+ shader->so = state->stream_output;
r = r600_pipe_shader_create(ctx, shader);
if (r) {
@@ -412,6 +413,71 @@ void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index,
pipe_resource_reference((struct pipe_resource**)&rbuffer, NULL);
}
+struct pipe_stream_output_target *
+r600_create_so_target(struct pipe_context *ctx,
+ struct pipe_resource *buffer,
+ unsigned buffer_offset,
+ unsigned buffer_size)
+{
+ struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+ struct r600_so_target *t;
+ void *ptr;
+
+ t = CALLOC_STRUCT(r600_so_target);
+ if (!t) {
+ return NULL;
+ }
+
+ t->b.reference.count = 1;
+ t->b.context = ctx;
+ pipe_resource_reference(&t->b.buffer, buffer);
+ t->b.buffer_offset = buffer_offset;
+ t->b.buffer_size = buffer_size;
+
+ t->filled_size = (struct r600_resource*)
+ pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STATIC, 4);
+ ptr = rctx->ws->buffer_map(t->filled_size->buf, rctx->ctx.cs, PIPE_TRANSFER_WRITE);
+ memset(ptr, 0, t->filled_size->buf->size);
+ rctx->ws->buffer_unmap(t->filled_size->buf);
+
+ return &t->b;
+}
+
+void r600_so_target_destroy(struct pipe_context *ctx,
+ struct pipe_stream_output_target *target)
+{
+ struct r600_so_target *t = (struct r600_so_target*)target;
+ pipe_resource_reference(&t->b.buffer, NULL);
+ pipe_resource_reference((struct pipe_resource**)&t->filled_size, NULL);
+ FREE(t);
+}
+
+void r600_set_so_targets(struct pipe_context *ctx,
+ unsigned num_targets,
+ struct pipe_stream_output_target **targets,
+ unsigned append_bitmask)
+{
+ struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+ unsigned i;
+
+ /* Stop streamout. */
+ if (rctx->ctx.num_so_targets) {
+ r600_context_streamout_end(&rctx->ctx);
+ }
+
+ /* Set the new targets. */
+ for (i = 0; i < num_targets; i++) {
+ pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->ctx.so_targets[i], targets[i]);
+ }
+ for (; i < rctx->ctx.num_so_targets; i++) {
+ pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->ctx.so_targets[i], NULL);
+ }
+
+ rctx->ctx.num_so_targets = num_targets;
+ rctx->ctx.streamout_start = num_targets != 0;
+ rctx->ctx.streamout_append_bitmask = append_bitmask;
+}
+
static void r600_vertex_buffer_update(struct r600_pipe_context *rctx)
{
struct r600_pipe_resource_state *rstate;
@@ -528,7 +594,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
struct pipe_index_buffer ib = {};
unsigned prim, mask, ls_mask = 0;
- if (!info.count ||
+ if ((!info.count && (info.indexed || !info.count_from_stream_output)) ||
(info.indexed && !rctx->vbuf_mgr->index_buffer.buffer) ||
!r600_conv_pipe_prim(info.mode, &prim)) {
return;
@@ -572,8 +638,15 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
} else {
info.index_bias = info.start;
rdraw.vgt_draw_initiator = V_0287F0_DI_SRC_SEL_AUTO_INDEX;
+ if (info.count_from_stream_output) {
+ rdraw.vgt_draw_initiator |= S_0287F0_USE_OPAQUE(1);
+
+ r600_context_draw_opaque_count(&rctx->ctx, (struct r600_so_target*)info.count_from_stream_output);
+ }
}
+ rctx->ctx.vs_shader_so_strides = rctx->vs_shader->so_strides;
+
mask = (1ULL << ((unsigned)rctx->framebuffer.nr_cbufs * 4)) - 1;
if (rctx->vgt.id != R600_PIPE_STATE_VGT) {
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index 7a2fe0265bc..ccdf82e6533 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -76,10 +76,23 @@
#define PKT3_DRAW_INDEX_IMMD 0x2E
#define PKT3_NUM_INSTANCES 0x2F
#define PKT3_STRMOUT_BUFFER_UPDATE 0x34
+#define STRMOUT_STORE_BUFFER_FILLED_SIZE 1
+#define STRMOUT_OFFSET_SOURCE(x) (((x) & 0x3) << 1)
+#define STRMOUT_OFFSET_FROM_PACKET 0
+#define STRMOUT_OFFSET_FROM_VGT_FILLED_SIZE 1
+#define STRMOUT_OFFSET_FROM_MEM 2
+#define STRMOUT_OFFSET_NONE 3
+#define STRMOUT_SELECT_BUFFER(x) (((x) & 0x3) << 8)
#define PKT3_INDIRECT_BUFFER_MP 0x38
#define PKT3_MEM_SEMAPHORE 0x39
#define PKT3_MPEG_INDEX 0x3A
+#define PKT3_COPY_DW 0x3B
+#define COPY_DW_SRC_IS_REG (0 << 0)
+#define COPY_DW_SRC_IS_MEM (1 << 0)
+#define COPY_DW_DST_IS_REG (0 << 1)
+#define COPY_DW_DST_IS_MEM (1 << 1)
#define PKT3_WAIT_REG_MEM 0x3C
+#define WAIT_REG_MEM_EQUAL 3
#define PKT3_MEM_WRITE 0x3D
#define PKT3_INDIRECT_BUFFER 0x32
#define PKT3_CP_INTERRUPT 0x40
@@ -106,6 +119,8 @@
#define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14
#define EVENT_TYPE_ZPASS_DONE 0x15
#define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT 0x16
+#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH 0x1f
+#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS 0x20
#define EVENT_TYPE(x) ((x) << 0)
#define EVENT_INDEX(x) ((x) << 8)
/* 0 - any non-TS event
@@ -147,6 +162,12 @@
#define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PRED_S(predicate))
/* Registers */
+#define R_008490_CP_STRMOUT_CNTL 0x008490
+#define S_008490_OFFSET_UPDATE_DONE(x) (((x) & 0x1) << 0)
+#define R_008960_VGT_STRMOUT_BUFFER_FILLED_SIZE_0 0x008960 /* read-only */
+#define R_008964_VGT_STRMOUT_BUFFER_FILLED_SIZE_1 0x008964 /* read-only */
+#define R_008968_VGT_STRMOUT_BUFFER_FILLED_SIZE_2 0x008968 /* read-only */
+#define R_00896C_VGT_STRMOUT_BUFFER_FILLED_SIZE_3 0x00896C /* read-only */
#define R_008C00_SQ_CONFIG 0x00008C00
#define S_008C00_VC_ENABLE(x) (((x) & 0x1) << 0)
#define G_008C00_VC_ENABLE(x) (((x) >> 0) & 0x1)
@@ -3144,6 +3165,26 @@
#define S_028AB8_VTX_CNT_EN(x) (((x) & 0x1) << 0)
#define G_028AB8_VTX_CNT_EN(x) (((x) >> 0) & 0x1)
#define C_028AB8_VTX_CNT_EN 0xFFFFFFFE
+#define R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 0x028AD0
+#define R_028AD4_VGT_STRMOUT_VTX_STRIDE_0 0x028AD4
+#define R_028AD8_VGT_STRMOUT_BUFFER_BASE_0 0x028AD8
+#define R_028ADC_VGT_STRMOUT_BUFFER_OFFSET_0 0x028ADC
+#define R_028AE0_VGT_STRMOUT_BUFFER_SIZE_1 0x028AE0
+#define R_028AE4_VGT_STRMOUT_VTX_STRIDE_1 0x028AE4
+#define R_028AE8_VGT_STRMOUT_BUFFER_BASE_1 0x028AE8
+#define R_028AEC_VGT_STRMOUT_BUFFER_OFFSET_1 0x028AEC
+#define R_028AF0_VGT_STRMOUT_BUFFER_SIZE_2 0x028AF0
+#define R_028AF4_VGT_STRMOUT_VTX_STRIDE_2 0x028AF4
+#define R_028AF8_VGT_STRMOUT_BUFFER_BASE_2 0x028AF8
+#define R_028AFC_VGT_STRMOUT_BUFFER_OFFSET_2 0x028AFC
+#define R_028B00_VGT_STRMOUT_BUFFER_SIZE_3 0x028B00
+#define R_028B04_VGT_STRMOUT_VTX_STRIDE_3 0x028B04
+#define R_028B08_VGT_STRMOUT_BUFFER_BASE_3 0x028B08
+#define R_028B0C_VGT_STRMOUT_BUFFER_OFFSET_3 0x028B0C
+#define R_028B10_VGT_STRMOUT_BASE_OFFSET_0 0x028B10
+#define R_028B14_VGT_STRMOUT_BASE_OFFSET_1 0x028B14
+#define R_028B18_VGT_STRMOUT_BASE_OFFSET_2 0x028B18
+#define R_028B1C_VGT_STRMOUT_BASE_OFFSET_3 0x028B1C
#define R_028B20_VGT_STRMOUT_BUFFER_EN 0x028B20
#define S_028B20_BUFFER_0_EN(x) (((x) & 0x1) << 0)
#define G_028B20_BUFFER_0_EN(x) (((x) >> 0) & 0x1)
@@ -3157,6 +3198,13 @@
#define S_028B20_BUFFER_3_EN(x) (((x) & 0x1) << 3)
#define G_028B20_BUFFER_3_EN(x) (((x) >> 3) & 0x1)
#define C_028B20_BUFFER_3_EN 0xFFFFFFF7
+#define R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET 0x028B28
+#define R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE 0x028B2C
+#define R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE 0x028B30
+#define R_028B44_VGT_STRMOUT_BASE_OFFSET_HI_0 0x028B44
+#define R_028B48_VGT_STRMOUT_BASE_OFFSET_HI_1 0x028B48
+#define R_028B4C_VGT_STRMOUT_BASE_OFFSET_HI_2 0x028B4C
+#define R_028B50_VGT_STRMOUT_BASE_OFFSET_HI_3 0x028B50
#define R_028C20_PA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX 0x028C20
#define S_028C20_S4_X(x) (((x) & 0xF) << 0)
#define G_028C20_S4_X(x) (((x) >> 0) & 0xF)
@@ -3280,6 +3328,9 @@
#define S_0085F0_CR2_ACTION_ENA(x) (((x) & 0x1) << 31)
#define G_0085F0_CR2_ACTION_ENA(x) (((x) >> 31) & 0x1)
#define C_0085F0_CR2_ACTION_ENA 0x7FFFFFFF
+#define R_0085F4_CP_COHER_SIZE 0x0085F4
+#define R_0085F8_CP_COHER_BASE 0x0085F8
+#define R_0085FC_CP_COHER_STATUS 0x0085FC
#define R_02812C_CB_CLEAR_ALPHA 0x02812C