diff options
author | Dave Airlie <[email protected]> | 2017-12-04 05:31:46 +1000 |
---|---|---|
committer | Dave Airlie <[email protected]> | 2018-01-29 05:42:28 +1000 |
commit | 1c9ea24a19a28e87f6038281c516287f25ad88b5 (patch) | |
tree | f59e78bd4516fc09ee7133de40c2dfa6fe5eb7e3 /src | |
parent | a7ec366e503cc2b05d6920fa5027b0f001ae9e58 (diff) |
r600: add ARB_query_buffer_object support
This uses a different shader than radeonsi, as we can't address non-256
aligned ssbos, which the radeonsi code does. This passes some extra
offsets into the shader.
It also contains a set of u64 instruction implementation that may
or may not be complete (at least the u64div is definitely not something
that works outside this use-case). If r600 grows 64-bit integers,
it will use the GLSL lowering for divmod.
Reviewed-by: Roland Scheidegger <[email protected]>
Signed-off-by: Dave Airlie <[email protected]>
Diffstat (limited to 'src')
-rw-r--r-- | src/gallium/drivers/r600/evergreen_state.c | 60 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_hw_context.c | 5 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_pipe.c | 4 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_pipe.h | 1 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_pipe_common.c | 5 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_pipe_common.h | 1 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_query.c | 63 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_shader.c | 707 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_state_common.c | 1 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600d.h | 1 |
10 files changed, 817 insertions, 31 deletions
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index ccd20a8ddd8..63a39a23f8f 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -22,6 +22,7 @@ */ #include "r600_formats.h" #include "r600_shader.h" +#include "r600_query.h" #include "evergreend.h" #include "pipe/p_shader_tokens.h" @@ -4235,6 +4236,64 @@ static void evergreen_set_shader_images(struct pipe_context *ctx, r600_mark_atom_dirty(rctx, &istate->atom); } +static void evergreen_get_pipe_constant_buffer(struct r600_context *rctx, + enum pipe_shader_type shader, uint slot, + struct pipe_constant_buffer *cbuf) +{ + struct r600_constbuf_state *state = &rctx->constbuf_state[shader]; + struct pipe_constant_buffer *cb; + cbuf->user_buffer = NULL; + + cb = &state->cb[slot]; + + cbuf->buffer_size = cb->buffer_size; + pipe_resource_reference(&cbuf->buffer, cb->buffer); +} + +static void evergreen_get_shader_buffers(struct r600_context *rctx, + enum pipe_shader_type shader, + uint start_slot, uint count, + struct pipe_shader_buffer *sbuf) +{ + assert(shader == PIPE_SHADER_COMPUTE); + int idx, i; + struct r600_image_state *istate = &rctx->compute_buffers; + struct r600_image_view *rview; + + for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) { + + rview = &istate->views[i]; + + pipe_resource_reference(&sbuf[idx].buffer, rview->base.resource); + if (rview->base.resource) { + uint64_t rview_va = ((struct r600_resource *)rview->base.resource)->gpu_address; + + uint64_t prog_va = rview->resource_words[0]; + + prog_va += ((uint64_t)G_030008_BASE_ADDRESS_HI(rview->resource_words[2])) << 32; + prog_va -= rview_va; + + sbuf[idx].buffer_offset = prog_va & 0xffffffff; + sbuf[idx].buffer_size = rview->resource_words[1] + 1;; + } else { + sbuf[idx].buffer_offset = 0; + sbuf[idx].buffer_size = 0; + } + } +} + +static void evergreen_save_qbo_state(struct pipe_context *ctx, struct r600_qbo_state *st) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + st->saved_compute = rctx->cs_shader_state.shader; + + /* save constant buffer 0 */ + evergreen_get_pipe_constant_buffer(rctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); + /* save ssbo 0 */ + evergreen_get_shader_buffers(rctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo); +} + + void evergreen_init_state_functions(struct r600_context *rctx) { unsigned id = 1; @@ -4332,6 +4391,7 @@ void evergreen_init_state_functions(struct r600_context *rctx) else rctx->b.b.get_sample_position = cayman_get_sample_position; rctx->b.dma_copy = evergreen_dma_copy; + rctx->b.save_qbo_state = evergreen_save_qbo_state; evergreen_init_compute_state_functions(rctx); } diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c index 259e1a826a3..4b6d951af6b 100644 --- a/src/gallium/drivers/r600/r600_hw_context.c +++ b/src/gallium/drivers/r600/r600_hw_context.c @@ -123,6 +123,11 @@ void r600_flush_emit(struct r600_context *rctx) radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); } + if (rctx->b.flags & R600_CONTEXT_CS_PARTIAL_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + } + if (wait_until) { /* Use of WAIT_UNTIL is deprecated on Cayman+ */ if (rctx->b.family < CHIP_CAYMAN) { diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index e7f8ae83ecc..bd2cd884cb9 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -352,6 +352,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_PACK_HALF_FLOAT: case PIPE_CAP_TGSI_CLOCK: case PIPE_CAP_TGSI_ARRAY_COMPONENTS: + case PIPE_CAP_QUERY_BUFFER_OBJECT: return family >= CHIP_CEDAR ? 1 : 0; case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: return family >= CHIP_CEDAR ? 4 : 0; @@ -384,7 +385,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: case PIPE_CAP_GENERATE_MIPMAP: case PIPE_CAP_STRING_MARKER: - case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES: case PIPE_CAP_TGSI_VOTE: case PIPE_CAP_MAX_WINDOW_RECTANGLES: @@ -759,7 +759,7 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws, R600_CONTEXT_INV_VERTEX_CACHE | R600_CONTEXT_INV_TEX_CACHE | R600_CONTEXT_INV_CONST_CACHE; - rscreen->b.barrier_flags.compute_to_L2 = R600_CONTEXT_PS_PARTIAL_FLUSH; + rscreen->b.barrier_flags.compute_to_L2 = R600_CONTEXT_CS_PARTIAL_FLUSH | R600_CONTEXT_FLUSH_AND_INV; rscreen->global_pool = compute_memory_pool_new(rscreen); diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 0b4542fb7ee..e9f95a512f6 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -63,6 +63,7 @@ #define R600_CONTEXT_PS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 8) #define R600_CONTEXT_WAIT_3D_IDLE (R600_CONTEXT_PRIVATE_FLAG << 9) #define R600_CONTEXT_WAIT_CP_DMA_IDLE (R600_CONTEXT_PRIVATE_FLAG << 10) +#define R600_CONTEXT_CS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 11) /* the number of CS dwords for flushing and drawing */ #define R600_MAX_FLUSH_CS_DWORDS 18 diff --git a/src/gallium/drivers/r600/r600_pipe_common.c b/src/gallium/drivers/r600/r600_pipe_common.c index e02c43f93b8..5d14e588b51 100644 --- a/src/gallium/drivers/r600/r600_pipe_common.c +++ b/src/gallium/drivers/r600/r600_pipe_common.c @@ -134,6 +134,7 @@ unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen) } void r600_gfx_wait_fence(struct r600_common_context *ctx, + struct r600_resource *buf, uint64_t va, uint32_t ref, uint32_t mask) { struct radeon_winsys_cs *cs = ctx->gfx.cs; @@ -145,6 +146,10 @@ void r600_gfx_wait_fence(struct r600_common_context *ctx, radeon_emit(cs, ref); /* reference value */ radeon_emit(cs, mask); /* mask */ radeon_emit(cs, 4); /* poll interval */ + + if (buf) + r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_READ, + RADEON_PRIO_QUERY); } void r600_draw_rectangle(struct blitter_context *blitter, diff --git a/src/gallium/drivers/r600/r600_pipe_common.h b/src/gallium/drivers/r600/r600_pipe_common.h index 1fdbc39a560..86a20f8639b 100644 --- a/src/gallium/drivers/r600/r600_pipe_common.h +++ b/src/gallium/drivers/r600/r600_pipe_common.h @@ -679,6 +679,7 @@ void r600_gfx_write_event_eop(struct r600_common_context *ctx, uint32_t new_fence, unsigned query_type); unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen); void r600_gfx_wait_fence(struct r600_common_context *ctx, + struct r600_resource *buf, uint64_t va, uint32_t ref, uint32_t mask); void r600_draw_rectangle(struct blitter_context *blitter, void *vertex_elements_cso, diff --git a/src/gallium/drivers/r600/r600_query.c b/src/gallium/drivers/r600/r600_query.c index b4519830cc4..729c6f2aaa7 100644 --- a/src/gallium/drivers/r600/r600_query.c +++ b/src/gallium/drivers/r600/r600_query.c @@ -535,7 +535,8 @@ static bool r600_query_hw_prepare_buffer(struct r600_common_screen *rscreen, memset(results, 0, buffer->b.b.width0); if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER || - query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) { + query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE || + query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { unsigned max_rbs = rscreen->info.num_render_backends; unsigned enabled_rb_mask = rscreen->info.enabled_rb_mask; unsigned num_results; @@ -620,6 +621,7 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_screen *rscree switch (query_type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: query->result_size = 16 * rscreen->info.num_render_backends; query->result_size += 16; /* for the fence + alignment */ query->num_cs_dw_begin = 6; @@ -676,7 +678,8 @@ static void r600_update_occlusion_query_state(struct r600_common_context *rctx, unsigned type, int diff) { if (type == PIPE_QUERY_OCCLUSION_COUNTER || - type == PIPE_QUERY_OCCLUSION_PREDICATE) { + type == PIPE_QUERY_OCCLUSION_PREDICATE || + type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { bool old_enable = rctx->num_occlusion_queries != 0; bool old_perfect_enable = rctx->num_perfect_occlusion_queries != 0; @@ -685,7 +688,7 @@ static void r600_update_occlusion_query_state(struct r600_common_context *rctx, rctx->num_occlusion_queries += diff; assert(rctx->num_occlusion_queries >= 0); - if (type == PIPE_QUERY_OCCLUSION_COUNTER) { + if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { rctx->num_perfect_occlusion_queries += diff; assert(rctx->num_perfect_occlusion_queries >= 0); } @@ -730,6 +733,7 @@ static void r600_query_hw_do_emit_start(struct r600_common_context *ctx, switch (query->b.type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); radeon_emit(cs, va); @@ -810,6 +814,7 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, switch (query->b.type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: va += 8; radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); @@ -922,6 +927,7 @@ static void r600_emit_query_predication(struct r600_common_context *ctx, switch (query->b.type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: op = PRED_OP(PREDICATION_OP_ZPASS); break; case PIPE_QUERY_SO_OVERFLOW_PREDICATE: @@ -1084,6 +1090,7 @@ static void r600_get_hw_query_params(struct r600_common_context *rctx, switch (rquery->b.type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: params->start_offset = 0; params->end_offset = 8; params->fence_offset = max_rbs * 16; @@ -1176,7 +1183,8 @@ static void r600_query_hw_add_result(struct r600_common_screen *rscreen, } break; } - case PIPE_QUERY_OCCLUSION_PREDICATE: { + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: { for (unsigned i = 0; i < max_rbs; ++i) { unsigned results_base = i * 16; result->b = result->b || @@ -1383,6 +1391,8 @@ bool r600_query_hw_get_result(struct r600_common_context *rctx, * 1.x = fence_offset * 1.y = pair_stride * 1.z = pair_count + * 1.w = result_offset + * 2.x = buffer0 offset * * BUFFER[0] = query result buffer * BUFFER[1] = previous summary buffer @@ -1404,7 +1414,7 @@ static void r600_create_query_result_shader(struct r600_common_context *rctx) "DCL BUFFER[0]\n" "DCL BUFFER[1]\n" "DCL BUFFER[2]\n" - "DCL CONST[0][0..1]\n" + "DCL CONST[0][0..2]\n" "DCL TEMP[0..5]\n" "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n" "IMM[1] UINT32 {1, 2, 4, 8}\n" @@ -1415,14 +1425,16 @@ static void r600_create_query_result_shader(struct r600_common_context *rctx) "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n" "UIF TEMP[5]\n" /* Check result availability. */ - "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n" + "UADD TEMP[1].x, CONST[0][1].xxxx, CONST[0][2].xxxx\n" + "LOAD TEMP[1].x, BUFFER[0], TEMP[1].xxxx\n" "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n" "MOV TEMP[1], TEMP[0].zzzz\n" "NOT TEMP[0].z, TEMP[0].zzzz\n" /* Load result if available. */ "UIF TEMP[1]\n" - "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n" + "UADD TEMP[0].x, IMM[0].xxxx, CONST[0][2].xxxx\n" + "LOAD TEMP[0].xy, BUFFER[0], TEMP[0].xxxx\n" "ENDIF\n" "ELSE\n" /* Load previously accumulated result if requested. */ @@ -1447,6 +1459,7 @@ static void r600_create_query_result_shader(struct r600_common_context *rctx) /* Load fence and check result availability */ "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n" + "UADD TEMP[5].x, TEMP[5].xxxx, CONST[0][2].xxxx\n" "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n" "NOT TEMP[0].z, TEMP[0].zzzz\n" @@ -1459,6 +1472,7 @@ static void r600_create_query_result_shader(struct r600_common_context *rctx) /* Load start and end. */ "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n" "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n" + "UADD TEMP[5].x, TEMP[5].xxxx, CONST[0][2].xxxx\n" "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n" @@ -1497,18 +1511,18 @@ static void r600_create_query_result_shader(struct r600_common_context *rctx) "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n" "UIF TEMP[4]\n" /* Store accumulated data for chaining. */ - "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n" + "STORE BUFFER[2].xyz, CONST[0][1].wwww, TEMP[0]\n" "ELSE\n" "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n" "UIF TEMP[4]\n" /* Store result availability. */ "NOT TEMP[0].z, TEMP[0]\n" "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n" - "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n" + "STORE BUFFER[2].x, CONST[0][1].wwww, TEMP[0].zzzz\n" "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" "UIF TEMP[4]\n" - "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n" + "STORE BUFFER[2].y, CONST[0][1].wwww, IMM[0].xxxx\n" "ENDIF\n" "ELSE\n" /* Store result if it is available. */ @@ -1531,7 +1545,7 @@ static void r600_create_query_result_shader(struct r600_common_context *rctx) "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" "UIF TEMP[4]\n" - "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n" + "STORE BUFFER[2].xy, CONST[0][1].wwww, TEMP[0].xyxy\n" "ELSE\n" /* Clamping */ "UIF TEMP[0].yyyy\n" @@ -1543,7 +1557,7 @@ static void r600_create_query_result_shader(struct r600_common_context *rctx) "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n" "ENDIF\n" - "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" + "STORE BUFFER[2].x, CONST[0][1].wwww, TEMP[0].xxxx\n" "ENDIF\n" "ENDIF\n" "ENDIF\n" @@ -1611,6 +1625,8 @@ static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, uint32_t fence_offset; uint32_t pair_stride; uint32_t pair_count; + uint32_t buffer_offset; + uint32_t buffer0_offset; } consts; if (!rctx->query_result_shader) { @@ -1656,7 +1672,8 @@ static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, consts.config = 0; if (index < 0) consts.config |= 4; - if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) + if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE || + query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) consts.config |= 8; else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) @@ -1696,18 +1713,20 @@ static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, params.start_offset += qbuf->results_end - query->result_size; } - rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer); - ssbo[0].buffer = &qbuf->buf->b.b; - ssbo[0].buffer_offset = params.start_offset; - ssbo[0].buffer_size = qbuf->results_end - params.start_offset; - + ssbo[0].buffer_offset = params.start_offset & ~0xff; + ssbo[0].buffer_size = qbuf->results_end - ssbo[0].buffer_offset; + consts.buffer0_offset = (params.start_offset & 0xff); if (!qbuf->previous) { + ssbo[2].buffer = resource; - ssbo[2].buffer_offset = offset; - ssbo[2].buffer_size = 8; + ssbo[2].buffer_offset = offset & ~0xff; + ssbo[2].buffer_size = offset + 8; + consts.buffer_offset = (offset & 0xff); + } else + consts.buffer_offset = 0; - } + rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer); rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo); @@ -1721,7 +1740,7 @@ static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size; va += params.fence_offset; - r600_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000); + r600_gfx_wait_fence(rctx, qbuf->buf, va, 0x80000000, 0x80000000); } rctx->b.launch_grid(&rctx->b, &grid); diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index cf669781202..3344bcb76a7 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -770,7 +770,7 @@ static int single_alu_op3(struct r600_shader_ctx *ctx, int op, int r; /* validate this for other ops */ - assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT); + assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT); memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = op; alu.src[0].sel = src0_sel; @@ -9457,7 +9457,8 @@ static int tgsi_opdst(struct r600_shader_ctx *ctx) return 0; } -static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type) +static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type, + struct r600_bytecode_alu_src *src) { struct r600_bytecode_alu alu; int r; @@ -9471,7 +9472,7 @@ static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type alu.dst.write = 1; alu.dst.chan = 0; - r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); + alu.src[0] = *src; alu.src[1].sel = V_SQ_ALU_SRC_0; alu.src[1].chan = 0; @@ -9697,7 +9698,8 @@ static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp) } #endif -static int emit_if(struct r600_shader_ctx *ctx, int opcode) +static int emit_if(struct r600_shader_ctx *ctx, int opcode, + struct r600_bytecode_alu_src *src) { int alu_type = CF_OP_ALU_PUSH_BEFORE; @@ -9711,7 +9713,7 @@ static int emit_if(struct r600_shader_ctx *ctx, int opcode) alu_type = CF_OP_ALU; } - emit_logic_pred(ctx, opcode, alu_type); + emit_logic_pred(ctx, opcode, alu_type, src); r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP); @@ -9723,12 +9725,17 @@ static int emit_if(struct r600_shader_ctx *ctx, int opcode) static int tgsi_if(struct r600_shader_ctx *ctx) { - return emit_if(ctx, ALU_OP2_PRED_SETNE); + struct r600_bytecode_alu_src alu_src; + r600_bytecode_src(&alu_src, &ctx->src[0], 0); + + return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src); } static int tgsi_uif(struct r600_shader_ctx *ctx) { - return emit_if(ctx, ALU_OP2_PRED_SETNE_INT); + struct r600_bytecode_alu_src alu_src; + r600_bytecode_src(&alu_src, &ctx->src[0], 0); + return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src); } static int tgsi_else(struct r600_shader_ctx *ctx) @@ -10077,6 +10084,684 @@ static int tgsi_clock(struct r600_shader_ctx *ctx) return 0; } +static int emit_u64add(struct r600_shader_ctx *ctx, int op, + int treg, + int src0_sel, int src0_chan, + int src1_sel, int src1_chan) +{ + struct r600_bytecode_alu alu; + int r; + int opc; + + if (op == ALU_OP2_ADD_INT) + opc = ALU_OP2_ADDC_UINT; + else + opc = ALU_OP2_SUBB_UINT; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = op; ; + alu.dst.sel = treg; + alu.dst.chan = 0; + alu.dst.write = 1; + alu.src[0].sel = src0_sel; + alu.src[0].chan = src0_chan + 0; + alu.src[1].sel = src1_sel; + alu.src[1].chan = src1_chan + 0; + alu.src[1].neg = 0; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = op; + alu.dst.sel = treg; + alu.dst.chan = 1; + alu.dst.write = 1; + alu.src[0].sel = src0_sel; + alu.src[0].chan = src0_chan + 1; + alu.src[1].sel = src1_sel; + alu.src[1].chan = src1_chan + 1; + alu.src[1].neg = 0; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = opc; + alu.dst.sel = treg; + alu.dst.chan = 2; + alu.dst.write = 1; + alu.last = 1; + alu.src[0].sel = src0_sel; + alu.src[0].chan = src0_chan + 0; + alu.src[1].sel = src1_sel; + alu.src[1].chan = src1_chan + 0; + alu.src[1].neg = 0; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = op; + alu.dst.sel = treg; + alu.dst.chan = 1; + alu.dst.write = 1; + alu.src[0].sel = treg; + alu.src[0].chan = 1; + alu.src[1].sel = treg; + alu.src[1].chan = 2; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + return 0; +} + +static int egcm_u64add(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_alu alu; + int r; + int treg = ctx->temp_reg; + int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT; + + if (ctx->src[1].neg) { + op = ALU_OP2_SUB_INT; + opc = ALU_OP2_SUBB_UINT; + } + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = op; ; + alu.dst.sel = treg; + alu.dst.chan = 0; + alu.dst.write = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); + r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); + alu.src[1].neg = 0; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = op; + alu.dst.sel = treg; + alu.dst.chan = 1; + alu.dst.write = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); + r600_bytecode_src(&alu.src[1], &ctx->src[1], 1); + alu.src[1].neg = 0; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = opc ; + alu.dst.sel = treg; + alu.dst.chan = 2; + alu.dst.write = 1; + alu.last = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); + r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); + alu.src[1].neg = 0; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = op; + tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst); + alu.src[0].sel = treg; + alu.src[0].chan = 1; + alu.src[1].sel = treg; + alu.src[1].chan = 2; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); + alu.src[0].sel = treg; + alu.src[0].chan = 0; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + return 0; +} + +/* result.y = mul_high a, b + result.x = mul a,b + result.y += a.x * b.y + a.y * b.x; +*/ +static int egcm_u64mul(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_alu alu; + int r; + int treg = ctx->temp_reg; + + /* temp.x = mul_lo a.x, b.x */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_MULLO_UINT; + alu.dst.sel = treg; + alu.dst.chan = 0; + alu.dst.write = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); + r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); + r = emit_mul_int_op(ctx->bc, &alu); + if (r) + return r; + + /* temp.y = mul_hi a.x, b.x */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_MULHI_UINT; + alu.dst.sel = treg; + alu.dst.chan = 1; + alu.dst.write = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); + r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); + r = emit_mul_int_op(ctx->bc, &alu); + if (r) + return r; + + /* temp.z = mul a.x, b.y */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_MULLO_UINT; + alu.dst.sel = treg; + alu.dst.chan = 2; + alu.dst.write = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); + r600_bytecode_src(&alu.src[1], &ctx->src[1], 1); + r = emit_mul_int_op(ctx->bc, &alu); + if (r) + return r; + + /* temp.w = mul a.y, b.x */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_MULLO_UINT; + alu.dst.sel = treg; + alu.dst.chan = 3; + alu.dst.write = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); + r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); + r = emit_mul_int_op(ctx->bc, &alu); + if (r) + return r; + + /* temp.z = temp.z + temp.w */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_ADD_INT; + alu.dst.sel = treg; + alu.dst.chan = 2; + alu.dst.write = 1; + alu.src[0].sel = treg; + alu.src[0].chan = 2; + alu.src[1].sel = treg; + alu.src[1].chan = 3; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + /* temp.y = temp.y + temp.z */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_ADD_INT; + alu.dst.sel = treg; + alu.dst.chan = 1; + alu.dst.write = 1; + alu.src[0].sel = treg; + alu.src[0].chan = 1; + alu.src[1].sel = treg; + alu.src[1].chan = 2; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + /* dst.x = temp.x */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); + alu.src[0].sel = treg; + alu.src[0].chan = 0; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + /* dst.y = temp.y */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst); + alu.src[0].sel = treg; + alu.src[0].chan = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + return 0; +} + +static int emit_u64sge(struct r600_shader_ctx *ctx, + int treg, + int src0_sel, int src0_base_chan, + int src1_sel, int src1_base_chan) +{ + int r; + /* for 64-bit sge */ + /* result = (src0.y > src1.y) || ((src0.y == src1.y) && src0.x >= src1.x)) */ + r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT, + treg, 1, + src0_sel, src0_base_chan + 1, + src1_sel, src1_base_chan + 1); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT, + treg, 0, + src0_sel, src0_base_chan, + src1_sel, src1_base_chan); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_SETE_INT, + treg, 2, + src0_sel, src0_base_chan + 1, + src1_sel, src1_base_chan + 1); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_AND_INT, + treg, 0, + treg, 0, + treg, 2); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_OR_INT, + treg, 0, + treg, 0, + treg, 1); + if (r) + return r; + return 0; +} + +/* this isn't a complete div it's just enough for qbo shader to work */ +static int egcm_u64div(struct r600_shader_ctx *ctx) +{ + struct r600_bytecode_alu alu; + struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src; + int r, i; + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + + /* make sure we are dividing my a const with 0 in the high bits */ + if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL) + return -1; + if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0) + return -1; + /* make sure we are doing one division */ + if (inst->Dst[0].Register.WriteMask != 0x3) + return -1; + + /* emit_if uses ctx->temp_reg so we can't */ + int treg = r600_get_temp(ctx); + int tmp_num = r600_get_temp(ctx); + int sub_tmp = r600_get_temp(ctx); + + /* tmp quot are tmp_num.zw */ + r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0); + r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1); + r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0); + r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1); + + /* MOV tmp_num.xy, numerator */ + r = single_alu_op2(ctx, ALU_OP1_MOV, + tmp_num, 0, + alu_num_lo.sel, alu_num_lo.chan, + 0, 0); + if (r) + return r; + r = single_alu_op2(ctx, ALU_OP1_MOV, + tmp_num, 1, + alu_num_hi.sel, alu_num_hi.chan, + 0, 0); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP1_MOV, + tmp_num, 2, + V_SQ_ALU_SRC_LITERAL, 0, + 0, 0); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP1_MOV, + tmp_num, 3, + V_SQ_ALU_SRC_LITERAL, 0, + 0, 0); + if (r) + return r; + + /* treg 0 is log2_denom */ + /* normally this gets the MSB for the denom high value + - however we know this will always be 0 here. */ + r = single_alu_op2(ctx, + ALU_OP1_MOV, + treg, 0, + V_SQ_ALU_SRC_LITERAL, 32, + 0, 0); + if (r) + return r; + + /* normally check demon hi for 0, but we know it is already */ + /* t0.z = num_hi >= denom_lo */ + r = single_alu_op2(ctx, + ALU_OP2_SETGE_UINT, + treg, 1, + alu_num_hi.sel, alu_num_hi.chan, + V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value); + if (r) + return r; + + memset(&alu_src, 0, sizeof(alu_src)); + alu_src.sel = treg; + alu_src.chan = 1; + r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src); + if (r) + return r; + + /* for loops in here */ + /* get msb t0.x = msb(src[1].x) first */ + int msb_lo = util_last_bit(alu_denom_lo.value); + r = single_alu_op2(ctx, ALU_OP1_MOV, + treg, 0, + V_SQ_ALU_SRC_LITERAL, msb_lo, + 0, 0); + if (r) + return r; + + /* unroll the asm here */ + for (i = 0; i < 31; i++) { + r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT, + treg, 2, + V_SQ_ALU_SRC_LITERAL, i, + treg, 0); + if (r) + return r; + + /* we can do this on the CPU */ + uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i); + /* t0.z = tmp_num.y >= t0.z */ + r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT, + treg, 1, + tmp_num, 1, + V_SQ_ALU_SRC_LITERAL, denom_lo_shl); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_AND_INT, + treg, 1, + treg, 1, + treg, 2); + if (r) + return r; + + memset(&alu_src, 0, sizeof(alu_src)); + alu_src.sel = treg; + alu_src.chan = 1; + r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_SUB_INT, + tmp_num, 1, + tmp_num, 1, + V_SQ_ALU_SRC_LITERAL, denom_lo_shl); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_OR_INT, + tmp_num, 3, + tmp_num, 3, + V_SQ_ALU_SRC_LITERAL, 1U << (31 - i)); + if (r) + return r; + + r = tgsi_endif(ctx); + if (r) + return r; + } + + /* log2_denom is always <= 31, so manually peel the last loop + * iteration. + */ + r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT, + treg, 1, + tmp_num, 1, + V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value); + if (r) + return r; + + memset(&alu_src, 0, sizeof(alu_src)); + alu_src.sel = treg; + alu_src.chan = 1; + r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_SUB_INT, + tmp_num, 1, + tmp_num, 1, + V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_OR_INT, + tmp_num, 3, + tmp_num, 3, + V_SQ_ALU_SRC_LITERAL, 1U); + if (r) + return r; + r = tgsi_endif(ctx); + if (r) + return r; + + r = tgsi_endif(ctx); + if (r) + return r; + + /* onto the second loop to unroll */ + for (i = 0; i < 31; i++) { + r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT, + treg, 1, + V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)), + treg, 0); + if (r) + return r; + + uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i); + r = single_alu_op2(ctx, ALU_OP1_MOV, + treg, 2, + V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff), + 0, 0); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP1_MOV, + treg, 3, + V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32), + 0, 0); + if (r) + return r; + + r = emit_u64sge(ctx, sub_tmp, + tmp_num, 0, + treg, 2); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_AND_INT, + treg, 1, + treg, 1, + sub_tmp, 0); + if (r) + return r; + + memset(&alu_src, 0, sizeof(alu_src)); + alu_src.sel = treg; + alu_src.chan = 1; + r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src); + if (r) + return r; + + + r = emit_u64add(ctx, ALU_OP2_SUB_INT, + sub_tmp, + tmp_num, 0, + treg, 2); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP1_MOV, + tmp_num, 0, + sub_tmp, 0, + 0, 0); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP1_MOV, + tmp_num, 1, + sub_tmp, 1, + 0, 0); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_OR_INT, + tmp_num, 2, + tmp_num, 2, + V_SQ_ALU_SRC_LITERAL, 1U << (31 - i)); + if (r) + return r; + + r = tgsi_endif(ctx); + if (r) + return r; + } + + /* log2_denom is always <= 63, so manually peel the last loop + * iteration. + */ + uint64_t denom_shl = (uint64_t)alu_denom_lo.value; + r = single_alu_op2(ctx, ALU_OP1_MOV, + treg, 2, + V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff), + 0, 0); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP1_MOV, + treg, 3, + V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32), + 0, 0); + if (r) + return r; + + r = emit_u64sge(ctx, sub_tmp, + tmp_num, 0, + treg, 2); + if (r) + return r; + + memset(&alu_src, 0, sizeof(alu_src)); + alu_src.sel = sub_tmp; + alu_src.chan = 0; + r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src); + if (r) + return r; + + r = emit_u64add(ctx, ALU_OP2_SUB_INT, + sub_tmp, + tmp_num, 0, + treg, 2); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_OR_INT, + tmp_num, 2, + tmp_num, 2, + V_SQ_ALU_SRC_LITERAL, 1U); + if (r) + return r; + r = tgsi_endif(ctx); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); + alu.src[0].sel = tmp_num; + alu.src[0].chan = 2; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst); + alu.src[0].sel = tmp_num; + alu.src[0].chan = 3; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + return 0; +} + +static int egcm_u64sne(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_alu alu; + int r; + int treg = ctx->temp_reg; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_SETNE_INT; + alu.dst.sel = treg; + alu.dst.chan = 0; + alu.dst.write = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); + r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_SETNE_INT; + alu.dst.sel = treg; + alu.dst.chan = 1; + alu.dst.write = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); + r600_bytecode_src(&alu.src[1], &ctx->src[1], 1); + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_OR_INT; + tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); + alu.src[0].sel = treg; + alu.src[0].chan = 0; + alu.src[1].sel = treg; + alu.src[1].chan = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + return 0; +} + static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = { [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_r600_arl}, [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2}, @@ -10497,6 +11182,10 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int}, [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double}, [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr}, + [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne }, + [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add }, + [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul }, + [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div }, [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported}, }; @@ -10719,5 +11408,9 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int}, [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double}, [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr}, + [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne }, + [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add }, + [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul }, + [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div }, [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported}, }; diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index b49b05608d6..d9c74b5f47c 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -3219,6 +3219,7 @@ void r600_init_common_state_functions(struct r600_context *rctx) rctx->b.b.texture_barrier = r600_texture_barrier; rctx->b.b.set_stream_output_targets = r600_set_streamout_targets; rctx->b.b.set_active_query_state = r600_set_active_query_state; + rctx->b.b.draw_vbo = r600_draw_vbo; rctx->b.invalidate_buffer = r600_invalidate_buffer; rctx->b.need_gfx_cs_space = r600_need_gfx_cs_space; diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h index 0d04708043e..0717eab9617 100644 --- a/src/gallium/drivers/r600/r600d.h +++ b/src/gallium/drivers/r600/r600d.h @@ -124,6 +124,7 @@ #define SURFACE_BASE_UPDATE_COLOR_NUM(x) (((1 << x) - 1) << 1) #define SURFACE_BASE_UPDATE_STRMOUT(x) (0x200 << (x)) +#define EVENT_TYPE_CS_PARTIAL_FLUSH 0x07 /* eg+ */ #define EVENT_TYPE_PS_PARTIAL_FLUSH 0x10 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14 #define EVENT_TYPE_ZPASS_DONE 0x15 |