diff options
author | Dave Airlie <[email protected]> | 2017-11-02 10:26:51 +1000 |
---|---|---|
committer | Dave Airlie <[email protected]> | 2017-11-10 08:39:36 +1000 |
commit | 06993e4ee350b9c2ab1e3ee7686878add3900d39 (patch) | |
tree | 354a8654e8f0ef19dd201b469b922f937e0f8085 | |
parent | 9e62654d4b47adfd5bdd60389dee17fdd17dba73 (diff) |
r600: add support for hw atomic counters. (v3)
This adds support for the evergreen/cayman atomic counters.
These are implemented using GDS append/consume counters. The values
for each counter are loaded before drawing and saved after each draw
using special CP packets.
v2: move hw atomic assignment into the driver.
v3: fix messed-up caps (Gert Wollny), only store ranges in the driver,
drop buffers.
Signed-off-by: Dave Airlie <[email protected]>
Acked-by: Nicolai Hähnle <[email protected]>
Tested-by: Gert Wollny <[email protected]>
-rw-r--r-- | src/gallium/drivers/r600/evergreen_state.c | 159 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_pipe.c | 15 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_pipe.h | 22 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_shader.c | 239 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_shader.h | 19 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_state_common.c | 46 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600d_common.h | 2 |
7 files changed, 480 insertions, 22 deletions
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 131778dea9f..eb8b139910d 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -3717,6 +3717,38 @@ static void evergreen_set_tess_state(struct pipe_context *ctx, rctx->tess_state_dirty = true; } +static void evergreen_set_hw_atomic_buffers(struct pipe_context *ctx, + unsigned start_slot, + unsigned count, + const struct pipe_shader_buffer *buffers) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_atomic_buffer_state *astate; + int i, idx; + + astate = &rctx->atomic_buffer_state; + + /* we'd probably like to expand this to 8 later so put the logic in */ + for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) { + const struct pipe_shader_buffer *buf; + struct pipe_shader_buffer *abuf; + + abuf = &astate->buffer[i]; + + if (!buffers || !buffers[idx].buffer) { + pipe_resource_reference(&abuf->buffer, NULL); + astate->enabled_mask &= ~(1 << i); + continue; + } + buf = &buffers[idx]; + + pipe_resource_reference(&abuf->buffer, buf->buffer); + abuf->buffer_offset = buf->buffer_offset; + abuf->buffer_size = buf->buffer_size; + astate->enabled_mask |= (1 << i); + } +} + void evergreen_init_state_functions(struct r600_context *rctx) { unsigned id = 1; @@ -3802,6 +3834,7 @@ void evergreen_init_state_functions(struct r600_context *rctx) rctx->b.b.set_polygon_stipple = evergreen_set_polygon_stipple; rctx->b.b.set_min_samples = evergreen_set_min_samples; rctx->b.b.set_tess_state = evergreen_set_tess_state; + rctx->b.b.set_hw_atomic_buffers = evergreen_set_hw_atomic_buffers; if (rctx->b.chip_class == EVERGREEN) rctx->b.b.get_sample_position = evergreen_get_sample_position; else @@ -4108,3 +4141,129 @@ void eg_trace_emit(struct r600_context *rctx) radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, AC_ENCODE_TRACE_POINT(rctx->trace_id)); } + +bool 
evergreen_emit_atomic_buffer_setup(struct r600_context *rctx, + struct r600_shader_atomic *combined_atomics, + uint8_t *atomic_used_mask_p) +{ + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; + struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state; + unsigned pkt_flags = 0; + uint8_t atomic_used_mask = 0; + int i, j, k; + + for (i = 0; i < EG_NUM_HW_STAGES; i++) { + uint8_t num_atomic_stage; + struct r600_pipe_shader *pshader; + + pshader = rctx->hw_shader_stages[i].shader; + if (!pshader) + continue; + + num_atomic_stage = pshader->shader.nhwatomic_ranges; + if (!num_atomic_stage) + continue; + + for (j = 0; j < num_atomic_stage; j++) { + struct r600_shader_atomic *atomic = &pshader->shader.atomics[j]; + int natomics = atomic->end - atomic->start + 1; + + for (k = 0; k < natomics; k++) { + /* seen this in a previous stage */ + if (atomic_used_mask & (1u << (atomic->hw_idx + k))) + continue; + + combined_atomics[atomic->hw_idx + k].hw_idx = atomic->hw_idx + k; + combined_atomics[atomic->hw_idx + k].buffer_id = atomic->buffer_id; + combined_atomics[atomic->hw_idx + k].start = atomic->start + k; + combined_atomics[atomic->hw_idx + k].end = combined_atomics[atomic->hw_idx + k].start + 1; + atomic_used_mask |= (1u << (atomic->hw_idx + k)); + } + } + } + + uint32_t mask = atomic_used_mask; + while (mask) { + unsigned atomic_index = u_bit_scan(&mask); + struct r600_shader_atomic *atomic = &combined_atomics[atomic_index]; + struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer); + assert(resource); + unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, + resource, + RADEON_USAGE_READ, + RADEON_PRIO_SHADER_RW_BUFFER); + uint64_t dst_offset = resource->gpu_address + (atomic->start * 4); + uint32_t base_reg_0 = R_02872C_GDS_APPEND_COUNT_0; + + uint32_t reg_val = (base_reg_0 + atomic->hw_idx * 4 - EVERGREEN_CONTEXT_REG_OFFSET) >> 2; + + radeon_emit(cs, PKT3(PKT3_SET_APPEND_CNT, 2, 0) | pkt_flags); + radeon_emit(cs, 
(reg_val << 16) | 0x3); + radeon_emit(cs, dst_offset & 0xfffffffc); + radeon_emit(cs, (dst_offset >> 32) & 0xff); + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, reloc); + } + *atomic_used_mask_p = atomic_used_mask; + return true; +} + +void evergreen_emit_atomic_buffer_save(struct r600_context *rctx, + struct r600_shader_atomic *combined_atomics, + uint8_t *atomic_used_mask_p) +{ + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; + struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state; + uint32_t pkt_flags = 0; + uint32_t event = EVENT_TYPE_PS_DONE; + uint32_t mask = astate->enabled_mask; + uint64_t dst_offset; + unsigned reloc; + + mask = *atomic_used_mask_p; + while (mask) { + unsigned atomic_index = u_bit_scan(&mask); + struct r600_shader_atomic *atomic = &combined_atomics[atomic_index]; + struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer); + assert(resource); + + uint32_t base_reg_0 = R_02872C_GDS_APPEND_COUNT_0; + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, + resource, + RADEON_USAGE_WRITE, + RADEON_PRIO_SHADER_RW_BUFFER); + dst_offset = resource->gpu_address + (atomic->start * 4); + uint32_t reg_val = (base_reg_0 + atomic->hw_idx * 4) >> 2; + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOS, 3, 0) | pkt_flags); + radeon_emit(cs, EVENT_TYPE(event) | EVENT_INDEX(6)); + radeon_emit(cs, (dst_offset) & 0xffffffff); + radeon_emit(cs, (0 << 29) | ((dst_offset >> 32) & 0xff)); + radeon_emit(cs, reg_val); + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, reloc); + } + ++rctx->append_fence_id; + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, + r600_resource(rctx->append_fence), + RADEON_USAGE_READWRITE, + RADEON_PRIO_SHADER_RW_BUFFER); + dst_offset = r600_resource(rctx->append_fence)->gpu_address; + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOS, 3, 0) | pkt_flags); + radeon_emit(cs, EVENT_TYPE(event) | EVENT_INDEX(6)); + radeon_emit(cs, dst_offset & 0xffffffff); + radeon_emit(cs, (2 
<< 29) | ((dst_offset >> 32) & 0xff)); + radeon_emit(cs, rctx->append_fence_id); + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, reloc); + + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0) | pkt_flags); + radeon_emit(cs, WAIT_REG_MEM_GEQUAL | WAIT_REG_MEM_MEMORY | (1 << 8)); + radeon_emit(cs, dst_offset & 0xffffffff); + radeon_emit(cs, ((dst_offset >> 32) & 0xff)); + radeon_emit(cs, rctx->append_fence_id); + radeon_emit(cs, 0xffffffff); + radeon_emit(cs, 0xa); + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, reloc); +} diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 0b815d4aef6..96017dc0477 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -74,6 +74,8 @@ static void r600_destroy_context(struct pipe_context *context) r600_resource_reference(&rctx->dummy_cmask, NULL); r600_resource_reference(&rctx->dummy_fmask, NULL); + if (rctx->append_fence) + pipe_resource_reference((struct pipe_resource**)&rctx->append_fence, NULL); for (sh = 0; sh < PIPE_SHADER_TYPES; sh++) { rctx->b.b.set_constant_buffer(&rctx->b.b, sh, R600_BUFFER_INFO_CONST_BUFFER, NULL); free(rctx->driver_consts[sh].constants); @@ -186,6 +188,9 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, rctx->b.family == CHIP_CAICOS || rctx->b.family == CHIP_CAYMAN || rctx->b.family == CHIP_ARUBA); + + rctx->append_fence = pipe_buffer_create(rctx->b.b.screen, PIPE_BIND_CUSTOM, + PIPE_USAGE_DEFAULT, 32); break; default: R600_ERR("Unsupported chip class %d.\n", rctx->b.chip_class); @@ -605,8 +610,17 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: + return 0; case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: + if (rscreen->b.family >= CHIP_CEDAR && rscreen->has_atomics) + return 8; + return 0; case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: 
+ /* having to allocate the atomics out amongst shaders stages is messy, + so give compute 8 buffers and all the others one */ + if (rscreen->b.family >= CHIP_CEDAR && rscreen->has_atomics) { + return EG_MAX_ATOMIC_BUFFERS; + } return 0; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: /* due to a bug in the shader compiler, some loops hang @@ -741,6 +755,7 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws, /* Create the auxiliary context. This must be done last. */ rscreen->b.aux_context = rscreen->b.b.context_create(&rscreen->b.b, NULL, 0); + rscreen->has_atomics = rscreen->b.info.drm_minor >= 44; #if 0 /* This is for testing whether aux_context and buffer clearing work correctly. */ struct pipe_resource templ = {}; diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 0d2551ac566..3dae56e3054 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -64,6 +64,8 @@ #define R600_MAX_DRIVER_CONST_BUFFERS 3 #define R600_MAX_CONST_BUFFERS (R600_MAX_USER_CONST_BUFFERS + R600_MAX_DRIVER_CONST_BUFFERS) +#define EG_MAX_ATOMIC_BUFFERS 8 + /* start driver buffers after user buffers */ #define R600_BUFFER_INFO_CONST_BUFFER (R600_MAX_USER_CONST_BUFFERS) #define R600_UCP_SIZE (4*4*8) @@ -247,6 +249,7 @@ struct r600_screen { struct r600_common_screen b; bool has_msaa; bool has_compressed_msaa_texturing; + bool has_atomics; /*for compute global memory binding, we allocate stuff here, instead of * buffers. 
@@ -416,6 +419,12 @@ struct r600_shader_state { struct r600_pipe_shader *shader; }; +struct r600_atomic_buffer_state { + uint32_t enabled_mask; + uint32_t dirty_mask; + struct pipe_shader_buffer buffer[EG_MAX_ATOMIC_BUFFERS]; +}; + struct r600_context { struct r600_common_context b; struct r600_screen *screen; @@ -470,6 +479,7 @@ struct r600_context { struct r600_config_state config_state; struct r600_stencil_ref_state stencil_ref; struct r600_vgt_state vgt_state; + struct r600_atomic_buffer_state atomic_buffer_state; /* Shaders and shader resources. */ struct r600_cso_state vertex_fetch_shader; struct r600_shader_state hw_shader_stages[EG_NUM_HW_STAGES]; @@ -531,6 +541,9 @@ struct r600_context { struct r600_resource *last_trace_buf; struct r600_resource *trace_buf; unsigned trace_id; + + struct pipe_resource *append_fence; + uint32_t append_fence_id; }; static inline void r600_emit_command_buffer(struct radeon_winsys_cs *cs, @@ -959,4 +972,13 @@ unsigned r600_conv_prim_to_gs_out(unsigned mode); void eg_trace_emit(struct r600_context *rctx); void eg_dump_debug_state(struct pipe_context *ctx, FILE *f, unsigned flags); + +struct r600_shader_atomic; +bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx, + struct r600_shader_atomic *combined_atomics, + uint8_t *atomic_used_mask_p); +void evergreen_emit_atomic_buffer_save(struct r600_context *rctx, + struct r600_shader_atomic *combined_atomics, + uint8_t *atomic_used_mask_p); + #endif diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 188fbc9d47d..af866c4bddb 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -194,6 +194,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx, /* disable SB for shaders using doubles */ use_sb &= !shader->shader.uses_doubles; + use_sb &= !shader->shader.uses_atomics; + /* Check if the bytecode has already been built. 
*/ if (!shader->shader.bc.bytecode) { r = r600_bytecode_build(&shader->shader.bc); @@ -407,6 +409,7 @@ static int tgsi_is_supported(struct r600_shader_ctx *ctx) if (i->Src[j].Register.Dimension) { switch (i->Src[j].Register.File) { case TGSI_FILE_CONSTANT: + case TGSI_FILE_HW_ATOMIC: break; case TGSI_FILE_INPUT: if (ctx->type == PIPE_SHADER_GEOMETRY || @@ -966,6 +969,17 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx) case TGSI_FILE_ADDRESS: break; + case TGSI_FILE_HW_ATOMIC: + i = ctx->shader->nhwatomic_ranges; + ctx->shader->atomics[i].start = d->Range.First; + ctx->shader->atomics[i].end = d->Range.Last; + ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic; + ctx->shader->atomics[i].array_id = d->Array.ArrayID; + ctx->shader->atomics[i].buffer_id = d->Dim.Index2D; + ctx->shader->nhwatomic_ranges++; + ctx->shader->nhwatomic += count; + break; + case TGSI_FILE_SYSTEM_VALUE: if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK || d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID || @@ -2946,6 +2960,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, shader->indirect_files = ctx.info.indirect_files; shader->uses_doubles = ctx.info.uses_doubles; + shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC]; shader->nsys_inputs = 0; indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER)); @@ -2959,6 +2974,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, shader->vs_as_gs_a = key.vs.as_gs_a; shader->vs_as_es = key.vs.as_es; shader->vs_as_ls = key.vs.as_ls; + shader->atomic_base = key.vs.first_atomic_counter; if (shader->vs_as_es) ring_outputs = true; if (shader->vs_as_ls) @@ -2966,20 +2982,24 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, break; case PIPE_SHADER_GEOMETRY: ring_outputs = true; + shader->atomic_base = key.gs.first_atomic_counter; break; case PIPE_SHADER_TESS_CTRL: shader->tcs_prim_mode = key.tcs.prim_mode; + shader->atomic_base = 
key.tcs.first_atomic_counter; lds_outputs = true; lds_inputs = true; break; case PIPE_SHADER_TESS_EVAL: shader->tes_as_es = key.tes.as_es; + shader->atomic_base = key.tes.first_atomic_counter; lds_inputs = true; if (shader->tes_as_es) ring_outputs = true; break; case PIPE_SHADER_FRAGMENT: shader->two_side = key.ps.color_two_side; + shader->atomic_base = key.ps.first_atomic_counter; break; default: break; @@ -7533,6 +7553,181 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) return 0; } +static int find_hw_atomic_counter(struct r600_shader_ctx *ctx, + struct tgsi_full_src_register *src) +{ + int i; + + if (src->Register.Indirect) { + for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) { + if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id) + return ctx->shader->atomics[i].hw_idx; + } + } else { + uint32_t index = src->Register.Index; + for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) { + if (ctx->shader->atomics[i].buffer_id != src->Dimension.Index) + continue; + if (index > ctx->shader->atomics[i].end) + continue; + if (index < ctx->shader->atomics[i].start) + continue; + uint32_t offset = (index - ctx->shader->atomics[i].start); + return ctx->shader->atomics[i].hw_idx + offset; + } + } + assert(0); + return -1; +} + + +static int tgsi_load_gds(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + int r; + struct r600_bytecode_gds gds; + int uav_id = 0; + int uav_index_mode = 0; + + uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]); + + if (inst->Src[0].Register.Indirect) + uav_index_mode = 2; + + memset(&gds, 0, sizeof(struct r600_bytecode_gds)); + gds.op = FETCH_OP_GDS_READ_RET; + gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; + gds.uav_id = uav_id; + gds.uav_index_mode = uav_index_mode; + gds.src_gpr = ctx->temp_reg; + gds.src_sel_x = 4; + gds.src_sel_y = 4; + gds.src_sel_z = 4; + gds.dst_sel_x = 0; + gds.dst_sel_y = 7; + gds.dst_sel_z = 7; + 
gds.dst_sel_w = 7; + gds.src_gpr2 = ctx->temp_reg; + gds.alloc_consume = 1; + r = r600_bytecode_add_gds(ctx->bc, &gds); + if (r) + return r; + + ctx->bc->cf_last->vpm = 1; + return 0; +} + +static int tgsi_load(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC) + return tgsi_load_gds(ctx); + return 0; +} + +static int get_gds_op(int opcode) +{ + switch (opcode) { + case TGSI_OPCODE_ATOMUADD: + return FETCH_OP_GDS_ADD_RET; + case TGSI_OPCODE_ATOMAND: + return FETCH_OP_GDS_AND_RET; + case TGSI_OPCODE_ATOMOR: + return FETCH_OP_GDS_OR_RET; + case TGSI_OPCODE_ATOMXOR: + return FETCH_OP_GDS_XOR_RET; + case TGSI_OPCODE_ATOMUMIN: + return FETCH_OP_GDS_MIN_UINT_RET; + case TGSI_OPCODE_ATOMUMAX: + return FETCH_OP_GDS_MAX_UINT_RET; + case TGSI_OPCODE_ATOMXCHG: + return FETCH_OP_GDS_XCHG_RET; + case TGSI_OPCODE_ATOMCAS: + return FETCH_OP_GDS_CMP_XCHG_RET; + default: + return -1; + } +} + +static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_gds gds; + struct r600_bytecode_alu alu; + int gds_op = get_gds_op(inst->Instruction.Opcode); + int r; + int uav_id = 0; + int uav_index_mode = 0; + + if (gds_op == -1) { + fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode); + return -1; + } + + uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]); + + if (inst->Src[0].Register.Indirect) + uav_index_mode = 2; + + if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) { + int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]); + int abs_value = abs(value); + if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET) + gds_op = FETCH_OP_GDS_SUB_RET; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = 0; + alu.src[0].sel = 
V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = abs_value; + alu.last = 1; + alu.dst.write = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } else { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = 0; + r600_bytecode_src(&alu.src[0], &ctx->src[2], 0); + alu.last = 1; + alu.dst.write = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + memset(&gds, 0, sizeof(struct r600_bytecode_gds)); + gds.op = gds_op; + gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; + gds.uav_id = uav_id; + gds.uav_index_mode = uav_index_mode; + gds.src_gpr = ctx->temp_reg; + gds.src_gpr2 = ctx->temp_reg; + gds.src_sel_x = 4; + gds.src_sel_y = 0; + gds.src_sel_z = 4; + gds.dst_sel_x = 0; + gds.dst_sel_y = 7; + gds.dst_sel_z = 7; + gds.dst_sel_w = 7; + gds.alloc_consume = 1; + r = r600_bytecode_add_gds(ctx->bc, &gds); + if (r) + return r; + ctx->bc->cf_last->vpm = 1; + return 0; +} + +static int tgsi_atomic_op(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC) + return tgsi_atomic_op_gds(ctx); + return 0; +} + static int tgsi_lrp(struct r600_shader_ctx *ctx) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; @@ -9190,22 +9385,22 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp}, [TGSI_OPCODE_IABS] = { 0, tgsi_iabs}, [TGSI_OPCODE_ISSG] = { 0, tgsi_issg}, - [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load}, [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported}, [163] = { ALU_OP0_NOP, tgsi_unsupported}, [164] = { ALU_OP0_NOP, tgsi_unsupported}, [165] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, - 
[TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op}, [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex}, [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex}, @@ -9413,22 +9608,22 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp}, [TGSI_OPCODE_IABS] = { 0, tgsi_iabs}, [TGSI_OPCODE_ISSG] = { 0, tgsi_issg}, - [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load}, [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported}, [163] = { ALU_OP0_NOP, tgsi_unsupported}, [164] = { ALU_OP0_NOP, tgsi_unsupported}, [165] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_BARRIER] = { 
ALU_OP0_GROUP_BARRIER, tgsi_barrier}, - [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op}, + [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op}, [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex}, [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex}, diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h index 9032d508383..3fecda4c800 100644 --- a/src/gallium/drivers/r600/r600_shader.h +++ b/src/gallium/drivers/r600/r600_shader.h @@ -56,15 +56,25 @@ struct r600_shader_io { int ring_offset; }; +struct r600_shader_atomic { + unsigned start, end; + unsigned buffer_id; + unsigned hw_idx; + unsigned array_id; +}; + struct r600_shader { unsigned processor_type; struct r600_bytecode bc; unsigned ninput; unsigned noutput; + 
unsigned nhwatomic; unsigned nlds; unsigned nsys_inputs; struct r600_shader_io input[64]; struct r600_shader_io output[64]; + struct r600_shader_atomic atomics[8]; + unsigned nhwatomic_ranges; boolean uses_kill; boolean fs_write_all; boolean two_side; @@ -105,26 +115,35 @@ struct r600_shader { struct r600_shader_array * arrays; boolean uses_doubles; + boolean uses_atomics; + uint8_t atomic_base; }; union r600_shader_key { struct { unsigned nr_cbufs:4; + unsigned first_atomic_counter:4; unsigned color_two_side:1; unsigned alpha_to_one:1; } ps; struct { unsigned prim_id_out:8; + unsigned first_atomic_counter:4; unsigned as_es:1; /* export shader */ unsigned as_ls:1; /* local shader */ unsigned as_gs_a:1; } vs; struct { + unsigned first_atomic_counter:4; unsigned as_es:1; } tes; struct { + unsigned first_atomic_counter:4; unsigned prim_mode:3; } tcs; + struct { + unsigned first_atomic_counter:4; + } gs; }; struct r600_shader_array { diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index 0e8c5d666ef..750fd411baf 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -698,6 +698,38 @@ static void r600_update_compressed_colortex_mask(struct r600_samplerview_state * } } +static int r600_get_hw_atomic_count(const struct pipe_context *ctx, + enum pipe_shader_type shader) +{ + const struct r600_context *rctx = (struct r600_context *)ctx; + int value = 0; + switch (shader) { + case PIPE_SHADER_FRAGMENT: + case PIPE_SHADER_COMPUTE: + default: + break; + case PIPE_SHADER_VERTEX: + value = rctx->ps_shader->info.file_count[TGSI_FILE_HW_ATOMIC]; + break; + case PIPE_SHADER_GEOMETRY: + value = rctx->ps_shader->info.file_count[TGSI_FILE_HW_ATOMIC] + + rctx->vs_shader->info.file_count[TGSI_FILE_HW_ATOMIC]; + break; + case PIPE_SHADER_TESS_EVAL: + value = rctx->ps_shader->info.file_count[TGSI_FILE_HW_ATOMIC] + + rctx->vs_shader->info.file_count[TGSI_FILE_HW_ATOMIC] + + 
(rctx->gs_shader ? rctx->gs_shader->info.file_count[TGSI_FILE_HW_ATOMIC] : 0); + break; + case PIPE_SHADER_TESS_CTRL: + value = rctx->ps_shader->info.file_count[TGSI_FILE_HW_ATOMIC] + + rctx->vs_shader->info.file_count[TGSI_FILE_HW_ATOMIC] + + (rctx->gs_shader ? rctx->gs_shader->info.file_count[TGSI_FILE_HW_ATOMIC] : 0) + + rctx->tes_shader->info.file_count[TGSI_FILE_HW_ATOMIC]; + break; + } + return value; +} + /* Compute the key for the hw shader variant */ static inline void r600_shader_selector_key(const struct pipe_context *ctx, const struct r600_pipe_shader_selector *sel, @@ -716,11 +748,14 @@ static inline void r600_shader_selector_key(const struct pipe_context *ctx, key->vs.as_gs_a = true; key->vs.prim_id_out = rctx->ps_shader->current->shader.input[rctx->ps_shader->current->shader.ps_prim_id_input].spi_sid; } + key->vs.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_VERTEX); break; } case PIPE_SHADER_GEOMETRY: + key->gs.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_GEOMETRY); break; case PIPE_SHADER_FRAGMENT: { + key->ps.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_FRAGMENT); key->ps.color_two_side = rctx->rasterizer && rctx->rasterizer->two_side; key->ps.alpha_to_one = rctx->alpha_to_one && rctx->rasterizer && rctx->rasterizer->multisample_enable && @@ -733,9 +768,11 @@ static inline void r600_shader_selector_key(const struct pipe_context *ctx, } case PIPE_SHADER_TESS_EVAL: key->tes.as_es = (rctx->gs_shader != NULL); + key->tes.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_TESS_EVAL); break; case PIPE_SHADER_TESS_CTRL: key->tcs.prim_mode = rctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; + key->tcs.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_TESS_CTRL); break; default: assert(0); @@ -1700,6 +1737,8 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info unsigned num_patches, dirty_tex_counter, index_offset = 0; unsigned 
index_size = info->index_size; int index_bias; + struct r600_shader_atomic combined_atomics[8]; + uint8_t atomic_used_mask; if (!info->indirect && !info->count && (index_size || !info->count_from_stream_output)) { return; @@ -1739,6 +1778,9 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info : (rctx->tes_shader)? rctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] : info->mode; + if (rctx->b.chip_class >= EVERGREEN) + evergreen_emit_atomic_buffer_setup(rctx, combined_atomics, &atomic_used_mask); + if (index_size) { index_offset += info->start * index_size; @@ -2019,6 +2061,10 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT)); } + + if (rctx->b.chip_class >= EVERGREEN) + evergreen_emit_atomic_buffer_save(rctx, combined_atomics, &atomic_used_mask); + if (rctx->trace_buf) eg_trace_emit(rctx); diff --git a/src/gallium/drivers/r600/r600d_common.h b/src/gallium/drivers/r600/r600d_common.h index ed1d46076c0..b06f90f8edd 100644 --- a/src/gallium/drivers/r600/r600d_common.h +++ b/src/gallium/drivers/r600/r600d_common.h @@ -51,6 +51,8 @@ #define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x) & 0x3) << 8) #define PKT3_WAIT_REG_MEM 0x3C #define WAIT_REG_MEM_EQUAL 3 +#define WAIT_REG_MEM_GEQUAL 5 +#define WAIT_REG_MEM_MEMORY (1 << 4) #define WAIT_REG_MEM_MEM_SPACE(x) (((unsigned)(x) & 0x3) << 4) #define PKT3_COPY_DATA 0x40 #define COPY_DATA_SRC_SEL(x) ((x) & 0xf) |