diff options
Diffstat (limited to 'src/gallium/drivers/radeonsi/radeonsi_shader.c')
-rw-r--r-- | src/gallium/drivers/radeonsi/radeonsi_shader.c | 279 |
1 files changed, 269 insertions, 10 deletions
diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c b/src/gallium/drivers/radeonsi/radeonsi_shader.c index 335cd79b1a8..80dd773c10c 100644 --- a/src/gallium/drivers/radeonsi/radeonsi_shader.c +++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c @@ -34,6 +34,7 @@ #include "gallivm/lp_bld_logic.h" #include "gallivm/lp_bld_tgsi.h" #include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_flow.h" #include "radeon_llvm.h" #include "radeon_llvm_emit.h" #include "util/u_memory.h" @@ -59,6 +60,11 @@ struct si_shader_context struct tgsi_token * tokens; struct si_pipe_shader *shader; unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */ + int param_streamout_config; + int param_streamout_write_index; + int param_streamout_offset[4]; + int param_vertex_id; + int param_instance_id; LLVMValueRef const_md; LLVMValueRef const_resource; #if HAVE_LLVM >= 0x0304 @@ -67,6 +73,7 @@ struct si_shader_context LLVMValueRef *constants; LLVMValueRef *resources; LLVMValueRef *samplers; + LLVMValueRef so_buffers[4]; }; static struct si_shader_context * si_shader_context( @@ -119,9 +126,12 @@ static LLVMValueRef get_instance_index( struct radeon_llvm_context * radeon_bld, unsigned divisor) { + struct si_shader_context *si_shader_ctx = + si_shader_context(&radeon_bld->soa.bld_base); struct gallivm_state * gallivm = radeon_bld->soa.bld_base.base.gallivm; - LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_INSTANCE_ID); + LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn, + si_shader_ctx->param_instance_id); result = LLVMBuildAdd(gallivm->builder, result, LLVMGetParam( radeon_bld->main_fn, SI_PARAM_START_INSTANCE), ""); @@ -168,7 +178,8 @@ static void declare_input_vs( } else { /* Load the buffer index, which is always stored in VGPR0 * for Vertex Shaders */ - buffer_index = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_ID); + buffer_index = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + si_shader_ctx->param_vertex_id); } vec4_type = LLVMVectorType(base->elem_type, 4); @@ -397,7 +408,8 @@ static void declare_system_value( unsigned index, const struct tgsi_full_declaration *decl) { - + struct si_shader_context *si_shader_ctx = + si_shader_context(&radeon_bld->soa.bld_base); LLVMValueRef value = 0; switch (decl->Semantic.Name) { @@ -406,7 +418,8 @@ static void declare_system_value( break; case TGSI_SEMANTIC_VERTEXID: - value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_VERTEX_ID); + value = LLVMGetParam(radeon_bld->main_fn, + si_shader_ctx->param_vertex_id); break; default: @@ -651,6 +664,206 @@ static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base, } } +static void si_dump_streamout(struct pipe_stream_output_info *so) +{ + unsigned i; + + if (so->num_outputs) + fprintf(stderr, "STREAMOUT\n"); + + for (i = 0; i < so->num_outputs; i++) { + unsigned mask = ((1 << so->output[i].num_components) - 1) << + so->output[i].start_component; + fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", + i, so->output[i].output_buffer, + so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, + so->output[i].register_index, + mask & 1 ? "x" : "", + mask & 2 ? "y" : "", + mask & 4 ? "z" : "", + mask & 8 ? "w" : ""); + } +} + +/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4. + * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2), + * or v4i32 (num_channels=3,4). */ +static void build_tbuffer_store(struct si_shader_context *shader, + LLVMValueRef rsrc, + LLVMValueRef vdata, + unsigned num_channels, + LLVMValueRef vaddr, + LLVMValueRef soffset, + unsigned inst_offset, + unsigned dfmt, + unsigned nfmt, + unsigned offen, + unsigned idxen, + unsigned glc, + unsigned slc, + unsigned tfe) +{ + struct gallivm_state *gallivm = &shader->radeon_bld.gallivm; + LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); + LLVMValueRef args[] = { + rsrc, + vdata, + LLVMConstInt(i32, num_channels, 0), + vaddr, + soffset, + LLVMConstInt(i32, inst_offset, 0), + LLVMConstInt(i32, dfmt, 0), + LLVMConstInt(i32, nfmt, 0), + LLVMConstInt(i32, offen, 0), + LLVMConstInt(i32, idxen, 0), + LLVMConstInt(i32, glc, 0), + LLVMConstInt(i32, slc, 0), + LLVMConstInt(i32, tfe, 0) + }; + + /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */ + unsigned func = CLAMP(num_channels, 1, 3) - 1; + const char *types[] = {"i32", "v2i32", "v4i32"}; + char name[256]; + snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]); + + lp_build_intrinsic(gallivm->builder, name, + LLVMVoidTypeInContext(gallivm->context), + args, Elements(args)); +} + +static void build_streamout_store(struct si_shader_context *shader, + LLVMValueRef rsrc, + LLVMValueRef vdata, + unsigned num_channels, + LLVMValueRef vaddr, + LLVMValueRef soffset, + unsigned inst_offset) +{ + static unsigned dfmt[] = { + V_008F0C_BUF_DATA_FORMAT_32, + V_008F0C_BUF_DATA_FORMAT_32_32, + V_008F0C_BUF_DATA_FORMAT_32_32_32, + V_008F0C_BUF_DATA_FORMAT_32_32_32_32 + }; + assert(num_channels >= 1 && num_channels <= 4); + + build_tbuffer_store(shader, rsrc, vdata, num_channels, vaddr, soffset, + inst_offset, dfmt[num_channels-1], + V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0); +} + +/* On SI, the vertex shader is responsible for writing streamout data + * to buffers. */ +static void si_llvm_emit_streamout(struct si_shader_context *shader) +{ + struct pipe_stream_output_info *so = &shader->shader->selector->so; + struct gallivm_state *gallivm = &shader->radeon_bld.gallivm; + LLVMBuilderRef builder = gallivm->builder; + int i, j; + struct lp_build_if_state if_ctx; + + LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); + + LLVMValueRef so_param = + LLVMGetParam(shader->radeon_bld.main_fn, + shader->param_streamout_config); + + /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ + LLVMValueRef so_vtx_count = + LLVMBuildAnd(builder, + LLVMBuildLShr(builder, so_param, + LLVMConstInt(i32, 16, 0), ""), + LLVMConstInt(i32, 127, 0), ""); + + LLVMValueRef tid = build_intrinsic(builder, "llvm.SI.tid", i32, + NULL, 0, LLVMReadNoneAttribute); + + /* can_emit = tid < so_vtx_count; */ + LLVMValueRef can_emit = + LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); + + /* Emit the streamout code conditionally. This actually avoids + * out-of-bounds buffer access. The hw tells us via the SGPR + * (so_vtx_count) which threads are allowed to emit streamout data. */ + lp_build_if(&if_ctx, gallivm, can_emit); + { + /* The buffer offset is computed as follows: + * ByteOffset = streamout_offset[buffer_id]*4 + + * (streamout_write_index + thread_id)*stride[buffer_id] + + * attrib_offset + */ + + LLVMValueRef so_write_index = + LLVMGetParam(shader->radeon_bld.main_fn, + shader->param_streamout_write_index); + + /* Compute (streamout_write_index + thread_id). */ + so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); + + /* Compute the write offset for each enabled buffer. */ + LLVMValueRef so_write_offset[4] = {}; + for (i = 0; i < 4; i++) { + if (!so->stride[i]) + continue; + + LLVMValueRef so_offset = LLVMGetParam(shader->radeon_bld.main_fn, + shader->param_streamout_offset[i]); + so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(i32, 4, 0), ""); + + so_write_offset[i] = LLVMBuildMul(builder, so_write_index, + LLVMConstInt(i32, so->stride[i]*4, 0), ""); + so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, ""); + } + + LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS] = shader->radeon_bld.soa.outputs; + + /* Write streamout data. */ + for (i = 0; i < so->num_outputs; i++) { + unsigned buf_idx = so->output[i].output_buffer; + unsigned reg = so->output[i].register_index; + unsigned start = so->output[i].start_component; + unsigned num_comps = so->output[i].num_components; + LLVMValueRef out[4]; + + assert(num_comps && num_comps <= 4); + if (!num_comps || num_comps > 4) + continue; + + /* Load the output as int. */ + for (j = 0; j < num_comps; j++) { + out[j] = LLVMBuildLoad(builder, outputs[reg][start+j], ""); + out[j] = LLVMBuildBitCast(builder, out[j], i32, ""); + } + + /* Pack the output. */ + LLVMValueRef vdata = NULL; + + switch (num_comps) { + case 1: /* as i32 */ + vdata = out[0]; + break; + case 2: /* as v2i32 */ + case 3: /* as v4i32 (aligned to 4) */ + case 4: /* as v4i32 */ + vdata = LLVMGetUndef(LLVMVectorType(i32, util_next_power_of_two(num_comps))); + for (j = 0; j < num_comps; j++) { + vdata = LLVMBuildInsertElement(builder, vdata, out[j], + LLVMConstInt(i32, j, 0), ""); + } + break; + } + + build_streamout_store(shader, shader->so_buffers[buf_idx], + vdata, num_comps, + so_write_offset[buf_idx], + LLVMConstInt(i32, 0, 0), + so->output[i].dst_offset*4); + } + } + lp_build_endif(&if_ctx); +} + /* XXX: This is partially implemented for VS only at this point. It is not complete */ static void si_llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) { @@ -669,6 +882,10 @@ static void si_llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) int depth_index = -1, stencil_index = -1; int i; + if (si_shader_ctx->shader->selector->so.num_outputs) { + si_llvm_emit_streamout(si_shader_ctx); + } + while (!tgsi_parse_end_of_tokens(parse)) { struct tgsi_full_declaration *d = &parse->FullToken.FullDeclaration; @@ -1366,12 +1583,30 @@ static void create_function(struct si_shader_context *si_shader_ctx) params[SI_PARAM_VERTEX_BUFFER] = params[SI_PARAM_CONST]; params[SI_PARAM_SO_BUFFER] = params[SI_PARAM_CONST]; params[SI_PARAM_START_INSTANCE] = i32; - last_sgpr = SI_PARAM_START_INSTANCE; - params[SI_PARAM_VERTEX_ID] = i32; - params[SI_PARAM_DUMMY_0] = i32; - params[SI_PARAM_DUMMY_1] = i32; - params[SI_PARAM_INSTANCE_ID] = i32; - num_params = SI_PARAM_INSTANCE_ID+1; + num_params = SI_PARAM_START_INSTANCE+1; + + /* The locations of the other parameters are assigned dynamically. */ + + /* Streamout SGPRs. */ + if (si_shader_ctx->shader->selector->so.num_outputs) { + params[si_shader_ctx->param_streamout_config = num_params++] = i32; + params[si_shader_ctx->param_streamout_write_index = num_params++] = i32; + } + /* A streamout buffer offset is loaded if the stride is non-zero. */ + for (i = 0; i < 4; i++) { + if (!si_shader_ctx->shader->selector->so.stride[i]) + continue; + + params[si_shader_ctx->param_streamout_offset[i] = num_params++] = i32; + } + + last_sgpr = num_params-1; + + /* VGPRs */ + params[si_shader_ctx->param_vertex_id = num_params++] = i32; + params[num_params++] = i32; /* unused*/ + params[num_params++] = i32; /* unused */ + params[si_shader_ctx->param_instance_id = num_params++] = i32; break; case TGSI_PROCESSOR_FRAGMENT: @@ -1492,6 +1727,28 @@ static void preload_samplers(struct si_shader_context *si_shader_ctx) } } +static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx) +{ + struct lp_build_tgsi_context * bld_base = &si_shader_ctx->radeon_bld.soa.bld_base; + struct gallivm_state * gallivm = bld_base->base.gallivm; + unsigned i; + + if (!si_shader_ctx->shader->selector->so.num_outputs) + return; + + LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + SI_PARAM_SO_BUFFER); + + /* Load the resources, we rely on the code sinking to do the rest */ + for (i = 0; i < 4; ++i) { + if (si_shader_ctx->shader->selector->so.stride[i]) { + LLVMValueRef offset = lp_build_const_int32(gallivm, i); + + si_shader_ctx->so_buffers[i] = build_indexed_load(si_shader_ctx, buf_ptr, offset); + } + } +} + int si_compile_llvm(struct r600_context *rctx, struct si_pipe_shader *shader, LLVMModuleRef mod) { @@ -1625,6 +1882,7 @@ int si_pipe_shader_create( create_function(&si_shader_ctx); preload_constants(&si_shader_ctx); preload_samplers(&si_shader_ctx); + preload_streamout_buffers(&si_shader_ctx); shader->shader.nr_cbufs = rctx->framebuffer.nr_cbufs; @@ -1632,6 +1890,7 @@ int si_pipe_shader_create( * conversion fails. */ if (dump) { tgsi_dump(sel->tokens, 0); + si_dump_streamout(&sel->so); } if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) { |