diff options
-rw-r--r-- | src/amd/common/ac_nir_to_llvm.c | 31 | ||||
-rw-r--r-- | src/amd/common/ac_nir_to_llvm.h | 4 | ||||
-rw-r--r-- | src/amd/vulkan/radv_cmd_buffer.c | 61 | ||||
-rw-r--r-- | src/amd/vulkan/radv_device.c | 40 | ||||
-rw-r--r-- | src/amd/vulkan/radv_private.h | 2 |
5 files changed, 87 insertions, 51 deletions
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index c8ea0da7740..dfe2732f78d 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -109,7 +109,7 @@ struct nir_to_llvm_context { LLVMValueRef hs_ring_tess_factor; LLVMValueRef prim_mask; - LLVMValueRef sample_positions; + LLVMValueRef sample_pos_offset; LLVMValueRef persp_sample, persp_center, persp_centroid; LLVMValueRef linear_sample, linear_center, linear_centroid; LLVMValueRef front_face; @@ -573,6 +573,7 @@ static void create_function(struct nir_to_llvm_context *ctx) ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL || + ctx->stage == MESA_SHADER_FRAGMENT || ctx->is_gs_copy_shader) need_ring_offsets = true; @@ -584,7 +585,7 @@ static void create_function(struct nir_to_llvm_context *ctx) need_push_constants = false; if (need_ring_offsets && !ctx->options->supports_spill) { - arg_types[arg_idx++] = const_array(ctx->v16i8, 8); /* address of rings */ + arg_types[arg_idx++] = const_array(ctx->v16i8, 16); /* address of rings */ } /* 1 for each descriptor set */ @@ -679,7 +680,7 @@ static void create_function(struct nir_to_llvm_context *ctx) arg_types[arg_idx++] = ctx->i32; // GS instance id break; case MESA_SHADER_FRAGMENT: - arg_types[arg_idx++] = const_array(ctx->f32, 32); /* sample positions */ + arg_types[arg_idx++] = ctx->i32; /* sample position offset */ user_sgpr_count = arg_idx; arg_types[arg_idx++] = ctx->i32; /* prim mask */ sgpr_count = arg_idx; @@ -735,7 +736,7 @@ static void create_function(struct nir_to_llvm_context *ctx) LLVMPointerType(ctx->i8, CONST_ADDR_SPACE), NULL, 0, AC_FUNC_ATTR_READNONE); ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, ctx->ring_offsets, - const_array(ctx->v16i8, 8), ""); + const_array(ctx->v16i8, 16), ""); } else ctx->ring_offsets = LLVMGetParam(ctx->main_function, arg_idx++); } @@ -844,9 +845,9 @@ static void create_function(struct nir_to_llvm_context *ctx) ctx->gs_invocation_id = LLVMGetParam(ctx->main_function, arg_idx++); break; case MESA_SHADER_FRAGMENT: - set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS, user_sgpr_idx, 2); - user_sgpr_idx += 2; - ctx->sample_positions = LLVMGetParam(ctx->main_function, arg_idx++); + set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS_OFFSET, user_sgpr_idx, 1); + user_sgpr_idx += 1; + ctx->sample_pos_offset = LLVMGetParam(ctx->main_function, arg_idx++); ctx->prim_mask = LLVMGetParam(ctx->main_function, arg_idx++); ctx->persp_sample = LLVMGetParam(ctx->main_function, arg_idx++); ctx->persp_center = LLVMGetParam(ctx->main_function, arg_idx++); @@ -3518,15 +3519,17 @@ static LLVMValueRef lookup_interp_param(struct nir_to_llvm_context *ctx, static LLVMValueRef load_sample_position(struct nir_to_llvm_context *ctx, LLVMValueRef sample_id) { - /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ - LLVMValueRef offset0 = LLVMBuildMul(ctx->builder, sample_id, LLVMConstInt(ctx->i32, 8, false), ""); - LLVMValueRef offset1 = LLVMBuildAdd(ctx->builder, offset0, LLVMConstInt(ctx->i32, 4, false), ""); - LLVMValueRef result[2]; + LLVMValueRef result; + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_PS_SAMPLE_POSITIONS, false)); - result[0] = ac_build_indexed_load_const(&ctx->ac, ctx->sample_positions, offset0); - result[1] = ac_build_indexed_load_const(&ctx->ac, ctx->sample_positions, offset1); + ptr = LLVMBuildBitCast(ctx->builder, ptr, + const_array(ctx->v2f32, 64), ""); - return ac_build_gather_values(&ctx->ac, result, 2); + sample_id = LLVMBuildAdd(ctx->builder, sample_id, ctx->sample_pos_offset, ""); + result = ac_build_indexed_load(&ctx->ac, ptr, sample_id, false); + + ctx->shader_info->fs.uses_sample_positions = true; + return result; } static LLVMValueRef load_sample_pos(struct nir_to_llvm_context *ctx) diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h index c468d93f428..3d0b456f9f5 100644 --- a/src/amd/common/ac_nir_to_llvm.h +++ b/src/amd/common/ac_nir_to_llvm.h @@ -88,7 +88,7 @@ enum ac_ud_index { AC_UD_VS_BASE_VERTEX_START_INSTANCE, AC_UD_VS_LS_TCS_IN_LAYOUT, AC_UD_VS_MAX_UD, - AC_UD_PS_SAMPLE_POS = AC_UD_SHADER_START, + AC_UD_PS_SAMPLE_POS_OFFSET = AC_UD_SHADER_START, AC_UD_PS_MAX_UD, AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START, AC_UD_CS_MAX_UD, @@ -109,6 +109,7 @@ enum ac_ud_index { #define RING_GSVS_GS 4 #define RING_HS_TESS_FACTOR 5 #define RING_HS_TESS_OFFCHIP 6 +#define RING_PS_SAMPLE_POSITIONS 7 // Match MAX_SETS from radv_descriptor_set.h #define AC_UD_MAX_SETS MAX_SETS @@ -165,6 +166,7 @@ struct ac_shader_variant_info { bool force_persample; bool prim_id_input; bool layer_input; + bool uses_sample_positions; } fs; struct { unsigned block_size[3]; diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index c95388e9fb2..b0a68548295 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -222,6 +222,7 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->esgs_ring_size_needed = 0; cmd_buffer->gsvs_ring_size_needed = 0; cmd_buffer->tess_rings_needed = false; + cmd_buffer->sample_positions_needed = false; if (cmd_buffer->upload.upload_bo) cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, @@ -452,37 +453,35 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, radv_cayman_emit_msaa_sample_locs(cmd_buffer->cs, num_samples); - uint32_t samples_offset; - void *samples_ptr; - void *src; - radv_cmd_buffer_upload_alloc(cmd_buffer, num_samples * 4 * 2, 256, &samples_offset, - &samples_ptr); - switch (num_samples) { - case 1: - src = cmd_buffer->device->sample_locations_1x; - break; - case 2: - src = cmd_buffer->device->sample_locations_2x; - break; - case 4: - src = cmd_buffer->device->sample_locations_4x; - break; - case 8: - src = cmd_buffer->device->sample_locations_8x; - break; - case 16: - src = cmd_buffer->device->sample_locations_16x; - break; - default: - unreachable("unknown number of samples"); - } - memcpy(samples_ptr, src, num_samples * 4 * 2); - - uint64_t va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo); - va += samples_offset; + if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.fs.uses_sample_positions) { + uint32_t offset; + struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_FRAGMENT, AC_UD_PS_SAMPLE_POS_OFFSET); + uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_FRAGMENT, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); + if (loc->sgpr_idx == -1) + return; + assert(loc->num_sgprs == 1); + assert(!loc->indirect); + switch (num_samples) { + default: + offset = 0; + break; + case 2: + offset = 1; + break; + case 4: + offset = 3; + break; + case 8: + offset = 7; + break; + case 16: + offset = 15; + break; + } - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT, - AC_UD_PS_SAMPLE_POS, va); + radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, offset); + cmd_buffer->sample_positions_needed = true; + } } static void @@ -2197,6 +2196,8 @@ void radv_CmdExecuteCommands( primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed; if (secondary->tess_rings_needed) primary->tess_rings_needed = true; + if (secondary->sample_positions_needed) + primary->sample_positions_needed = true; if (secondary->ring_offsets_idx != -1) { if (primary->ring_offsets_idx == -1) diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 5c48be1d11a..24219e4ec4f 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -1188,6 +1188,7 @@ static void radv_dump_trace(struct radv_device *device, static void fill_geom_tess_rings(struct radv_queue *queue, uint32_t *map, + bool add_sample_positions, uint32_t esgs_ring_size, struct radeon_winsys_bo *esgs_ring_bo, uint32_t gsvs_ring_size, @@ -1315,6 +1316,18 @@ fill_geom_tess_rings(struct radv_queue *queue, S_008F0C_ELEMENT_SIZE(0) | S_008F0C_INDEX_STRIDE(0) | S_008F0C_ADD_TID_ENABLE(false); + desc += 4; + + /* add sample positions after all rings */ + memcpy(desc, queue->device->sample_locations_1x, 8); + desc += 2; + memcpy(desc, queue->device->sample_locations_2x, 16); + desc += 4; + memcpy(desc, queue->device->sample_locations_4x, 32); + desc += 8; + memcpy(desc, queue->device->sample_locations_8x, 64); + desc += 16; + memcpy(desc, queue->device->sample_locations_16x, 128); } static unsigned @@ -1374,6 +1387,7 @@ radv_get_preamble_cs(struct radv_queue *queue, uint32_t esgs_ring_size, uint32_t gsvs_ring_size, bool needs_tess_rings, + bool needs_sample_positions, struct radeon_winsys_cs **initial_preamble_cs, struct radeon_winsys_cs **continue_preamble_cs) { @@ -1385,7 +1399,7 @@ radv_get_preamble_cs(struct radv_queue *queue, struct radeon_winsys_bo *tess_factor_ring_bo = NULL; struct radeon_winsys_bo *tess_offchip_ring_bo = NULL; struct radeon_winsys_cs *dest_cs[2] = {0}; - bool add_tess_rings = false; + bool add_tess_rings = false, add_sample_positions = false; unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0; unsigned max_offchip_buffers; unsigned hs_offchip_param = 0; @@ -1393,6 +1407,10 @@ radv_get_preamble_cs(struct radv_queue *queue, if (needs_tess_rings) add_tess_rings = true; } + if (!queue->has_sample_positions) { + if (needs_sample_positions) + add_sample_positions = true; + } tess_factor_ring_size = 32768 * queue->device->physical_device->rad_info.max_se; hs_offchip_param = radv_get_hs_offchip_param(queue->device, &max_offchip_buffers); @@ -1403,7 +1421,7 @@ radv_get_preamble_cs(struct radv_queue *queue, compute_scratch_size <= queue->compute_scratch_size && esgs_ring_size <= queue->esgs_ring_size && gsvs_ring_size <= queue->gsvs_ring_size && - !add_tess_rings && + !add_tess_rings && !add_sample_positions && queue->initial_preamble_cs) { *initial_preamble_cs = queue->initial_preamble_cs; *continue_preamble_cs = queue->continue_preamble_cs; @@ -1485,11 +1503,14 @@ radv_get_preamble_cs(struct radv_queue *queue, esgs_ring_bo != queue->esgs_ring_bo || gsvs_ring_bo != queue->gsvs_ring_bo || tess_factor_ring_bo != queue->tess_factor_ring_bo || - tess_offchip_ring_bo != queue->tess_offchip_ring_bo) { + tess_offchip_ring_bo != queue->tess_offchip_ring_bo || add_sample_positions) { uint32_t size = 0; if (gsvs_ring_bo || esgs_ring_bo || - tess_factor_ring_bo || tess_offchip_ring_bo) + tess_factor_ring_bo || tess_offchip_ring_bo || add_sample_positions) { size = 112; /* 2 dword + 2 padding + 4 dword * 6 */ + if (add_sample_positions) + size += 256; /* 32+16+8+4+2+1 samples * 4 * 2 = 248 bytes. */ + } else if (scratch_bo) size = 8; /* 2 dword */ @@ -1541,8 +1562,9 @@ radv_get_preamble_cs(struct radv_queue *queue, map[1] = rsrc1; } - if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo) - fill_geom_tess_rings(queue, map, + if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo || + add_sample_positions) + fill_geom_tess_rings(queue, map, add_sample_positions, esgs_ring_size, esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo, tess_factor_ring_size, tess_factor_ring_bo, @@ -1685,6 +1707,9 @@ radv_get_preamble_cs(struct radv_queue *queue, queue->descriptor_bo = descriptor_bo; } + if (add_sample_positions) + queue->has_sample_positions = true; + *initial_preamble_cs = queue->initial_preamble_cs; *continue_preamble_cs = queue->continue_preamble_cs; if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size) @@ -1730,6 +1755,7 @@ VkResult radv_QueueSubmit( VkResult result; bool fence_emitted = false; bool tess_rings_needed = false; + bool sample_positions_needed = false; /* Do this first so failing to allocate scratch buffers can't result in * partially executed submissions. */ @@ -1744,11 +1770,13 @@ VkResult radv_QueueSubmit( esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed); gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed); tess_rings_needed |= cmd_buffer->tess_rings_needed; + sample_positions_needed |= cmd_buffer->sample_positions_needed; } } result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, esgs_ring_size, gsvs_ring_size, tess_rings_needed, + sample_positions_needed, &initial_preamble_cs, &continue_preamble_cs); if (result != VK_SUCCESS) return result; diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 3f92d59ac97..a28c8250367 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -460,6 +460,7 @@ struct radv_queue { uint32_t esgs_ring_size; uint32_t gsvs_ring_size; bool has_tess_rings; + bool has_sample_positions; struct radeon_winsys_bo *scratch_bo; struct radeon_winsys_bo *descriptor_bo; @@ -748,6 +749,7 @@ struct radv_cmd_buffer { uint32_t esgs_ring_size_needed; uint32_t gsvs_ring_size_needed; bool tess_rings_needed; + bool sample_positions_needed; int ring_offsets_idx; /* just used for verification */ }; |