diff options
author | Nicolai Hähnle <[email protected]> | 2018-05-22 16:14:16 +0200 |
---|---|---|
committer | Marek Olšák <[email protected]> | 2019-06-12 20:28:23 -0400 |
commit | bf8a1ca90246b0c987444de28ecd907942126de6 (patch) | |
tree | 661afe3e1564a6fa852ad2387b400de796280559 | |
parent | 16bee0e5f6a70e959be27c253c4be130bb034030 (diff) |
radeonsi: use the new run-time linker for shaders
v2:
- fix a memory leak
Reviewed-by: Marek Olšák <[email protected]>
-rw-r--r-- | src/gallium/drivers/radeonsi/si_compute.c | 63 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_debug.c | 74 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.c | 2 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.h | 2 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.c | 293 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.h | 19 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader_internal.h | 3 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c | 14 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_shaders.c | 39 |
9 files changed, 272 insertions, 237 deletions
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 4a7ebac9ab7..7eadbcdd960 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -28,6 +28,7 @@ #include "util/u_memory.h" #include "util/u_upload_mgr.h" +#include "ac_rtld.h" #include "amd_kernel_code_t.h" #include "si_build_pm4.h" #include "si_compute.h" @@ -61,8 +62,26 @@ static const amd_kernel_code_t *si_compute_get_code_object( if (!program->use_code_object_v2) { return NULL; } - return (const amd_kernel_code_t*) - (program->shader.binary.code + symbol_offset); + + struct ac_rtld_binary rtld; + if (!ac_rtld_open(&rtld, 1, &program->shader.binary.elf_buffer, + &program->shader.binary.elf_size)) + return NULL; + + const amd_kernel_code_t *result = NULL; + const char *text; + size_t size; + if (!ac_rtld_get_section_by_name(&rtld, ".text", &text, &size)) + goto out; + + if (symbol_offset + sizeof(amd_kernel_code_t) > size) + goto out; + + result = (const amd_kernel_code_t*)(text + symbol_offset); + +out: + ac_rtld_close(&rtld); + return result; } static void code_object_to_config(const amd_kernel_code_t *code_object, @@ -145,7 +164,7 @@ static void si_create_compute_state_async(void *job, int thread_index) si_shader_dump(sscreen, shader, debug, PIPE_SHADER_COMPUTE, stderr, true); - if (!si_shader_binary_upload(sscreen, shader)) + if (!si_shader_binary_upload(sscreen, shader, 0)) program->shader.compilation_failed = true; } else { mtx_unlock(&sscreen->shader_cache_mutex); @@ -237,25 +256,23 @@ static void *si_create_compute_state( header = cso->prog; code = cso->prog + sizeof(struct pipe_llvm_program_header); - ac_elf_read(code, header->num_bytes, &program->shader.binary); - if (program->use_code_object_v2) { - const amd_kernel_code_t *code_object = - si_compute_get_code_object(program, 0); - code_object_to_config(code_object, &program->shader.config); - if (program->shader.binary.reloc_count != 0) { - fprintf(stderr, "Error: %d unsupported relocations\n", - program->shader.binary.reloc_count); - FREE(program); - return NULL; - } - } else { - ac_shader_binary_read_config(&program->shader.binary, - &program->shader.config, 0, false); + program->shader.binary.elf_size = header->num_bytes; + program->shader.binary.elf_buffer = malloc(header->num_bytes); + if (!program->shader.binary.elf_buffer) { + FREE(program); + return NULL; } + memcpy((void *)program->shader.binary.elf_buffer, code, header->num_bytes); + + const amd_kernel_code_t *code_object = + si_compute_get_code_object(program, 0); + code_object_to_config(code_object, &program->shader.config); + si_shader_dump(sctx->screen, &program->shader, &sctx->debug, PIPE_SHADER_COMPUTE, stderr, true); - if (!si_shader_binary_upload(sctx->screen, &program->shader)) { + if (!si_shader_binary_upload(sctx->screen, &program->shader, 0)) { fprintf(stderr, "LLVM failed to upload shader\n"); + free((void *)program->shader.binary.elf_buffer); FREE(program); return NULL; } @@ -390,9 +407,7 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx, if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) { uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address; - si_shader_apply_scratch_relocs(shader, scratch_va); - - if (!si_shader_binary_upload(sctx->screen, shader)) + if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) return false; si_resource_reference(&shader->scratch_bo, @@ -423,11 +438,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, unsigned lds_blocks; config = &inline_config; - if (code_object) { - code_object_to_config(code_object, config); - } else { - ac_shader_binary_read_config(&shader->binary, config, offset, false); - } + code_object_to_config(code_object, config); lds_blocks = config->lds_size; /* XXX: We are over allocating LDS. For GFX6, the shader reports diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index 9df3175aa3c..b11f7cc6e4f 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -32,6 +32,7 @@ #include "util/u_memory.h" #include "util/u_string.h" #include "ac_debug.h" +#include "ac_rtld.h" static void si_dump_bo_list(struct si_context *sctx, const struct radeon_saved_cs *saved, FILE *f); @@ -201,15 +202,16 @@ static void si_dump_compute_shader(struct si_context *ctx, /** * Shader compiles can be overridden with arbitrary ELF objects by setting * the environment variable RADEON_REPLACE_SHADERS=num1:filename1[;num2:filename2] + * + * TODO: key this off some hash */ -bool si_replace_shader(unsigned num, struct ac_shader_binary *binary) +bool si_replace_shader(unsigned num, struct si_shader_binary *binary) { const char *p = debug_get_option_replace_shaders(); const char *semicolon; char *copy = NULL; FILE *f; long filesize, nread; - char *buf = NULL; bool replaced = false; if (!p) @@ -265,23 +267,25 @@ bool si_replace_shader(unsigned num, struct ac_shader_binary *binary) if (fseek(f, 0, SEEK_SET) != 0) goto file_error; - buf = MALLOC(filesize); - if (!buf) { + binary->elf_buffer = MALLOC(filesize); + if (!binary->elf_buffer) { fprintf(stderr, "out of memory\n"); goto out_close; } - nread = fread(buf, 1, filesize, f); - if (nread != filesize) + nread = fread((void*)binary->elf_buffer, 1, filesize, f); + if (nread != filesize) { + FREE((void*)binary->elf_buffer); + binary->elf_buffer = NULL; goto file_error; + } - ac_elf_read(buf, filesize, binary); + binary->elf_size = nread; replaced = true; out_close: fclose(f); out_free: - FREE(buf); free(copy); return replaced; @@ -922,33 +926,52 @@ struct si_shader_inst { }; /** - * Split a disassembly string into instructions and add them to the array - * pointed to by \p instructions. + * Open the given \p binary as \p rtld_binary and split the contained + * disassembly string into instructions and add them to the array + * pointed to by \p instructions, which must be sufficiently large. * * Labels are considered to be part of the following instruction. + * + * The caller must keep \p rtld_binary alive as long as \p instructions are + * used and then close it afterwards. */ -static void si_add_split_disasm(const char *disasm, +static void si_add_split_disasm(struct ac_rtld_binary *rtld_binary, + struct si_shader_binary *binary, uint64_t *addr, unsigned *num, struct si_shader_inst *instructions) { - const char *semicolon; + if (!ac_rtld_open(rtld_binary, 1, &binary->elf_buffer, &binary->elf_size)) + return; + + const char *disasm; + size_t nbytes; + if (!ac_rtld_get_section_by_name(rtld_binary, ".AMDGPU.disasm", + &disasm, &nbytes)) + return; + + const char *end = disasm + nbytes; + while (disasm < end) { + const char *semicolon = memchr(disasm, ';', end - disasm); + if (!semicolon) + break; - while ((semicolon = strchr(disasm, ';'))) { struct si_shader_inst *inst = &instructions[(*num)++]; - const char *end = util_strchrnul(semicolon, '\n'); + const char *inst_end = memchr(semicolon + 1, '\n', end - semicolon - 1); + if (!inst_end) + inst_end = end; inst->text = disasm; - inst->textlen = end - disasm; + inst->textlen = inst_end - disasm; inst->addr = *addr; /* More than 16 chars after ";" means the instruction is 8 bytes long. */ - inst->size = end - semicolon > 16 ? 8 : 4; + inst->size = inst_end - semicolon > 16 ? 8 : 4; *addr += inst->size; - if (!(*end)) + if (inst_end == end) break; - disasm = end + 1; + disasm = inst_end + 1; } } @@ -961,7 +984,7 @@ static void si_print_annotated_shader(struct si_shader *shader, unsigned num_waves, FILE *f) { - if (!shader || !shader->binary.disasm_string) + if (!shader) return; uint64_t start_addr = shader->bo->gpu_address; @@ -985,25 +1008,26 @@ static void si_print_annotated_shader(struct si_shader *shader, */ unsigned num_inst = 0; uint64_t inst_addr = start_addr; + struct ac_rtld_binary rtld_binaries[5] = {}; struct si_shader_inst *instructions = calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst)); if (shader->prolog) { - si_add_split_disasm(shader->prolog->binary.disasm_string, + si_add_split_disasm(&rtld_binaries[0], &shader->prolog->binary, &inst_addr, &num_inst, instructions); } if (shader->previous_stage) { - si_add_split_disasm(shader->previous_stage->binary.disasm_string, + si_add_split_disasm(&rtld_binaries[1], &shader->previous_stage->binary, &inst_addr, &num_inst, instructions); } if (shader->prolog2) { - si_add_split_disasm(shader->prolog2->binary.disasm_string, + si_add_split_disasm(&rtld_binaries[2], &shader->prolog2->binary, &inst_addr, &num_inst, instructions); } - si_add_split_disasm(shader->binary.disasm_string, + si_add_split_disasm(&rtld_binaries[3], &shader->binary, &inst_addr, &num_inst, instructions); if (shader->epilog) { - si_add_split_disasm(shader->epilog->binary.disasm_string, + si_add_split_disasm(&rtld_binaries[4], &shader->epilog->binary, &inst_addr, &num_inst, instructions); } @@ -1041,6 +1065,8 @@ static void si_print_annotated_shader(struct si_shader *shader, fprintf(f, "\n\n"); free(instructions); + for (unsigned i = 0; i < ARRAY_SIZE(rtld_binaries); ++i) + ac_rtld_close(&rtld_binaries[i]); } static void si_dump_annotated_shaders(struct si_context *sctx, FILE *f) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 1faaa22ab0d..8527999645b 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -721,7 +721,7 @@ static void si_destroy_screen(struct pipe_screen* pscreen) struct si_shader_part *part = parts[i]; parts[i] = part->next; - ac_shader_binary_clean(&part->binary); + si_shader_binary_clean(&part->binary); FREE(part); } } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 20f769d09fd..d32feab52c2 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1296,7 +1296,7 @@ void si_log_compute_state(struct si_context *sctx, struct u_log_context *log); void si_init_debug_functions(struct si_context *sctx); void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring); -bool si_replace_shader(unsigned num, struct ac_shader_binary *binary); +bool si_replace_shader(unsigned num, struct si_shader_binary *binary); /* si_dma.c */ void si_init_dma_functions(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index ad965a11750..04944e8bdab 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -29,8 +29,10 @@ #include "tgsi/tgsi_util.h" #include "tgsi/tgsi_dump.h" +#include "ac_binary.h" #include "ac_exp_param.h" #include "ac_shader_util.h" +#include "ac_rtld.h" #include "ac_llvm_util.h" #include "si_shader_internal.h" #include "si_pipe.h" @@ -5045,168 +5047,157 @@ static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, ac_build_kill_if_false(&ctx->ac, bit); } -void si_shader_apply_scratch_relocs(struct si_shader *shader, - uint64_t scratch_va) -{ - unsigned i; - uint32_t scratch_rsrc_dword0 = scratch_va; - uint32_t scratch_rsrc_dword1 = - S_008F04_BASE_ADDRESS_HI(scratch_va >> 32); - - /* Enable scratch coalescing. */ - scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1); - - for (i = 0 ; i < shader->binary.reloc_count; i++) { - const struct ac_shader_reloc *reloc = - &shader->binary.relocs[i]; - if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { - util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset, - &scratch_rsrc_dword0, 4); - } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { - util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset, - &scratch_rsrc_dword1, 4); - } - } -} - /* For the UMR disassembler. */ #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ #define DEBUGGER_NUM_MARKERS 5 +static bool si_shader_binary_open(const struct si_shader *shader, + struct ac_rtld_binary *rtld) +{ + const char *part_elfs[5]; + size_t part_sizes[5]; + unsigned num_parts = 0; + +#define add_part(shader_or_part) \ + if (shader_or_part) { \ + part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \ + part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \ + num_parts++; \ + } + + add_part(shader->prolog); + add_part(shader->previous_stage); + add_part(shader->prolog2); + add_part(shader); + add_part(shader->epilog); + +#undef add_part + + return ac_rtld_open(rtld, num_parts, part_elfs, part_sizes); +} + static unsigned si_get_shader_binary_size(const struct si_shader *shader) { - unsigned size = shader->binary.code_size; - - if (shader->prolog) - size += shader->prolog->binary.code_size; - if (shader->previous_stage) - size += shader->previous_stage->binary.code_size; - if (shader->prolog2) - size += shader->prolog2->binary.code_size; - if (shader->epilog) - size += shader->epilog->binary.code_size; - return size + DEBUGGER_NUM_MARKERS * 4; -} - -bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) -{ - const struct ac_shader_binary *prolog = - shader->prolog ? &shader->prolog->binary : NULL; - const struct ac_shader_binary *previous_stage = - shader->previous_stage ? &shader->previous_stage->binary : NULL; - const struct ac_shader_binary *prolog2 = - shader->prolog2 ? &shader->prolog2->binary : NULL; - const struct ac_shader_binary *epilog = - shader->epilog ? &shader->epilog->binary : NULL; - const struct ac_shader_binary *mainb = &shader->binary; - unsigned bo_size = si_get_shader_binary_size(shader) + - (!epilog ? mainb->rodata_size : 0); - unsigned char *ptr; - - assert(!prolog || !prolog->rodata_size); - assert(!previous_stage || !previous_stage->rodata_size); - assert(!prolog2 || !prolog2->rodata_size); - assert((!prolog && !previous_stage && !prolog2 && !epilog) || - !mainb->rodata_size); - assert(!epilog || !epilog->rodata_size); + struct ac_rtld_binary rtld; + si_shader_binary_open(shader, &rtld); + return rtld.rx_size; +} + + +static bool si_get_external_symbol(void *data, const char *name, uint64_t *value) +{ + uint64_t *scratch_va = data; + + if (!strcmp(scratch_rsrc_dword0_symbol, name)) { + *value = (uint32_t)*scratch_va; + return true; + } + if (!strcmp(scratch_rsrc_dword1_symbol, name)) { + /* Enable scratch coalescing. */ + *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | + S_008F04_SWIZZLE_ENABLE(1); + if (HAVE_LLVM < 0x0800) { + /* Old LLVM created an R_ABS32_HI relocation for + * this symbol. */ + *value <<= 32; + } + return true; + } + + return false; +} + +bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader, + uint64_t scratch_va) +{ + struct ac_rtld_binary binary; + if (!si_shader_binary_open(shader, &binary)) + return false; si_resource_reference(&shader->bo, NULL); shader->bo = si_aligned_buffer_create(&sscreen->b, sscreen->cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY, PIPE_USAGE_IMMUTABLE, - align(bo_size, SI_CPDMA_ALIGNMENT), + align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256); if (!shader->bo) return false; /* Upload. */ - ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL, + struct ac_rtld_upload_info u = {}; + u.binary = &binary; + u.get_external_symbol = si_get_external_symbol; + u.cb_data = &scratch_va; + u.rx_va = shader->bo->gpu_address; + u.rx_ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL, PIPE_TRANSFER_READ_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED | RADEON_TRANSFER_TEMPORARY); + if (!u.rx_ptr) + return false; - /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are - * endian-independent. */ - if (prolog) { - memcpy(ptr, prolog->code, prolog->code_size); - ptr += prolog->code_size; - } - if (previous_stage) { - memcpy(ptr, previous_stage->code, previous_stage->code_size); - ptr += previous_stage->code_size; - } - if (prolog2) { - memcpy(ptr, prolog2->code, prolog2->code_size); - ptr += prolog2->code_size; - } - - memcpy(ptr, mainb->code, mainb->code_size); - ptr += mainb->code_size; - - if (epilog) { - memcpy(ptr, epilog->code, epilog->code_size); - ptr += epilog->code_size; - } else if (mainb->rodata_size > 0) { - memcpy(ptr, mainb->rodata, mainb->rodata_size); - ptr += mainb->rodata_size; - } - - /* Add end-of-code markers for the UMR disassembler. */ - uint32_t *ptr32 = (uint32_t*)ptr; - for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++) - ptr32[i] = DEBUGGER_END_OF_CODE_MARKER; + bool ok = ac_rtld_upload(&u); sscreen->ws->buffer_unmap(shader->bo->buf); - return true; + ac_rtld_close(&binary); + + return ok; } -static void si_shader_dump_disassembly(const struct ac_shader_binary *binary, +static void si_shader_dump_disassembly(const struct si_shader_binary *binary, struct pipe_debug_callback *debug, const char *name, FILE *file) { - char *line, *p; - unsigned i, count; + struct ac_rtld_binary rtld_binary; - if (binary->disasm_string) { - fprintf(file, "Shader %s disassembly:\n", name); - fprintf(file, "%s", binary->disasm_string); + if (!ac_rtld_open(&rtld_binary, 1, &binary->elf_buffer, &binary->elf_size)) + return; - if (debug && debug->debug_message) { - /* Very long debug messages are cut off, so send the - * disassembly one line at a time. This causes more - * overhead, but on the plus side it simplifies - * parsing of resulting logs. - */ - pipe_debug_message(debug, SHADER_INFO, - "Shader Disassembly Begin"); + const char *disasm; + size_t nbytes; - line = binary->disasm_string; - while (*line) { - p = util_strchrnul(line, '\n'); - count = p - line; + if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes)) + goto out; - if (count) { - pipe_debug_message(debug, SHADER_INFO, - "%.*s", count, line); - } + fprintf(file, "Shader %s disassembly:\n", name); + if (nbytes > INT_MAX) { + fprintf(file, "too long\n"); + goto out; + } - if (!*p) - break; - line = p + 1; + fprintf(file, "%*s", (int)nbytes, disasm); + + if (debug && debug->debug_message) { + /* Very long debug messages are cut off, so send the + * disassembly one line at a time. This causes more + * overhead, but on the plus side it simplifies + * parsing of resulting logs. + */ + pipe_debug_message(debug, SHADER_INFO, + "Shader Disassembly Begin"); + + uint64_t line = 0; + while (line < nbytes) { + int count = nbytes - line; + const char *nl = memchr(disasm + line, '\n', nbytes - line); + if (nl) + count = nl - disasm; + + if (count) { + pipe_debug_message(debug, SHADER_INFO, + "%.*s", count, disasm + line); } - pipe_debug_message(debug, SHADER_INFO, - "Shader Disassembly End"); - } - } else { - fprintf(file, "Shader %s binary:\n", name); - for (i = 0; i < binary->code_size; i += 4) { - fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i, - binary->code[i + 3], binary->code[i + 2], - binary->code[i + 1], binary->code[i]); + line += count + 1; } + + pipe_debug_message(debug, SHADER_INFO, + "Shader Disassembly End"); } + +out: + ac_rtld_close(&rtld_binary); } static void si_calculate_max_simd_waves(struct si_shader *shader) @@ -5398,8 +5389,21 @@ void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader, check_debug_option); } +bool si_shader_binary_read_config(struct si_shader_binary *binary, + struct ac_shader_config *conf) +{ + struct ac_rtld_binary rtld; + if (!ac_rtld_open(&rtld, 1, &binary->elf_buffer, &binary->elf_size)) + return false; + + bool ok = ac_rtld_read_config(&rtld, conf); + + ac_rtld_close(&rtld); + return ok; +} + static int si_compile_llvm(struct si_screen *sscreen, - struct ac_shader_binary *binary, + struct si_shader_binary *binary, struct ac_shader_config *conf, struct ac_llvm_compiler *compiler, LLVMModuleRef mod, @@ -5408,7 +5412,6 @@ static int si_compile_llvm(struct si_screen *sscreen, const char *name, bool less_optimized) { - int r = 0; unsigned count = p_atomic_inc_return(&sscreen->num_compilations); if (si_can_dump_shader(sscreen, processor)) { @@ -5428,13 +5431,14 @@ static int si_compile_llvm(struct si_screen *sscreen, } if (!si_replace_shader(count, binary)) { - r = si_llvm_compile(mod, binary, compiler, debug, - less_optimized); + unsigned r = si_llvm_compile(mod, binary, compiler, debug, + less_optimized); if (r) return r; } - ac_shader_binary_read_config(binary, conf, 0, false); + if (!si_shader_binary_read_config(binary, conf)) + return -1; /* Enable 64-bit and 16-bit denormals, because there is no performance * cost. @@ -5450,24 +5454,7 @@ static int si_compile_llvm(struct si_screen *sscreen, */ conf->float_mode |= V_00B028_FP_64_DENORMS; - FREE(binary->config); - FREE(binary->global_symbol_offsets); - binary->config = NULL; - binary->global_symbol_offsets = NULL; - - /* Some shaders can't have rodata because their binaries can be - * concatenated. - */ - if (binary->rodata_size && - (processor == PIPE_SHADER_VERTEX || - processor == PIPE_SHADER_TESS_CTRL || - processor == PIPE_SHADER_TESS_EVAL || - processor == PIPE_SHADER_FRAGMENT)) { - fprintf(stderr, "radeonsi: The shader can't have rodata."); - return -EINVAL; - } - - return r; + return 0; } static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret) @@ -5609,7 +5596,11 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, fprintf(stderr, "GS Copy Shader:\n"); si_shader_dump(sscreen, ctx.shader, debug, PIPE_SHADER_GEOMETRY, stderr, true); - ok = si_shader_binary_upload(sscreen, ctx.shader); + + if (!ctx.shader->config.scratch_bytes_per_wave) + ok = si_shader_binary_upload(sscreen, ctx.shader, 0); + else + ok = true; } si_llvm_dispose(&ctx); @@ -8011,7 +8002,7 @@ bool si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compil stderr, true); /* Upload. */ - if (!si_shader_binary_upload(sscreen, shader)) { + if (!si_shader_binary_upload(sscreen, shader, 0)) { fprintf(stderr, "LLVM failed to upload shader\n"); return false; } @@ -8027,7 +8018,7 @@ void si_shader_destroy(struct si_shader *shader) si_resource_reference(&shader->bo, NULL); if (!shader->is_binary_shared) - ac_shader_binary_clean(&shader->binary); + si_shader_binary_clean(&shader->binary); free(shader->shader_log); } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 145a03bd1ae..586460e2b4f 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -588,6 +588,13 @@ struct si_shader_info { unsigned max_simd_waves; }; +struct si_shader_binary { + const char *elf_buffer; + size_t elf_size; + + char *llvm_ir_string; +}; + struct si_shader { struct si_compiler_ctx_state compiler_ctx_state; @@ -612,7 +619,7 @@ struct si_shader { bool is_gs_copy_shader; /* The following data is all that's needed for binary shaders. */ - struct ac_shader_binary binary; + struct si_shader_binary binary; struct ac_shader_config config; struct si_shader_info info; @@ -669,7 +676,7 @@ struct si_shader { struct si_shader_part { struct si_shader_part *next; union si_shader_part_key key; - struct ac_shader_binary binary; + struct si_shader_binary binary; struct ac_shader_config config; }; @@ -690,7 +697,8 @@ void si_shader_destroy(struct si_shader *shader); unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index); unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, unsigned is_varying); -bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader); +bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader, + uint64_t scratch_va); void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader, struct pipe_debug_callback *debug, unsigned processor, FILE *f, bool check_debug_option); @@ -698,9 +706,10 @@ void si_shader_dump_stats_for_shader_db(const struct si_shader *shader, struct pipe_debug_callback *debug); void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size); -void si_shader_apply_scratch_relocs(struct si_shader *shader, - uint64_t scratch_va); const char *si_get_shader_name(const struct si_shader *shader, unsigned processor); +bool si_shader_binary_read_config(struct si_shader_binary *binary, + struct ac_shader_config *conf); +void si_shader_binary_clean(struct si_shader_binary *binary); /* si_shader_nir.c */ void si_nir_scan_shader(const struct nir_shader *nir, diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 6e21bc7c26b..4a7b059de9a 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -36,7 +36,6 @@ #include <llvm-c/TargetMachine.h> struct pipe_debug_callback; -struct ac_shader_binary; #define RADEON_LLVM_MAX_INPUT_SLOTS 32 #define RADEON_LLVM_MAX_INPUTS 32 * 4 @@ -243,7 +242,7 @@ void si_create_function(struct si_shader_context *ctx, LLVMTypeRef *returns, unsigned num_returns, struct si_function_info *fninfo, unsigned max_workgroup_size); -unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary, +unsigned si_llvm_compile(LLVMModuleRef M, struct si_shader_binary *binary, struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug, bool less_optimized); diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c index f70c41ca8c4..33b40685f04 100644 --- a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c +++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c @@ -80,7 +80,7 @@ static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context) * * @returns 0 for success, 1 for failure */ -unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary, +unsigned si_llvm_compile(LLVMModuleRef M, struct si_shader_binary *binary, struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug, bool less_optimized) @@ -100,7 +100,8 @@ unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary, LLVMContextSetDiagnosticHandler(llvm_ctx, si_diagnostic_handler, &diag); /* Compile IR. */ - if (!ac_compile_module_to_binary(passes, M, binary)) + if (!ac_compile_module_to_elf(passes, M, (char **)&binary->elf_buffer, + &binary->elf_size)) diag.retval = 1; if (diag.retval != 0) @@ -108,6 +109,15 @@ unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary, return diag.retval; } +void si_shader_binary_clean(struct si_shader_binary *binary) +{ + free((void *)binary->elf_buffer); + binary->elf_buffer = NULL; + + free(binary->llvm_ir_string); + binary->llvm_ir_string = NULL; +} + LLVMTypeRef tgsi2llvmtype(struct lp_build_tgsi_context *bld_base, enum tgsi_opcode_type type) { diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 77d1c014305..6e47f7f55ba 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -127,21 +127,21 @@ static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size) static void *si_get_shader_binary(struct si_shader *shader) { /* There is always a size of data followed by the data itself. */ - unsigned relocs_size = shader->binary.reloc_count * - sizeof(shader->binary.relocs[0]); - unsigned disasm_size = shader->binary.disasm_string ? - strlen(shader->binary.disasm_string) + 1 : 0; unsigned llvm_ir_size = shader->binary.llvm_ir_string ? strlen(shader->binary.llvm_ir_string) + 1 : 0; + + /* Refuse to allocate overly large buffers and guard against integer + * overflow. */ + if (shader->binary.elf_size > UINT_MAX / 4 || + llvm_ir_size > UINT_MAX / 4) + return NULL; + unsigned size = 4 + /* total size */ 4 + /* CRC32 of the data below */ align(sizeof(shader->config), 4) + align(sizeof(shader->info), 4) + - 4 + align(shader->binary.code_size, 4) + - 4 + align(shader->binary.rodata_size, 4) + - 4 + align(relocs_size, 4) + - 4 + align(disasm_size, 4) + + 4 + align(shader->binary.elf_size, 4) + 4 + align(llvm_ir_size, 4); void *buffer = CALLOC(1, size); uint32_t *ptr = (uint32_t*)buffer; @@ -154,10 +154,7 @@ static void *si_get_shader_binary(struct si_shader *shader) ptr = write_data(ptr, &shader->config, sizeof(shader->config)); ptr = write_data(ptr, &shader->info, sizeof(shader->info)); - ptr = write_chunk(ptr, shader->binary.code, shader->binary.code_size); - ptr = write_chunk(ptr, shader->binary.rodata, shader->binary.rodata_size); - ptr = write_chunk(ptr, shader->binary.relocs, relocs_size); - ptr = write_chunk(ptr, shader->binary.disasm_string, disasm_size); + ptr = write_chunk(ptr, shader->binary.elf_buffer, shader->binary.elf_size); ptr = write_chunk(ptr, shader->binary.llvm_ir_string, llvm_ir_size); assert((char *)ptr - (char *)buffer == size); @@ -175,6 +172,7 @@ static bool si_load_shader_binary(struct si_shader *shader, void *binary) uint32_t size = *ptr++; uint32_t crc32 = *ptr++; unsigned chunk_size; + unsigned elf_size; if (util_hash_crc32(ptr, size - 8) != crc32) { fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n"); @@ -183,13 +181,9 @@ static bool si_load_shader_binary(struct si_shader *shader, void *binary) ptr = read_data(ptr, &shader->config, sizeof(shader->config)); ptr = read_data(ptr, &shader->info, sizeof(shader->info)); - ptr = read_chunk(ptr, (void**)&shader->binary.code, - &shader->binary.code_size); - ptr = read_chunk(ptr, (void**)&shader->binary.rodata, - &shader->binary.rodata_size); - ptr = read_chunk(ptr, (void**)&shader->binary.relocs, &chunk_size); - shader->binary.reloc_count = chunk_size / sizeof(shader->binary.relocs[0]); - ptr = read_chunk(ptr, (void**)&shader->binary.disasm_string, &chunk_size); + ptr = read_chunk(ptr, (void**)&shader->binary.elf_buffer, + &elf_size); + shader->binary.elf_size = elf_size; ptr = read_chunk(ptr, (void**)&shader->binary.llvm_ir_string, &chunk_size); return true; @@ -3132,13 +3126,8 @@ static int si_update_scratch_buffer(struct si_context *sctx, assert(sctx->scratch_buffer); - if (shader->previous_stage) - si_shader_apply_scratch_relocs(shader->previous_stage, scratch_va); - - si_shader_apply_scratch_relocs(shader, scratch_va); - /* Replace the shader bo with a new bo that has the relocs applied. */ - if (!si_shader_binary_upload(sctx->screen, shader)) { + if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) { si_shader_unlock(shader); return -1; } |