From 5c92c21369ee3b4f52eb5aed183092ba3ee7e079 Mon Sep 17 00:00:00 2001 From: Marek Olšák Date: Sat, 11 Jun 2016 19:57:40 +0200 Subject: radeonsi: do compilation from si_create_shader_selector asynchronously MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Main shader parts and geometry shaders are compiled asynchronously by util_queue. si_create_shader_selector doesn't wait and returns. si_draw_vbo(si_shader_select) waits for completion. This has the best effect when shaders are compiled at app-loading time. It doesn't help much for shaders compiled on demand, even though VS+PS compilation should take as much time as the bigger one of the two. If an app creates more shaders, at most 4 threads will be used to compile them. Debug output disables this for shader stats to be printed in the correct order. (We could go even further and build variants asynchronously too, then emit draw calls without waiting and emit incomplete shader states, then force IB chaining to give the compiler more time, then sync the compilation at the IB flush and patch the IB with correct shader states. This is great for compilation before draw calls, but there are some difficulties such as scratch and tess states requiring the compiler output, and an on-disk shader cache will likely be a much better and simpler solution.) 
Reviewed-by: Nicolai Hähnle --- src/gallium/drivers/radeonsi/si_pipe.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'src/gallium/drivers/radeonsi/si_pipe.c') diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 06b32db43db..ee97bcfaea5 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -663,6 +663,13 @@ static void si_destroy_screen(struct pipe_screen* pscreen) if (!sscreen->b.ws->unref(sscreen->b.ws)) return; + if (util_queue_is_initialized(&sscreen->shader_compiler_queue)) + util_queue_destroy(&sscreen->shader_compiler_queue); + + for (i = 0; i < ARRAY_SIZE(sscreen->tm); i++) + if (sscreen->tm[i]) + LLVMDisposeTargetMachine(sscreen->tm[i]); + /* Free shader parts. */ for (i = 0; i < ARRAY_SIZE(parts); i++) { while (parts[i]) { @@ -710,6 +717,7 @@ static bool si_init_gs_info(struct si_screen *sscreen) struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) { struct si_screen *sscreen = CALLOC_STRUCT(si_screen); + unsigned num_cpus, num_compiler_threads, i; if (!sscreen) { return NULL; @@ -754,6 +762,16 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) if (debug_get_bool_option("RADEON_DUMP_SHADERS", false)) sscreen->b.debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | DBG_CS; + /* Only enable as many threads as we have target machines and CPUs. */ + num_cpus = sysconf(_SC_NPROCESSORS_ONLN); + num_compiler_threads = MIN2(num_cpus, ARRAY_SIZE(sscreen->tm)); + + for (i = 0; i < num_compiler_threads; i++) + sscreen->tm[i] = si_create_llvm_target_machine(sscreen); + + util_queue_init(&sscreen->shader_compiler_queue, "si_shader", + 32, num_compiler_threads); + /* Create the auxiliary context. This must be done last. */ sscreen->b.aux_context = sscreen->b.b.context_create(&sscreen->b.b, NULL, 0); -- cgit v1.2.3