diff options
Diffstat (limited to 'src/gallium/drivers')
143 files changed, 2957 insertions, 1574 deletions
diff --git a/src/gallium/drivers/freedreno/.gitignore b/src/gallium/drivers/freedreno/.gitignore new file mode 100644 index 00000000000..150f5d19f5b --- /dev/null +++ b/src/gallium/drivers/freedreno/.gitignore @@ -0,0 +1 @@ +ir3_compiler diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources index baae9144005..74ef4168655 100644 --- a/src/gallium/drivers/freedreno/Makefile.sources +++ b/src/gallium/drivers/freedreno/Makefile.sources @@ -128,6 +128,7 @@ ir3_SOURCES := \ ir3/ir3_group.c \ ir3/ir3.h \ ir3/ir3_legalize.c \ + ir3/ir3_nir.c \ ir3/ir3_nir.h \ ir3/ir3_nir_lower_if_else.c \ ir3/ir3_print.c \ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.c b/src/gallium/drivers/freedreno/a3xx/fd3_context.c index 74cbbf2edd8..e47bbff5643 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_context.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.c @@ -171,8 +171,8 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) fd3_query_context_init(pctx); - fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096, - 2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, 0); + fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0, + PIPE_USAGE_STREAM); return pctx; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index 24afbc9e956..e65a352e7f6 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -145,7 +145,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring, void *ptr; u_upload_alloc(fd3_ctx->border_color_uploader, - 0, 2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, &off, + 0, BORDER_COLOR_UPLOAD_SIZE, + BORDER_COLOR_UPLOAD_SIZE, &off, &fd3_ctx->border_color_buf, &ptr); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c index e53e0c56c9a..7d6365bbb6d 100644 --- 
a/src/gallium/drivers/freedreno/a4xx/fd4_context.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c @@ -171,8 +171,8 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) fd4_query_context_init(pctx); - fd4_ctx->border_color_uploader = u_upload_create(pctx, 4096, - 2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, 0); + fd4_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0, + PIPE_USAGE_STREAM); return pctx; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c index b9a28149722..bc62a5d9a4b 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c @@ -133,7 +133,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring, void *ptr; u_upload_alloc(fd4_ctx->border_color_uploader, - 0, 2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, &off, + 0, BORDER_COLOR_UPLOAD_SIZE, + BORDER_COLOR_UPLOAD_SIZE, &off, &fd4_ctx->border_color_buf, &ptr); diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index 571c8142bf7..418b71b95de 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -40,6 +40,8 @@ #include "freedreno_gmem.h" #include "freedreno_util.h" +#define BORDER_COLOR_UPLOAD_SIZE (2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE) + struct fd_vertex_stateobj; struct fd_texture_stateobj { diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 5bbe4016a2a..9d0cdd8e545 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -226,6 +226,8 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_GATHER_OFFSETS: case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: case PIPE_CAP_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case 
PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: case PIPE_CAP_POLYGON_OFFSET_CLAMP: case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: @@ -238,6 +240,11 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 0; case PIPE_CAP_MAX_VIEWPORTS: @@ -414,6 +421,8 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: + return 0; } debug_printf("unknown shader param %d\n", param); return 0; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c index d55daeefe06..481859efb17 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c @@ -40,6 +40,7 @@ #include "freedreno_util.h" #include "ir3_compiler.h" +#include "ir3_nir.h" #include "instr-a3xx.h" #include "ir3.h" @@ -105,10 +106,10 @@ int main(int argc, char **argv) const char *filename; struct tgsi_token toks[65536]; struct tgsi_parse_context parse; - struct ir3_compiler *compiler; struct ir3_shader_variant v; struct ir3_shader s; struct ir3_shader_key key = {}; + /* TODO cmdline option to target different gpus: */ unsigned gpu_id = 320; const char *info; void *ptr; @@ -228,7 +229,12 @@ int main(int argc, char **argv) if (!tgsi_text_translate(ptr, toks, Elements(toks))) errx(1, "could not parse `%s'", filename); - s.tokens = toks; + if (fd_mesa_debug & FD_DBG_OPTMSGS) + tgsi_dump(toks, 0); + + nir_shader *nir = ir3_tgsi_to_nir(toks); + s.compiler = ir3_compiler_create(gpu_id); + s.nir = 
ir3_optimize_nir(&s, nir, NULL); v.key = key; v.shader = &s; @@ -246,11 +252,8 @@ int main(int argc, char **argv) break; } - /* TODO cmdline option to target different gpus: */ - compiler = ir3_compiler_create(gpu_id); - info = "NIR compiler"; - ret = ir3_compile_shader_nir(compiler, &v); + ret = ir3_compile_shader_nir(s.compiler, &v); if (ret) { fprintf(stderr, "compiler failed!\n"); return ret; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 224f7806b3c..86afda4ba08 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -32,10 +32,6 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "tgsi/tgsi_lowering.h" -#include "tgsi/tgsi_strings.h" - -#include "nir/tgsi_to_nir.h" #include "freedreno_util.h" @@ -123,97 +119,10 @@ struct ir3_compile { static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val); static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock); -static struct nir_shader *to_nir(struct ir3_compile *ctx, - const struct tgsi_token *tokens, struct ir3_shader_variant *so) -{ - static const nir_shader_compiler_options options = { - .lower_fpow = true, - .lower_fsat = true, - .lower_scmp = true, - .lower_flrp = true, - .lower_ffract = true, - .native_integers = true, - }; - struct nir_lower_tex_options tex_options = { - .lower_rect = 0, - }; - bool progress; - - switch (so->type) { - case SHADER_FRAGMENT: - case SHADER_COMPUTE: - tex_options.saturate_s = so->key.fsaturate_s; - tex_options.saturate_t = so->key.fsaturate_t; - tex_options.saturate_r = so->key.fsaturate_r; - break; - case SHADER_VERTEX: - tex_options.saturate_s = so->key.vsaturate_s; - tex_options.saturate_t = so->key.vsaturate_t; - tex_options.saturate_r = so->key.vsaturate_r; - break; - } - - if (ctx->compiler->gpu_id >= 400) { - /* a4xx seems to have *no* 
sam.p */ - tex_options.lower_txp = ~0; /* lower all txp */ - } else { - /* a3xx just needs to avoid sam.p for 3d tex */ - tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D); - } - - struct nir_shader *s = tgsi_to_nir(tokens, &options); - - if (fd_mesa_debug & FD_DBG_DISASM) { - debug_printf("----------------------\n"); - nir_print_shader(s, stdout); - debug_printf("----------------------\n"); - } - - nir_opt_global_to_local(s); - nir_convert_to_ssa(s); - if (s->stage == MESA_SHADER_VERTEX) { - nir_lower_clip_vs(s, so->key.ucp_enables); - } else if (s->stage == MESA_SHADER_FRAGMENT) { - nir_lower_clip_fs(s, so->key.ucp_enables); - } - nir_lower_tex(s, &tex_options); - if (so->key.color_two_side) - nir_lower_two_sided_color(s); - nir_lower_idiv(s); - nir_lower_load_const_to_scalar(s); - - do { - progress = false; - - nir_lower_vars_to_ssa(s); - nir_lower_alu_to_scalar(s); - nir_lower_phis_to_scalar(s); - - progress |= nir_copy_prop(s); - progress |= nir_opt_dce(s); - progress |= nir_opt_cse(s); - progress |= ir3_nir_lower_if_else(s); - progress |= nir_opt_algebraic(s); - progress |= nir_opt_constant_folding(s); - - } while (progress); - - nir_remove_dead_variables(s); - nir_validate_shader(s); - - if (fd_mesa_debug & FD_DBG_DISASM) { - debug_printf("----------------------\n"); - nir_print_shader(s, stdout); - debug_printf("----------------------\n"); - } - - return s; -} static struct ir3_compile * compile_init(struct ir3_compiler *compiler, - struct ir3_shader_variant *so, - const struct tgsi_token *tokens) + struct ir3_shader_variant *so) { struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile); @@ -239,7 +148,28 @@ compile_init(struct ir3_compiler *compiler, ctx->block_ht = _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); - ctx->s = to_nir(ctx, tokens, so); + /* TODO: maybe generate some sort of bitmask of what key + * lowers vs what shader has (ie. no need to lower + * texture clamp lowering if no texture sample instrs).. 
+ * although should be done further up the stack to avoid + * creating duplicate variants.. + */ + + if (ir3_key_lowers_nir(&so->key)) { + nir_shader *s = nir_shader_clone(ctx, so->shader->nir); + ctx->s = ir3_optimize_nir(so->shader, s, &so->key); + } else { + /* fast-path for shader key that lowers nothing in NIR: */ + ctx->s = so->shader->nir; + } + + if (fd_mesa_debug & FD_DBG_DISASM) { + DBG("dump nir%dv%d: type=%d, k={bp=%u,cts=%u,hp=%u}", + so->shader->id, so->id, so->type, + so->key.binning_pass, so->key.color_two_side, + so->key.half_precision); + nir_print_shader(ctx->s, stdout); + } so->first_driver_param = so->first_immediate = ctx->s->num_uniforms; @@ -1954,8 +1884,6 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr) case nir_texop_query_levels: emit_tex_query_levels(ctx, tex); break; - case nir_texop_samples_identical: - unreachable("nir_texop_samples_identical"); default: emit_tex(ctx, tex); break; @@ -2170,6 +2098,8 @@ emit_stream_out(struct ir3_compile *ctx) static void emit_function(struct ir3_compile *ctx, nir_function_impl *impl) { + nir_metadata_require(impl, nir_metadata_block_index); + emit_cf_list(ctx, &impl->body); emit_block(ctx, impl->end_block); @@ -2499,7 +2429,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, assert(!so->ir); - ctx = compile_init(compiler, so, so->shader->tokens); + ctx = compile_init(compiler, so); if (!ctx) { DBG("INIT failed!"); ret = -1; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c new file mode 100644 index 00000000000..565b9c32c1d --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c @@ -0,0 +1,153 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2015 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without 
restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + + +#include "freedreno_util.h" + +#include "ir3_nir.h" +#include "ir3_compiler.h" +#include "ir3_shader.h" + +#include "nir/tgsi_to_nir.h" + +struct nir_shader * +ir3_tgsi_to_nir(const struct tgsi_token *tokens) +{ + static const nir_shader_compiler_options options = { + .lower_fpow = true, + .lower_fsat = true, + .lower_scmp = true, + .lower_flrp = true, + .lower_ffract = true, + .native_integers = true, + }; + return tgsi_to_nir(tokens, &options); +} + +/* for given shader key, are any steps handled in nir? */ +bool +ir3_key_lowers_nir(const struct ir3_shader_key *key) +{ + return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r | + key->vsaturate_s | key->vsaturate_t | key->vsaturate_r | + key->ucp_enables | key->color_two_side; +} + +#define OPT(nir, pass, ...) ({ \ + bool this_progress = false; \ + NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ + this_progress; \ +}) + +#define OPT_V(nir, pass, ...) 
NIR_PASS_V(nir, pass, ##__VA_ARGS__) + +struct nir_shader * +ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s, + const struct ir3_shader_key *key) +{ + struct nir_lower_tex_options tex_options = { + .lower_rect = 0, + }; + bool progress; + + if (key) { + switch (shader->type) { + case SHADER_FRAGMENT: + case SHADER_COMPUTE: + tex_options.saturate_s = key->fsaturate_s; + tex_options.saturate_t = key->fsaturate_t; + tex_options.saturate_r = key->fsaturate_r; + break; + case SHADER_VERTEX: + tex_options.saturate_s = key->vsaturate_s; + tex_options.saturate_t = key->vsaturate_t; + tex_options.saturate_r = key->vsaturate_r; + break; + } + } + + if (shader->compiler->gpu_id >= 400) { + /* a4xx seems to have *no* sam.p */ + tex_options.lower_txp = ~0; /* lower all txp */ + } else { + /* a3xx just needs to avoid sam.p for 3d tex */ + tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D); + } + + if (fd_mesa_debug & FD_DBG_DISASM) { + debug_printf("----------------------\n"); + nir_print_shader(s, stdout); + debug_printf("----------------------\n"); + } + + OPT_V(s, nir_opt_global_to_local); + OPT_V(s, nir_convert_to_ssa); + + if (key) { + if (s->stage == MESA_SHADER_VERTEX) { + OPT_V(s, nir_lower_clip_vs, key->ucp_enables); + } else if (s->stage == MESA_SHADER_FRAGMENT) { + OPT_V(s, nir_lower_clip_fs, key->ucp_enables); + } + if (key->color_two_side) { + OPT_V(s, nir_lower_two_sided_color); + } + } + + OPT_V(s, nir_lower_tex, &tex_options); + OPT_V(s, nir_lower_idiv); + OPT_V(s, nir_lower_load_const_to_scalar); + + do { + progress = false; + + OPT_V(s, nir_lower_vars_to_ssa); + OPT_V(s, nir_lower_alu_to_scalar); + OPT_V(s, nir_lower_phis_to_scalar); + + progress |= OPT(s, nir_copy_prop); + progress |= OPT(s, nir_opt_dce); + progress |= OPT(s, nir_opt_cse); + progress |= OPT(s, ir3_nir_lower_if_else); + progress |= OPT(s, nir_opt_algebraic); + progress |= OPT(s, nir_opt_constant_folding); + + } while (progress); + + OPT_V(s, nir_remove_dead_variables); + + if 
(fd_mesa_debug & FD_DBG_DISASM) { + debug_printf("----------------------\n"); + nir_print_shader(s, stdout); + debug_printf("----------------------\n"); + } + + nir_sweep(s); + + return s; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.h b/src/gallium/drivers/freedreno/ir3/ir3_nir.h index 9950782dc38..534199d3744 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_nir.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.h @@ -32,6 +32,13 @@ #include "glsl/nir/nir.h" #include "glsl/nir/shader_enums.h" +#include "ir3_shader.h" + bool ir3_nir_lower_if_else(nir_shader *shader); +struct nir_shader * ir3_tgsi_to_nir(const struct tgsi_token *tokens); +bool ir3_key_lowers_nir(const struct ir3_shader_key *key); +struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s, + const struct ir3_shader_key *key); + #endif /* IR3_NIR_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c index 7b565332256..7d17f426ad3 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c @@ -39,7 +39,7 @@ #include "ir3_shader.h" #include "ir3_compiler.h" - +#include "ir3_nir.h" static void delete_variant(struct ir3_shader_variant *v) @@ -187,12 +187,6 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key) v->key = key; v->type = shader->type; - if (fd_mesa_debug & FD_DBG_DISASM) { - DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", shader->type, - key.binning_pass, key.color_two_side, key.half_precision); - tgsi_dump(shader->tokens, 0); - } - ret = ir3_compile_shader_nir(shader->compiler, v); if (ret) { debug_error("compile failed!"); @@ -267,7 +261,7 @@ ir3_shader_destroy(struct ir3_shader *shader) v = v->next; delete_variant(t); } - free((void *)shader->tokens); + ralloc_free(shader->nir); free(shader); } @@ -281,14 +275,24 @@ ir3_shader_create(struct pipe_context *pctx, shader->id = ++shader->compiler->shader_count; shader->pctx = 
pctx; shader->type = type; - shader->tokens = tgsi_dup_tokens(cso->tokens); + if (fd_mesa_debug & FD_DBG_DISASM) { + DBG("dump tgsi: type=%d", shader->type); + tgsi_dump(cso->tokens, 0); + } + nir_shader *nir = ir3_tgsi_to_nir(cso->tokens); + /* do first pass optimization, ignoring the key: */ + shader->nir = ir3_optimize_nir(shader, nir, NULL); + if (fd_mesa_debug & FD_DBG_DISASM) { + DBG("dump nir%d: type=%d", shader->id, shader->type); + nir_print_shader(shader->nir, stdout); + } shader->stream_output = cso->stream_output; if (fd_mesa_debug & FD_DBG_SHADERDB) { /* if shader-db run, create a standard variant immediately * (as otherwise nothing will trigger the shader to be * actually compiled) */ - static struct ir3_shader_key key = {}; + static struct ir3_shader_key key = {0}; ir3_shader_variant(shader, key); } return shader; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h index cf99a4c05ed..b3c28a41387 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -230,6 +230,8 @@ struct ir3_shader_variant { struct ir3_shader *shader; }; +typedef struct nir_shader nir_shader; + struct ir3_shader { enum shader_t type; @@ -240,7 +242,7 @@ struct ir3_shader { struct ir3_compiler *compiler; struct pipe_context *pctx; /* TODO replace w/ pipe_screen */ - const struct tgsi_token *tokens; + nir_shader *nir; struct pipe_stream_output_info stream_output; struct ir3_shader_variant *variants; diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h index 1ed685188db..2adaee30fb9 100644 --- a/src/gallium/drivers/i915/i915_context.h +++ b/src/gallium/drivers/i915/i915_context.h @@ -195,7 +195,6 @@ struct i915_rasterizer_state { unsigned light_twoside : 1; unsigned st; - enum interp_mode color_interp; unsigned LIS4; unsigned LIS7; diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c 
index a5b161882cd..e2a493bc1b5 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -254,6 +254,11 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 0; case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: @@ -264,6 +269,8 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: case PIPE_CAP_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: case PIPE_CAP_SAMPLER_VIEW_TARGET: return 0; diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c index 6ba9646f7ab..b54a9fbf4f9 100644 --- a/src/gallium/drivers/i915/i915_state.c +++ b/src/gallium/drivers/i915/i915_state.c @@ -423,7 +423,7 @@ i915_prepare_vertex_sampling(struct i915_context *i915) for (j = view->u.tex.first_level; j <= tex->last_level; j++) { mip_offsets[j] = i915_texture_offset(i915_tex, j , 0 /* FIXME depth */); row_stride[j] = i915_tex->stride; - img_stride[j] = 0; /* FIXME */; + img_stride[j] = 0; /* FIXME */ } draw_set_mapped_texture(i915->draw, @@ -920,7 +920,6 @@ i915_create_rasterizer_state(struct pipe_context *pipe, struct i915_rasterizer_state *cso = CALLOC_STRUCT( i915_rasterizer_state ); cso->templ = *rasterizer; - cso->color_interp = rasterizer->flatshade ? 
INTERP_CONSTANT : INTERP_LINEAR; cso->light_twoside = rasterizer->light_twoside; cso->ds[0].u = _3DSTATE_DEPTH_OFFSET_SCALE; cso->ds[1].f = rasterizer->offset_scale; diff --git a/src/gallium/drivers/i915/i915_state_derived.c b/src/gallium/drivers/i915/i915_state_derived.c index 7ad88a1ce01..bd0f448f645 100644 --- a/src/gallium/drivers/i915/i915_state_derived.c +++ b/src/gallium/drivers/i915/i915_state_derived.c @@ -57,7 +57,6 @@ static uint find_mapping(const struct i915_fragment_shader* fs, int unit) static void calculate_vertex_layout(struct i915_context *i915) { const struct i915_fragment_shader *fs = i915->fs; - const enum interp_mode colorInterp = i915->rasterizer->color_interp; struct vertex_info vinfo; boolean texCoords[I915_TEX_UNITS], colors[2], fog, needW, face; uint i; @@ -107,12 +106,12 @@ static void calculate_vertex_layout(struct i915_context *i915) /* pos */ src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_POSITION, 0); if (needW) { - draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src); + draw_emit_vertex_attr(&vinfo, EMIT_4F, src); vinfo.hwfmt[0] |= S4_VFMT_XYZW; vinfo.attrib[0].emit = EMIT_4F; } else { - draw_emit_vertex_attr(&vinfo, EMIT_3F, INTERP_LINEAR, src); + draw_emit_vertex_attr(&vinfo, EMIT_3F, src); vinfo.hwfmt[0] |= S4_VFMT_XYZ; vinfo.attrib[0].emit = EMIT_3F; } @@ -123,21 +122,21 @@ static void calculate_vertex_layout(struct i915_context *i915) /* primary color */ if (colors[0]) { src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 0); - draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, colorInterp, src); + draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, src); vinfo.hwfmt[0] |= S4_VFMT_COLOR; } /* secondary color */ if (colors[1]) { src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 1); - draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, colorInterp, src); + draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, src); vinfo.hwfmt[0] |= S4_VFMT_SPEC_FOG; } /* fog coord, not fog blend factor */ if (fog) { src = 
draw_find_shader_output(i915->draw, TGSI_SEMANTIC_FOG, 0); - draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_PERSPECTIVE, src); + draw_emit_vertex_attr(&vinfo, EMIT_1F, src); vinfo.hwfmt[0] |= S4_VFMT_FOG_PARAM; } @@ -147,7 +146,7 @@ static void calculate_vertex_layout(struct i915_context *i915) if (texCoords[i]) { hwtc = TEXCOORDFMT_4D; src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_GENERIC, fs->generic_mapping[i]); - draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); + draw_emit_vertex_attr(&vinfo, EMIT_4F, src); } else { hwtc = TEXCOORDFMT_NOT_PRESENT; @@ -164,7 +163,7 @@ static void calculate_vertex_layout(struct i915_context *i915) * module by adding an extra shader output. */ src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_FACE, 0); - draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_CONSTANT, src); + draw_emit_vertex_attr(&vinfo, EMIT_1F, src); vinfo.hwfmt[1] &= ~(TEXCOORDFMT_NOT_PRESENT << (slot * 4)); vinfo.hwfmt[1] |= TEXCOORDFMT_1D << (slot * 4); } @@ -185,7 +184,7 @@ static void calculate_vertex_layout(struct i915_context *i915) struct i915_tracked_state i915_update_vertex_layout = { "vertex_layout", calculate_vertex_layout, - I915_NEW_RASTERIZER | I915_NEW_FS | I915_NEW_VS + I915_NEW_FS | I915_NEW_VS }; diff --git a/src/gallium/drivers/ilo/core/ilo_builder.c b/src/gallium/drivers/ilo/core/ilo_builder.c index 9d5195129b7..079872f4306 100644 --- a/src/gallium/drivers/ilo/core/ilo_builder.c +++ b/src/gallium/drivers/ilo/core/ilo_builder.c @@ -333,7 +333,7 @@ ilo_builder_init(struct ilo_builder *builder, const struct ilo_dev *dev, struct intel_winsys *winsys) { - int i; + unsigned i; assert(ilo_is_zeroed(builder, sizeof(*builder))); @@ -366,7 +366,7 @@ ilo_builder_init(struct ilo_builder *builder, void ilo_builder_reset(struct ilo_builder *builder) { - int i; + unsigned i; for (i = 0; i < ILO_BUILDER_WRITER_COUNT; i++) ilo_builder_writer_reset(builder, i); @@ -382,7 +382,7 @@ ilo_builder_reset(struct ilo_builder *builder) bool 
ilo_builder_begin(struct ilo_builder *builder) { - int i; + unsigned i; for (i = 0; i < ILO_BUILDER_WRITER_COUNT; i++) { if (!ilo_builder_writer_alloc_and_map(builder, i)) { @@ -407,7 +407,7 @@ struct intel_bo * ilo_builder_end(struct ilo_builder *builder, unsigned *used) { struct ilo_builder_writer *bat; - int i; + unsigned i; ilo_builder_batch_patch_sba(builder); diff --git a/src/gallium/drivers/ilo/ilo_context.c b/src/gallium/drivers/ilo/ilo_context.c index 2a00cf1c93c..6bcd0bcb8f5 100644 --- a/src/gallium/drivers/ilo/ilo_context.c +++ b/src/gallium/drivers/ilo/ilo_context.c @@ -189,8 +189,9 @@ ilo_context_create(struct pipe_screen *screen, void *priv, unsigned flags) * These must be called last as u_upload/u_blitter are clients of the pipe * context. */ - ilo->uploader = u_upload_create(&ilo->base, 1024 * 1024, 16, - PIPE_BIND_CONSTANT_BUFFER | PIPE_BIND_INDEX_BUFFER); + ilo->uploader = u_upload_create(&ilo->base, 1024 * 1024, + PIPE_BIND_CONSTANT_BUFFER | PIPE_BIND_INDEX_BUFFER, + PIPE_USAGE_STREAM); if (!ilo->uploader) { ilo_context_destroy(&ilo->base); return NULL; diff --git a/src/gallium/drivers/ilo/ilo_gpgpu.c b/src/gallium/drivers/ilo/ilo_gpgpu.c index 9a2ca007f80..b7415901a88 100644 --- a/src/gallium/drivers/ilo/ilo_gpgpu.c +++ b/src/gallium/drivers/ilo/ilo_gpgpu.c @@ -92,7 +92,7 @@ ilo_launch_grid(struct pipe_context *pipe, input_buf.buffer_size = ilo_shader_get_kernel_param(cs, ILO_KERNEL_CS_INPUT_SIZE); if (input_buf.buffer_size) { - u_upload_data(ilo->uploader, 0, input_buf.buffer_size, input, + u_upload_data(ilo->uploader, 0, input_buf.buffer_size, 16, input, &input_buf.buffer_offset, &input_buf.buffer); } diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c index cfa2fb41152..d5a82ce80ae 100644 --- a/src/gallium/drivers/ilo/ilo_screen.c +++ b/src/gallium/drivers/ilo/ilo_screen.c @@ -463,6 +463,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: case 
PIPE_CAP_MAX_VERTEX_STREAMS: case PIPE_CAP_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: case PIPE_CAP_SAMPLER_VIEW_TARGET: @@ -476,6 +478,11 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c index d89765a9d23..8dc2d38e039 100644 --- a/src/gallium/drivers/ilo/ilo_state.c +++ b/src/gallium/drivers/ilo/ilo_state.c @@ -376,7 +376,7 @@ finalize_cbuf_state(struct ilo_context *ilo, if (cbuf->cso[i].resource) continue; - u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size, + u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size, 16, cbuf->cso[i].user_buffer, &offset, &cbuf->cso[i].resource); cbuf->cso[i].info.vma = ilo_resource_get_vma(cbuf->cso[i].resource); @@ -426,12 +426,12 @@ finalize_index_buffer(struct ilo_context *ilo) unsigned hw_offset; if (vec->ib.state.user_buffer) { - u_upload_data(ilo->uploader, 0, size, + u_upload_data(ilo->uploader, 0, size, 16, vec->ib.state.user_buffer + offset, &hw_offset, &vec->ib.hw_resource); } else { u_upload_buffer(ilo->uploader, 0, - vec->ib.state.offset + offset, size, vec->ib.state.buffer, + vec->ib.state.offset + offset, size, 16, vec->ib.state.buffer, &hw_offset, &vec->ib.hw_resource); } diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c index 5250115a893..f46126e8427 100644 --- a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c +++ 
b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c @@ -266,7 +266,7 @@ fs_lower_opcode_tgsi_indirect_const(struct fs_compile_context *fcc, struct toy_inst *inst; struct toy_src desc, real_src[4]; struct toy_dst tmp, real_dst[4]; - int i; + unsigned i; tsrc_transpose(idx, real_src); @@ -319,7 +319,7 @@ fs_lower_opcode_tgsi_const_pcb(struct fs_compile_context *fcc, const int grf_subreg = (idx.val32 & 1) * 16; struct toy_src src; struct toy_dst real_dst[4]; - int i; + unsigned i; if (!fcc->variant->use_pcb || dim != 0 || idx.file != TOY_FILE_IMM || grf >= fcc->first_attr_grf) @@ -350,7 +350,7 @@ fs_lower_opcode_tgsi_const_gen6(struct fs_compile_context *fcc, struct toy_inst *inst; struct toy_src desc; struct toy_dst tmp, real_dst[4]; - int i; + unsigned i; if (fs_lower_opcode_tgsi_const_pcb(fcc, dst, dim, idx)) return; @@ -396,7 +396,7 @@ fs_lower_opcode_tgsi_const_gen7(struct fs_compile_context *fcc, struct toy_src desc; struct toy_inst *inst; struct toy_dst tmp, real_dst[4]; - int i; + unsigned i; if (fs_lower_opcode_tgsi_const_pcb(fcc, dst, dim, idx)) return; @@ -1168,7 +1168,7 @@ fs_lower_opcode_derivative(struct toy_compiler *tc, struct toy_inst *inst) { struct toy_dst dst[4]; struct toy_src src[4]; - int i; + unsigned i; tdst_transpose(inst->dst, dst); tsrc_transpose(inst->src[0], src); @@ -1257,7 +1257,7 @@ fs_lower_opcode_kil(struct toy_compiler *tc, struct toy_inst *inst) } else { struct toy_src src[4]; - int i; + unsigned i; tsrc_transpose(inst->src[0], src); /* mask out killed pixels */ @@ -1583,7 +1583,7 @@ fs_write_fb(struct fs_compile_context *fcc) static void fs_setup_shader_out(struct ilo_shader *sh, const struct toy_tgsi *tgsi) { - int i; + unsigned i; sh->out.count = tgsi->num_outputs; for (i = 0; i < tgsi->num_outputs; i++) { @@ -1603,7 +1603,7 @@ static void fs_setup_shader_in(struct ilo_shader *sh, const struct toy_tgsi *tgsi, bool flatshade) { - int i; + unsigned i; sh->in.count = tgsi->num_inputs; for (i = 0; i < tgsi->num_inputs; i++) { diff 
--git a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c index a29baab10c1..0df0afc706b 100644 --- a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c +++ b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c @@ -126,7 +126,7 @@ vs_lower_opcode_tgsi_const_gen6(struct vs_compile_context *vcc, tc_MOV(tc, block_offsets, idx); msg_type = GEN6_MSG_DP_OWORD_DUAL_BLOCK_READ; - msg_ctrl = GEN6_MSG_DP_OWORD_DUAL_BLOCK_SIZE_1;; + msg_ctrl = GEN6_MSG_DP_OWORD_DUAL_BLOCK_SIZE_1; msg_len = 2; desc = tsrc_imm_mdesc_data_port(tc, false, msg_len, 1, true, false, @@ -522,7 +522,7 @@ vs_prepare_tgsi_sampling(struct vs_compile_context *vcc, if (num_coords >= 3) { struct toy_dst tmp, max; struct toy_src abs_coords[3]; - int i; + unsigned i; tmp = tc_alloc_tmp(tc); max = tdst_writemask(tmp, TOY_WRITEMASK_W); @@ -804,7 +804,7 @@ static int vs_collect_outputs(struct vs_compile_context *vcc, struct toy_src *outs) { const struct toy_tgsi *tgsi = &vcc->tgsi; - int i; + unsigned i; for (i = 0; i < vcc->shader->out.count; i++) { const int slot = vcc->output_map[i]; diff --git a/src/gallium/drivers/ilo/shader/toy_legalize_ra.c b/src/gallium/drivers/ilo/shader/toy_legalize_ra.c index b725375fb67..1874faa6be3 100644 --- a/src/gallium/drivers/ilo/shader/toy_legalize_ra.c +++ b/src/gallium/drivers/ilo/shader/toy_legalize_ra.c @@ -70,7 +70,7 @@ struct linear_scan { static void linear_scan_free_regs(struct linear_scan *ls, int reg, int count) { - int i; + unsigned i; for (i = 0; i < count; i++) ls->free_regs[ls->num_free_regs++] = reg + count - 1 - i; @@ -221,7 +221,7 @@ linear_scan_spill(struct linear_scan *ls, static void linear_scan_spill_range(struct linear_scan *ls, int first, int count) { - int i; + unsigned i; for (i = 0; i < count; i++) { struct linear_scan_live_interval *interval = &ls->intervals[first + i]; diff --git a/src/gallium/drivers/ilo/shader/toy_tgsi.c b/src/gallium/drivers/ilo/shader/toy_tgsi.c index d38585f1475..9a7140b9a9b 100644 --- 
a/src/gallium/drivers/ilo/shader/toy_tgsi.c +++ b/src/gallium/drivers/ilo/shader/toy_tgsi.c @@ -1593,7 +1593,7 @@ ra_get_type(struct toy_tgsi *tgsi, const struct tgsi_full_instruction *tgsi_inst tgsi_inst->Src[operand].Register.File; switch (file) { case TGSI_FILE_SAMPLER: - case TGSI_FILE_RESOURCE: + case TGSI_FILE_IMAGE: case TGSI_FILE_SAMPLER_VIEW: type = TOY_TYPE_D; break; @@ -1834,7 +1834,7 @@ ra_get_src_indirect(struct toy_tgsi *tgsi, src = tsrc_null(); break; case TGSI_FILE_SAMPLER: - case TGSI_FILE_RESOURCE: + case TGSI_FILE_IMAGE: case TGSI_FILE_SAMPLER_VIEW: is_resource = true; /* fall through */ @@ -1918,7 +1918,7 @@ ra_get_src(struct toy_tgsi *tgsi, need_vrf = true; break; case TGSI_FILE_SAMPLER: - case TGSI_FILE_RESOURCE: + case TGSI_FILE_IMAGE: case TGSI_FILE_SAMPLER_VIEW: assert(!s->Register.Dimension); src = tsrc_imm_d(s->Register.Index); @@ -2256,7 +2256,7 @@ parse_declaration(struct toy_tgsi *tgsi, case TGSI_FILE_SAMPLER: case TGSI_FILE_PREDICATE: case TGSI_FILE_ADDRESS: - case TGSI_FILE_RESOURCE: + case TGSI_FILE_IMAGE: case TGSI_FILE_SAMPLER_VIEW: /* nothing to do */ break; diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h index 0a52642e395..9029d2a4180 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h +++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h @@ -63,8 +63,7 @@ enum lp_interp { LP_INTERP_LINEAR, LP_INTERP_PERSPECTIVE, LP_INTERP_POSITION, - LP_INTERP_FACING, - LP_INTERP_ZERO + LP_INTERP_FACING }; struct lp_shader_input { diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h index 9dcc102e758..62d99bbaac8 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.h +++ b/src/gallium/drivers/llvmpipe/lp_context.h @@ -108,28 +108,22 @@ struct llvmpipe_context { struct vertex_info vertex_info; /** Which vertex shader output slot contains color */ - uint8_t color_slot[2]; + int8_t color_slot[2]; /** Which vertex shader output slot contains 
bcolor */ - uint8_t bcolor_slot[2]; + int8_t bcolor_slot[2]; /** Which vertex shader output slot contains point size */ - uint8_t psize_slot; + int8_t psize_slot; /** Which vertex shader output slot contains viewport index */ - uint8_t viewport_index_slot; + int8_t viewport_index_slot; /** Which geometry shader output slot contains layer */ - uint8_t layer_slot; + int8_t layer_slot; /** A fake frontface output for unfilled primitives */ - uint8_t face_slot; - - /** Which output slot is used for the fake vp index info */ - uint8_t fake_vpindex_slot; - - /** Which output slot is used for the fake layer info */ - uint8_t fake_layer_slot; + int8_t face_slot; /** Depth format and bias settings. */ boolean floating_point_depth; diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index c19f9318006..db45cbbb057 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -115,7 +115,7 @@ struct lp_rast_plane { int32_t dcdy; /* one-pixel sized trivial reject offsets for each plane */ - int64_t eo; + uint32_t eo; }; /** diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index c9b9221d87c..232c8599e42 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -133,36 +133,8 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task, lp_rast_triangle_4(task, arg2); } -#if !defined(PIPE_ARCH_SSE) +#if defined(PIPE_ARCH_SSE) -void -lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) -{ - union lp_rast_cmd_arg arg2; - arg2.triangle.tri = arg.triangle.tri; - arg2.triangle.plane_mask = (1<<3)-1; - lp_rast_triangle_32_3(task, arg2); -} - -void -lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) -{ - union lp_rast_cmd_arg arg2; - arg2.triangle.tri = arg.triangle.tri; - arg2.triangle.plane_mask = (1<<4)-1; - 
lp_rast_triangle_32_4(task, arg2); -} - -void -lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) -{ - lp_rast_triangle_32_3_16(task, arg); -} - -#else #include <emmintrin.h> #include "util/u_sse.h" @@ -265,12 +237,6 @@ sign_bits4(const __m128i *cstep, int cdiff) #define NR_PLANES 3 - - - - - - void lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) @@ -381,10 +347,6 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, 0xffff & ~out[i].mask); } - - - - void lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) @@ -471,6 +433,254 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, } #undef NR_PLANES + +#else + +#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN) + +#include <altivec.h> +#include "util/u_pwr8.h" + +static inline void +build_masks_32(int c, + int cdiff, + int dcdx, + int dcdy, + unsigned *outmask, + unsigned *partmask) +{ + __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); + __m128i xdcdy = (__m128i) vec_splats(dcdy); + + /* Get values across the quad + */ + __m128i cstep1 = vec_add_epi32(cstep0, xdcdy); + __m128i cstep2 = vec_add_epi32(cstep1, xdcdy); + __m128i cstep3 = vec_add_epi32(cstep2, xdcdy); + + { + __m128i cstep01, cstep23, result; + + cstep01 = vec_packs_epi32(cstep0, cstep1); + cstep23 = vec_packs_epi32(cstep2, cstep3); + result = vec_packs_epi16(cstep01, cstep23); + + *outmask |= vec_movemask_epi8(result); + } + + + { + __m128i cio4 = (__m128i) vec_splats(cdiff); + __m128i cstep01, cstep23, result; + + cstep0 = vec_add_epi32(cstep0, cio4); + cstep1 = vec_add_epi32(cstep1, cio4); + cstep2 = vec_add_epi32(cstep2, cio4); + cstep3 = vec_add_epi32(cstep3, cio4); + + cstep01 = vec_packs_epi32(cstep0, cstep1); + cstep23 = vec_packs_epi32(cstep2, cstep3); + result = vec_packs_epi16(cstep01, cstep23); + + *partmask |= vec_movemask_epi8(result); + } +} + +static inline unsigned 
+build_mask_linear_32(int c, int dcdx, int dcdy) +{ + __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); + __m128i xdcdy = (__m128i) vec_splats(dcdy); + + /* Get values across the quad + */ + __m128i cstep1 = vec_add_epi32(cstep0, xdcdy); + __m128i cstep2 = vec_add_epi32(cstep1, xdcdy); + __m128i cstep3 = vec_add_epi32(cstep2, xdcdy); + + /* pack pairs of results into epi16 + */ + __m128i cstep01 = vec_packs_epi32(cstep0, cstep1); + __m128i cstep23 = vec_packs_epi32(cstep2, cstep3); + + /* pack into epi8, preserving sign bits + */ + __m128i result = vec_packs_epi16(cstep01, cstep23); + + /* extract sign bits to create mask + */ + return vec_movemask_epi8(result); +} + +static inline __m128i +lp_plane_to_m128i(const struct lp_rast_plane *plane) +{ + return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx, + (int32_t)plane->dcdy, (int32_t)plane->eo); +} + +#define NR_PLANES 3 + +void +lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_rast_triangle *tri = arg.triangle.tri; + const struct lp_rast_plane *plane = GET_PLANES(tri); + int x = (arg.triangle.plane_mask & 0xff) + task->x; + int y = (arg.triangle.plane_mask >> 8) + task->y; + unsigned i, j; + + struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; + unsigned nr = 0; + + __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */ + __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */ + __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */ + __m128i zero = vec_splats((unsigned char) 0); + + __m128i c; + __m128i dcdx; + __m128i dcdy; + __m128i rej4; + + __m128i dcdx2; + __m128i dcdx3; + + __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ + __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ + __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ + __m128i unused; + + __m128i vshuf_mask0; + __m128i vshuf_mask1; + __m128i vshuf_mask2; + +#ifdef PIPE_ARCH_LITTLE_ENDIAN + vshuf_mask0 = 
(__m128i) vec_splats((unsigned int) 0x03020100); + vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504); + vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908); +#else + vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F); + vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B); + vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607); +#endif + + transpose4_epi32(&p0, &p1, &p2, &zero, + &c, &dcdx, &dcdy, &rej4); + + /* Adjust dcdx; + */ + dcdx = vec_sub_epi32(zero, dcdx); + + c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x))); + c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y))); + rej4 = vec_slli_epi32(rej4, 2); + + /* + * Adjust so we can just check the sign bit (< 0 comparison), + * instead of having to do a less efficient <= 0 comparison + */ + c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1)); + rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1)); + + dcdx2 = vec_add_epi32(dcdx, dcdx); + dcdx3 = vec_add_epi32(dcdx2, dcdx); + + transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, + &span_0, &span_1, &span_2, &unused); + + for (i = 0; i < 4; i++) { + __m128i cx = c; + + for (j = 0; j < 4; j++) { + __m128i c4rej = vec_add_epi32(cx, rej4); + __m128i rej_masks = vec_srai_epi32(c4rej, 31); + + /* if (is_zero(rej_masks)) */ + if (vec_movemask_epi8(rej_masks) == 0) { + __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0); + __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1); + __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2); + + __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0); + + __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0)); + __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1)); + __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2)); + + __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1); + __m128i c_01 = vec_packs_epi32(c_0, c_1); + + __m128i c0_2 = 
vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0)); + __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1)); + __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2)); + + __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2); + + __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0)); + __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1)); + __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2)); + + __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3); + __m128i c_23 = vec_packs_epi32(c_2, c_3); + __m128i c_0123 = vec_packs_epi16(c_01, c_23); + + unsigned mask = vec_movemask_epi8(c_0123); + + out[nr].i = i; + out[nr].j = j; + out[nr].mask = mask; + if (mask != 0xffff) + nr++; + } + cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2)); + } + + c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2)); + } + + for (i = 0; i < nr; i++) + lp_rast_shade_quads_mask(task, + &tri->inputs, + x + 4 * out[i].j, + y + 4 * out[i].i, + 0xffff & ~out[i].mask); +} + +#undef NR_PLANES + +#else + +void +lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + union lp_rast_cmd_arg arg2; + arg2.triangle.tri = arg.triangle.tri; + arg2.triangle.plane_mask = (1<<3)-1; + lp_rast_triangle_32_3(task, arg2); +} + +#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */ + +void +lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + union lp_rast_cmd_arg arg2; + arg2.triangle.tri = arg.triangle.tri; + arg2.triangle.plane_mask = (1<<4)-1; + lp_rast_triangle_32_4(task, arg2); +} + +void +lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + lp_rast_triangle_32_3_16(task, arg); +} + #endif @@ -512,7 +722,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, #define NR_PLANES 8 #include "lp_rast_tri_tmp.h" -#ifdef PIPE_ARCH_SSE +#if defined(PIPE_ARCH_SSE) || (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)) 
#undef BUILD_MASKS #undef BUILD_MASK_LINEAR #define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h index 52f6e999683..e0aea94205e 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h @@ -82,7 +82,7 @@ TAG(do_block_16)(struct lp_rasterizer_task *task, const int64_t dcdx = -IMUL64(plane[j].dcdx, 4); const int64_t dcdy = IMUL64(plane[j].dcdy, 4); const int64_t cox = IMUL64(plane[j].eo, 4); - const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo; + const int64_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo; const int64_t cio = IMUL64(ei, 4) - 1; BUILD_MASKS(c[j] + cox, @@ -182,7 +182,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, const int64_t dcdx = -IMUL64(plane[j].dcdx, 16); const int64_t dcdy = IMUL64(plane[j].dcdy, 16); const int64_t cox = IMUL64(plane[j].eo, 16); - const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo; + const int64_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo; const int64_t cio = IMUL64(ei, 16) - 1; BUILD_MASKS(c[j] + cox, diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index 899f28da7d3..e29b008c7e8 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -301,6 +301,13 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 0; } /* should 
only get here on unhandled cases */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index ddbb88eb107..bd850519468 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -486,6 +486,11 @@ lp_setup_try_clear_zs(struct lp_setup_context *setup, depth, stencil); + /* + * XXX: should make a full mask here for things like D24X8, + * otherwise we'll do a read-modify-write clear later which + * should be unnecessary. + */ zsmask = util_pack64_mask_z_stencil(setup->fb.zsbuf->format, zmask32, smask8); diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h index 4451284c303..80acd74bddd 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_context.h +++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h @@ -105,10 +105,10 @@ struct lp_setup_context float pixel_offset; float line_width; float point_size; - uint8_t psize_slot; - uint8_t viewport_index_slot; - uint8_t layer_slot; - uint8_t face_slot; + int8_t psize_slot; + int8_t viewport_index_slot; + int8_t layer_slot; + int8_t face_slot; struct pipe_framebuffer_state fb; struct u_rect framebuffer; diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c index fac1cd61d77..a0de599c9c6 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c @@ -644,19 +644,25 @@ try_setup_line( struct lp_setup_context *setup, line->inputs.layer = layer; line->inputs.viewport_index = viewport_index; + /* + * XXX: this code is mostly identical to the one in lp_setup_tri, except it + * uses 4 planes instead of 3. Could share the code (including the sse + * assembly, in fact we'd get the 4th plane for free). + * The only difference apart from storing the 4th plane would be some + * different shuffle for calculating dcdx/dcdy. 
+ */ for (i = 0; i < 4; i++) { - /* half-edge constants, will be interated over the whole render + /* half-edge constants, will be iterated over the whole render * target. */ plane[i].c = IMUL64(plane[i].dcdx, x[i]) - IMUL64(plane[i].dcdy, y[i]); - - /* correct for top-left vs. bottom-left fill convention. - */ + /* correct for top-left vs. bottom-left fill convention. + */ if (plane[i].dcdx < 0) { /* both fill conventions want this - adjust for left edges */ - plane[i].c++; + plane[i].c++; } else if (plane[i].dcdx == 0) { if (setup->pixel_offset == 0) { diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index b1671dd0ae2..358da442ea7 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -46,6 +46,9 @@ #if defined(PIPE_ARCH_SSE) #include <emmintrin.h> +#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN) +#include <altivec.h> +#include "util/u_pwr8.h" #endif static inline int @@ -387,25 +390,21 @@ do_triangle_ccw(struct lp_setup_context *setup, plane = GET_PLANES(tri); #if defined(PIPE_ARCH_SSE) - if (setup->fb.width <= MAX_FIXED_LENGTH32 && - setup->fb.height <= MAX_FIXED_LENGTH32 && - (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 && - (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) { + if (1) { __m128i vertx, verty; __m128i shufx, shufy; - __m128i dcdx, dcdy, c; - __m128i unused; + __m128i dcdx, dcdy; + __m128i cdx02, cdx13, cdy02, cdy13, c02, c13; + __m128i c01, c23, unused; __m128i dcdx_neg_mask; __m128i dcdy_neg_mask; __m128i dcdx_zero_mask; - __m128i top_left_flag; - __m128i c_inc_mask, c_inc; + __m128i top_left_flag, c_dec; __m128i eo, p0, p1, p2; __m128i zero = _mm_setzero_si128(); - PIPE_ALIGN_VAR(16) int32_t temp_vec[4]; - vertx = _mm_loadu_si128((__m128i *)position->x); /* vertex x coords */ - verty = _mm_loadu_si128((__m128i *)position->y); /* vertex y coords */ + vertx = _mm_load_si128((__m128i *)position->x); /* vertex x coords */ + verty = 
_mm_load_si128((__m128i *)position->y); /* vertex y coords */ shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1)); shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1)); @@ -419,42 +418,161 @@ do_triangle_ccw(struct lp_setup_context *setup, top_left_flag = _mm_set1_epi32((setup->bottom_edge_rule == 0) ? ~0 : 0); - c_inc_mask = _mm_or_si128(dcdx_neg_mask, - _mm_and_si128(dcdx_zero_mask, - _mm_xor_si128(dcdy_neg_mask, - top_left_flag))); - - c_inc = _mm_srli_epi32(c_inc_mask, 31); - - c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx), - mm_mullo_epi32(dcdy, verty)); + c_dec = _mm_or_si128(dcdx_neg_mask, + _mm_and_si128(dcdx_zero_mask, + _mm_xor_si128(dcdy_neg_mask, + top_left_flag))); - c = _mm_add_epi32(c, c_inc); + /* + * 64 bit arithmetic. + * Note we need _signed_ mul (_mm_mul_epi32) which we emulate. + */ + cdx02 = mm_mullohi_epi32(dcdx, vertx, &cdx13); + cdy02 = mm_mullohi_epi32(dcdy, verty, &cdy13); + c02 = _mm_sub_epi64(cdx02, cdy02); + c13 = _mm_sub_epi64(cdx13, cdy13); + c02 = _mm_sub_epi64(c02, _mm_shuffle_epi32(c_dec, + _MM_SHUFFLE(2,2,0,0))); + c13 = _mm_sub_epi64(c13, _mm_shuffle_epi32(c_dec, + _MM_SHUFFLE(3,3,1,1))); + + /* + * Useful for very small fbs/tris (or fewer subpixel bits) only: + * c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx), + * mm_mullo_epi32(dcdy, verty)); + * + * c = _mm_sub_epi32(c, c_dec); + */ /* Scale up to match c: */ dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER); dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER); - /* Calculate trivial reject values: + /* + * Calculate trivial reject values: + * Note eo cannot overflow even if dcdx/dcdy would already have + * 31 bits (which they shouldn't have). This is because eo + * is never negative (albeit if we rely on that need to be careful...) */ eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy), _mm_and_si128(dcdx_neg_mask, dcdx)); /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */ + /* + * Pointless transpose which gets undone immediately in + * rasterization. 
+ * It is actually difficult to do away with it - would essentially + * need GET_PLANES_DX, GET_PLANES_DY etc., but the calculations + * for this then would need to depend on the number of planes. + * The transpose is quite special here due to c being 64bit... + * The store has to be unaligned (unless we'd make the plane size + * a multiple of 128), and of course storing eo separately... + */ + c01 = _mm_unpacklo_epi64(c02, c13); + c23 = _mm_unpackhi_epi64(c02, c13); + transpose2_64_2_32(&c01, &c23, &dcdx, &dcdy, + &p0, &p1, &p2, &unused); + _mm_storeu_si128((__m128i *)&plane[0], p0); + plane[0].eo = (uint32_t)_mm_cvtsi128_si32(eo); + _mm_storeu_si128((__m128i *)&plane[1], p1); + eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(3,2,0,1)); + plane[1].eo = (uint32_t)_mm_cvtsi128_si32(eo); + _mm_storeu_si128((__m128i *)&plane[2], p2); + eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(0,0,0,2)); + plane[2].eo = (uint32_t)_mm_cvtsi128_si32(eo); + } else +#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN) + /* + * XXX this code is effectively disabled for all practical purposes, + * as the allowed fb size is tiny if FIXED_ORDER is 8. 
+ */ + if (setup->fb.width <= MAX_FIXED_LENGTH32 && + setup->fb.height <= MAX_FIXED_LENGTH32 && + (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 && + (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) { + unsigned int bottom_edge; + __m128i vertx, verty; + __m128i shufx, shufy; + __m128i dcdx, dcdy, c; + __m128i unused; + __m128i dcdx_neg_mask; + __m128i dcdy_neg_mask; + __m128i dcdx_zero_mask; + __m128i top_left_flag; + __m128i c_inc_mask, c_inc; + __m128i eo, p0, p1, p2; + __m128i_union vshuf_mask; + __m128i zero = vec_splats((unsigned char) 0); + PIPE_ALIGN_VAR(16) int32_t temp_vec[4]; + +#ifdef PIPE_ARCH_LITTLE_ENDIAN + vshuf_mask.i[0] = 0x07060504; + vshuf_mask.i[1] = 0x0B0A0908; + vshuf_mask.i[2] = 0x03020100; + vshuf_mask.i[3] = 0x0F0E0D0C; +#else + vshuf_mask.i[0] = 0x00010203; + vshuf_mask.i[1] = 0x0C0D0E0F; + vshuf_mask.i[2] = 0x04050607; + vshuf_mask.i[3] = 0x08090A0B; +#endif + + /* vertex x coords */ + vertx = vec_load_si128((const uint32_t *) position->x); + /* vertex y coords */ + verty = vec_load_si128((const uint32_t *) position->y); + + shufx = vec_perm (vertx, vertx, vshuf_mask.m128i); + shufy = vec_perm (verty, verty, vshuf_mask.m128i); + + dcdx = vec_sub_epi32(verty, shufy); + dcdy = vec_sub_epi32(vertx, shufx); + + dcdx_neg_mask = vec_srai_epi32(dcdx, 31); + dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero); + dcdy_neg_mask = vec_srai_epi32(dcdy, 31); + + bottom_edge = (setup->bottom_edge_rule == 0) ? 
~0 : 0; + top_left_flag = (__m128i) vec_splats(bottom_edge); + + c_inc_mask = vec_or(dcdx_neg_mask, + vec_and(dcdx_zero_mask, + vec_xor(dcdy_neg_mask, + top_left_flag))); + + c_inc = vec_srli_epi32(c_inc_mask, 31); + + c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx), + vec_mullo_epi32(dcdy, verty)); + + c = vec_add_epi32(c, c_inc); + + /* Scale up to match c: + */ + dcdx = vec_slli_epi32(dcdx, FIXED_ORDER); + dcdy = vec_slli_epi32(dcdy, FIXED_ORDER); + + /* Calculate trivial reject values: + */ + eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy), + vec_and(dcdx_neg_mask, dcdx)); + + /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */ + /* Pointless transpose which gets undone immediately in * rasterization: */ transpose4_epi32(&c, &dcdx, &dcdy, &eo, &p0, &p1, &p2, &unused); -#define STORE_PLANE(plane, vec) do { \ - _mm_store_si128((__m128i *)&temp_vec, vec); \ - plane.c = (int64_t)temp_vec[0]; \ - plane.dcdx = temp_vec[1]; \ - plane.dcdy = temp_vec[2]; \ - plane.eo = temp_vec[3]; \ +#define STORE_PLANE(plane, vec) do { \ + vec_store_si128((uint32_t *)&temp_vec, vec); \ + plane.c = (int64_t)temp_vec[0]; \ + plane.dcdx = temp_vec[1]; \ + plane.dcdy = temp_vec[2]; \ + plane.eo = temp_vec[3]; \ } while(0) STORE_PLANE(plane[0], p0); @@ -473,17 +591,17 @@ do_triangle_ccw(struct lp_setup_context *setup, plane[2].dcdx = position->dy20; for (i = 0; i < 3; i++) { - /* half-edge constants, will be interated over the whole render + /* half-edge constants, will be iterated over the whole render * target. */ plane[i].c = IMUL64(plane[i].dcdx, position->x[i]) - - IMUL64(plane[i].dcdy, position->y[i]); + IMUL64(plane[i].dcdy, position->y[i]); /* correct for top-left vs. bottom-left fill convention. 
- */ + */ if (plane[i].dcdx < 0) { /* both fill conventions want this - adjust for left edges */ - plane[i].c++; + plane[i].c++; } else if (plane[i].dcdx == 0) { if (setup->bottom_edge_rule == 0){ @@ -517,19 +635,19 @@ do_triangle_ccw(struct lp_setup_context *setup, } if (0) { - debug_printf("p0: %"PRIx64"/%08x/%08x/%"PRIx64"\n", + debug_printf("p0: %"PRIx64"/%08x/%08x/%08x\n", plane[0].c, plane[0].dcdx, plane[0].dcdy, plane[0].eo); - - debug_printf("p1: %"PRIx64"/%08x/%08x/%"PRIx64"\n", + + debug_printf("p1: %"PRIx64"/%08x/%08x/%08x\n", plane[1].c, plane[1].dcdx, plane[1].dcdy, plane[1].eo); - - debug_printf("p2: %"PRIx64"/%08x/%08x/%"PRIx64"\n", + + debug_printf("p2: %"PRIx64"/%08x/%08x/%08x\n", plane[2].c, plane[2].dcdx, plane[2].dcdy, @@ -590,7 +708,7 @@ do_triangle_ccw(struct lp_setup_context *setup, static inline uint32_t floor_pot(uint32_t n) { -#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) +#if defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) if (n == 0) return 0; @@ -738,9 +856,9 @@ lp_setup_bin_triangle( struct lp_setup_context *setup, ei[i] = (plane[i].dcdy - plane[i].dcdx - - plane[i].eo) << TILE_ORDER; + (int64_t)plane[i].eo) << TILE_ORDER; - eo[i] = plane[i].eo << TILE_ORDER; + eo[i] = (int64_t)plane[i].eo << TILE_ORDER; xstep[i] = -(((int64_t)plane[i].dcdx) << TILE_ORDER); ystep[i] = ((int64_t)plane[i].dcdy) << TILE_ORDER; } @@ -932,12 +1050,12 @@ rotate_fixed_position_12( struct fixed_position* position ) /** * Draw triangle if it's CW, cull otherwise. 
*/ -static void triangle_cw( struct lp_setup_context *setup, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4] ) +static void triangle_cw(struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]) { - struct fixed_position position; + PIPE_ALIGN_VAR(16) struct fixed_position position; calc_fixed_position(setup, &position, v0, v1, v2); @@ -953,12 +1071,12 @@ static void triangle_cw( struct lp_setup_context *setup, } -static void triangle_ccw( struct lp_setup_context *setup, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4]) +static void triangle_ccw(struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]) { - struct fixed_position position; + PIPE_ALIGN_VAR(16) struct fixed_position position; calc_fixed_position(setup, &position, v0, v1, v2); @@ -969,12 +1087,12 @@ static void triangle_ccw( struct lp_setup_context *setup, /** * Draw triangle whether it's CW or CCW. 
*/ -static void triangle_both( struct lp_setup_context *setup, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4] ) +static void triangle_both(struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]) { - struct fixed_position position; + PIPE_ALIGN_VAR(16) struct fixed_position position; struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe; if (lp_context->active_statistics_queries && diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c index f5bcfb2b511..34961cbbac5 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_derived.c +++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c @@ -48,21 +48,26 @@ static void compute_vertex_info(struct llvmpipe_context *llvmpipe) { - const struct lp_fragment_shader *lpfs = llvmpipe->fs; + const struct tgsi_shader_info *fsInfo = &llvmpipe->fs->info.base; struct vertex_info *vinfo = &llvmpipe->vertex_info; int vs_index; uint i; draw_prepare_shader_outputs(llvmpipe->draw); - llvmpipe->color_slot[0] = 0; - llvmpipe->color_slot[1] = 0; - llvmpipe->bcolor_slot[0] = 0; - llvmpipe->bcolor_slot[1] = 0; - llvmpipe->viewport_index_slot = 0; - llvmpipe->layer_slot = 0; - llvmpipe->face_slot = 0; - llvmpipe->psize_slot = 0; + /* + * Those can't actually be 0 (because pos is always at 0). + * But use ints anyway to avoid confusion (in vs outputs, they + * can very well be at pos 0). 
+ */ + llvmpipe->color_slot[0] = -1; + llvmpipe->color_slot[1] = -1; + llvmpipe->bcolor_slot[0] = -1; + llvmpipe->bcolor_slot[1] = -1; + llvmpipe->viewport_index_slot = -1; + llvmpipe->layer_slot = -1; + llvmpipe->face_slot = -1; + llvmpipe->psize_slot = -1; /* * Match FS inputs against VS outputs, emitting the necessary @@ -73,60 +78,49 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) vinfo->num_attribs = 0; vs_index = draw_find_shader_output(llvmpipe->draw, - TGSI_SEMANTIC_POSITION, - 0); + TGSI_SEMANTIC_POSITION, 0); - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index); + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); - for (i = 0; i < lpfs->info.base.num_inputs; i++) { + for (i = 0; i < fsInfo->num_inputs; i++) { /* * Search for each input in current vs output: */ - vs_index = draw_find_shader_output(llvmpipe->draw, - lpfs->info.base.input_semantic_name[i], - lpfs->info.base.input_semantic_index[i]); + fsInfo->input_semantic_name[i], + fsInfo->input_semantic_index[i]); - if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_COLOR && - lpfs->info.base.input_semantic_index[i] < 2) { - int idx = lpfs->info.base.input_semantic_index[i]; - llvmpipe->color_slot[idx] = vinfo->num_attribs; + if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR && + fsInfo->input_semantic_index[i] < 2) { + int idx = fsInfo->input_semantic_index[i]; + llvmpipe->color_slot[idx] = (int)vinfo->num_attribs; } - if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_FACE) { - llvmpipe->face_slot = vinfo->num_attribs; - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); - } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_PRIMID) { - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_FACE) { + llvmpipe->face_slot = (int)vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); /* * For vp index and layer, if the fs requires them but the vs 
doesn't - * provide them, store the slot - we'll later replace the data directly - * with zero (as required by ARB_fragment_layer_viewport). This is - * because draw itself just redirects them to whatever was at output 0. - * We'll also store the real vpindex/layer slot for setup use. + * provide them, draw (vbuf) will give us the required 0 (slot -1). + * (This means in this case we'll also use those slots in setup, which + * isn't necessary but they'll contain the correct (0) value.) */ - } else if (lpfs->info.base.input_semantic_name[i] == + } else if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX) { - if (vs_index >= 0) { - llvmpipe->viewport_index_slot = vinfo->num_attribs; - } - else { - llvmpipe->fake_vpindex_slot = vinfo->num_attribs; - } - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); - } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_LAYER) { - if (vs_index >= 0) { - llvmpipe->layer_slot = vinfo->num_attribs; - } - else { - llvmpipe->fake_layer_slot = vinfo->num_attribs; - } - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + llvmpipe->viewport_index_slot = (int)vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); + } else if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_LAYER) { + llvmpipe->layer_slot = (int)vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); } else { /* - * Emit the requested fs attribute for all but position. + * Note that we'd actually want to skip position (as we won't use + * the attribute in the fs) but can't. The reason is that we don't + * actually have a input/output map for setup (even though it looks + * like we do...). Could adjust for this though even without a map + * (in llvmpipe_create_fs_state()). 
*/ - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index); + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); } } @@ -137,8 +131,8 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) TGSI_SEMANTIC_BCOLOR, i); if (vs_index >= 0) { - llvmpipe->bcolor_slot[i] = vinfo->num_attribs; - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index); + llvmpipe->bcolor_slot[i] = (int)vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); } } @@ -148,29 +142,29 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) TGSI_SEMANTIC_PSIZE, 0); if (vs_index >= 0) { - llvmpipe->psize_slot = vinfo->num_attribs; - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + llvmpipe->psize_slot = (int)vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); } /* Figure out if we need viewport index (if it wasn't already in fs input) */ - if (llvmpipe->viewport_index_slot == 0) { + if (llvmpipe->viewport_index_slot < 0) { vs_index = draw_find_shader_output(llvmpipe->draw, TGSI_SEMANTIC_VIEWPORT_INDEX, 0); if (vs_index >= 0) { - llvmpipe->viewport_index_slot = vinfo->num_attribs; - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + llvmpipe->viewport_index_slot =(int)vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); } } /* Figure out if we need layer (if it wasn't already in fs input) */ - if (llvmpipe->layer_slot == 0) { + if (llvmpipe->layer_slot < 0) { vs_index = draw_find_shader_output(llvmpipe->draw, TGSI_SEMANTIC_LAYER, 0); if (vs_index >= 0) { - llvmpipe->layer_slot = vinfo->num_attribs; - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + llvmpipe->layer_slot = (int)vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); } } @@ -197,10 +191,9 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe ) llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW; } - if (llvmpipe->dirty & (LP_NEW_RASTERIZER | - LP_NEW_FS | + if (llvmpipe->dirty & 
(LP_NEW_FS | LP_NEW_VS)) - compute_vertex_info( llvmpipe ); + compute_vertex_info(llvmpipe); if (llvmpipe->dirty & (LP_NEW_FS | LP_NEW_FRAMEBUFFER | diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index 079083e9601..83ff97659fb 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -2695,34 +2695,35 @@ llvmpipe_create_fs_state(struct pipe_context *pipe, switch (shader->info.base.input_interpolate[i]) { case TGSI_INTERPOLATE_CONSTANT: - shader->inputs[i].interp = LP_INTERP_CONSTANT; - break; + shader->inputs[i].interp = LP_INTERP_CONSTANT; + break; case TGSI_INTERPOLATE_LINEAR: - shader->inputs[i].interp = LP_INTERP_LINEAR; - break; + shader->inputs[i].interp = LP_INTERP_LINEAR; + break; case TGSI_INTERPOLATE_PERSPECTIVE: - shader->inputs[i].interp = LP_INTERP_PERSPECTIVE; - break; + shader->inputs[i].interp = LP_INTERP_PERSPECTIVE; + break; case TGSI_INTERPOLATE_COLOR: - shader->inputs[i].interp = LP_INTERP_COLOR; - break; + shader->inputs[i].interp = LP_INTERP_COLOR; + break; default: - assert(0); - break; + assert(0); + break; } switch (shader->info.base.input_semantic_name[i]) { case TGSI_SEMANTIC_FACE: - shader->inputs[i].interp = LP_INTERP_FACING; - break; + shader->inputs[i].interp = LP_INTERP_FACING; + break; case TGSI_SEMANTIC_POSITION: - /* Position was already emitted above - */ - shader->inputs[i].interp = LP_INTERP_POSITION; - shader->inputs[i].src_index = 0; - continue; + /* Position was already emitted above + */ + shader->inputs[i].interp = LP_INTERP_POSITION; + shader->inputs[i].src_index = 0; + continue; } + /* XXX this is a completely pointless index map... 
*/ shader->inputs[i].src_index = i+1; } diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c index d7ba5c8ad8e..6a4fbbbf202 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c @@ -372,9 +372,9 @@ load_attribute(struct gallivm_state *gallivm, /* Potentially modify it according to twoside, etc: */ if (key->twoside) { - if (vert_attr == key->color_slot && key->bcolor_slot > 0) + if (vert_attr == key->color_slot && key->bcolor_slot >= 0) lp_twoside(gallivm, args, key, key->bcolor_slot, attribv); - else if (vert_attr == key->spec_slot && key->bspec_slot > 0) + else if (vert_attr == key->spec_slot && key->bspec_slot >= 0) lp_twoside(gallivm, args, key, key->bspec_slot, attribv); } } @@ -602,13 +602,6 @@ emit_tri_coef( struct gallivm_state *gallivm, */ break; - case LP_INTERP_ZERO: - /* - * The information we get from the output is bogus, replace it - * with zero. - */ - emit_constant_coef4(gallivm, args, slot+1, args->bld.zero); - break; case LP_INTERP_FACING: emit_facing_coef(gallivm, args, slot+1); break; @@ -879,13 +872,7 @@ lp_make_setup_variant_key(struct llvmpipe_context *lp, key->pad = 0; memcpy(key->inputs, fs->inputs, key->num_inputs * sizeof key->inputs[0]); for (i = 0; i < key->num_inputs; i++) { - if (key->inputs[i].interp == LP_INTERP_CONSTANT) { - if (key->inputs[i].src_index == lp->fake_vpindex_slot || - key->inputs[i].src_index == lp->fake_layer_slot) { - key->inputs[i].interp = LP_INTERP_ZERO; - } - } - else if (key->inputs[i].interp == LP_INTERP_COLOR) { + if (key->inputs[i].interp == LP_INTERP_COLOR) { if (lp->rasterizer->flatshade) key->inputs[i].interp = LP_INTERP_CONSTANT; else diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h b/src/gallium/drivers/llvmpipe/lp_state_setup.h index 6cee6fe5eb5..9ad244482de 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_setup.h +++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h @@ -17,10 +17,10 
@@ struct lp_setup_variant_list_item struct lp_setup_variant_key { unsigned size:16; unsigned num_inputs:8; - unsigned color_slot:8; - unsigned bcolor_slot:8; - unsigned spec_slot:8; - unsigned bspec_slot:8; + int color_slot:8; + int bcolor_slot:8; + int spec_slot:8; + int bspec_slot:8; unsigned flatshade_first:1; unsigned pixel_center_half:1; unsigned twoside:1; diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c index 7b19174f345..9139b83f05a 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_blend.c +++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c @@ -184,7 +184,7 @@ add_blend_test(struct gallivm_state *gallivm, LLVMBuildStore(builder, res, res_ptr); - LLVMBuildRetVoid(builder);; + LLVMBuildRetVoid(builder); gallivm_verify_function(gallivm, func); diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c index a30f35c8149..02a63193af5 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_conv.c +++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c @@ -140,7 +140,7 @@ add_conv_test(struct gallivm_state *gallivm, LLVMBuildStore(builder, dst[i], ptr); } - LLVMBuildRetVoid(builder);; + LLVMBuildRetVoid(builder); gallivm_verify_function(gallivm, func); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index d09a0ab0610..d1fdd75495f 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -390,6 +390,9 @@ enum SVSemantic SV_VERTEX_STRIDE, SV_INVOCATION_INFO, SV_THREAD_KILL, + SV_BASEVERTEX, + SV_BASEINSTANCE, + SV_DRAWID, SV_UNDEFINED, SV_LAST }; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h index b49bf9d53bc..4504240ac5e 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h @@ -124,6 +124,7 @@ struct nv50_ir_prog_info 
union { struct { uint32_t inputMask[4]; /* mask of attributes read (1 bit per scalar) */ + bool usesDrawParameters; } vp; struct { uint8_t inputPatchSize; @@ -160,8 +161,9 @@ struct nv50_ir_prog_info uint8_t clipDistances; /* number of clip distance outputs */ uint8_t cullDistances; /* number of cull distance outputs */ int8_t genUserClip; /* request user clip planes for ClipVertex */ + uint8_t auxCBSlot; /* constant buffer index of UCP/draw data */ uint16_t ucpBase; /* base address for UCPs */ - uint8_t ucpCBSlot; /* constant buffer index of UCP data */ + uint16_t drawInfoBase; /* base address for draw parameters */ uint8_t pointSize; /* output index for PointSize */ uint8_t instanceId; /* system value index of InstanceID */ uint8_t vertexId; /* system value index of VertexID */ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index e9ddd366391..ec74e7ac811 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -740,6 +740,7 @@ CodeEmitterGM107::emitF2F() emitCC (0x2f); emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg()); emitFMZ (0x2c, 1); + emitField(0x29, 1, insn->subOp); emitRND (0x27, rnd, 0x2a); emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType))); emitField(0x08, 2, util_logbase2(typeSizeof(insn->dType))); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index 1d4f0d92f6b..0b28047e22b 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -1030,7 +1030,10 @@ CodeEmitterNVC0::emitCVT(Instruction *i) // for 8/16 source types, the byte/word is in subOp. word 1 is // represented as 2. 
- code[1] |= i->subOp << 0x17; + if (!isFloatType(i->sType)) + code[1] |= i->subOp << 0x17; + else + code[1] |= i->subOp << 0x18; if (sat) code[0] |= 0x20; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index b23386040a7..7b313f3c39c 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -319,6 +319,10 @@ unsigned int Instruction::srcMask(unsigned int s) const x |= 2; return x; } + case TGSI_OPCODE_PK2H: + return 0x3; + case TGSI_OPCODE_UP2H: + return 0x1; default: break; } @@ -348,7 +352,7 @@ static nv50_ir::DataFile translateFile(uint file) case TGSI_FILE_PREDICATE: return nv50_ir::FILE_PREDICATE; case TGSI_FILE_IMMEDIATE: return nv50_ir::FILE_IMMEDIATE; case TGSI_FILE_SYSTEM_VALUE: return nv50_ir::FILE_SYSTEM_VALUE; - case TGSI_FILE_RESOURCE: return nv50_ir::FILE_MEMORY_GLOBAL; + //case TGSI_FILE_RESOURCE: return nv50_ir::FILE_MEMORY_GLOBAL; case TGSI_FILE_SAMPLER: case TGSI_FILE_NULL: default: @@ -377,6 +381,9 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval) case TGSI_SEMANTIC_TESSINNER: return nv50_ir::SV_TESS_INNER; case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT; case TGSI_SEMANTIC_HELPER_INVOCATION: return nv50_ir::SV_THREAD_KILL; + case TGSI_SEMANTIC_BASEVERTEX: return nv50_ir::SV_BASEVERTEX; + case TGSI_SEMANTIC_BASEINSTANCE: return nv50_ir::SV_BASEINSTANCE; + case TGSI_SEMANTIC_DRAWID: return nv50_ir::SV_DRAWID; default: assert(0); return nv50_ir::SV_CLOCK; @@ -449,6 +456,7 @@ nv50_ir::DataType Instruction::inferSrcType() const case TGSI_OPCODE_ATOMUMAX: case TGSI_OPCODE_UBFE: case TGSI_OPCODE_UMSB: + case TGSI_OPCODE_UP2H: return nv50_ir::TYPE_U32; case TGSI_OPCODE_I2F: case TGSI_OPCODE_I2D: @@ -513,10 +521,12 @@ nv50_ir::DataType Instruction::inferDstType() const case TGSI_OPCODE_DSGE: case TGSI_OPCODE_DSLT: case TGSI_OPCODE_DSNE: + case TGSI_OPCODE_PK2H: return 
nv50_ir::TYPE_U32; case TGSI_OPCODE_I2F: case TGSI_OPCODE_U2F: case TGSI_OPCODE_D2F: + case TGSI_OPCODE_UP2H: return nv50_ir::TYPE_F32; case TGSI_OPCODE_I2D: case TGSI_OPCODE_U2D: @@ -861,7 +871,7 @@ bool Source::scanSource() clipVertexOutput = -1; textureViews.resize(scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1); - resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1); + //resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1); info->immd.bufSize = 0; @@ -1128,6 +1138,11 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) case TGSI_SEMANTIC_SAMPLEPOS: info->prop.fp.sampleInterp = 1; break; + case TGSI_SEMANTIC_BASEVERTEX: + case TGSI_SEMANTIC_BASEINSTANCE: + case TGSI_SEMANTIC_DRAWID: + info->prop.vp.usesDrawParameters = true; + break; default: break; } @@ -1144,6 +1159,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) } } break; +/* case TGSI_FILE_RESOURCE: for (i = first; i <= last; ++i) { resources[i].target = decl->Resource.Resource; @@ -1151,6 +1167,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) resources[i].slot = i; } break; +*/ case TGSI_FILE_SAMPLER_VIEW: for (i = first; i <= last; ++i) textureViews[i].target = decl->SamplerView.Resource; @@ -1216,11 +1233,13 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) if (src.isIndirect(0)) mainTempsInLMem = true; } else +/* if (src.getFile() == TGSI_FILE_RESOURCE) { if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL) info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ? 
0x1 : 0x2; } else +*/ if (src.getFile() == TGSI_FILE_OUTPUT) { if (src.isIndirect(0)) { // We don't know which one is accessed, just mark everything for @@ -1271,9 +1290,11 @@ Instruction::getTexture(const tgsi::Source *code, int s) const unsigned int r; switch (getSrc(s).getFile()) { +/* case TGSI_FILE_RESOURCE: r = getSrc(s).getIndex(0); return translateTexture(code->resources.at(r).target); +*/ case TGSI_FILE_SAMPLER_VIEW: r = getSrc(s).getIndex(0); return translateTexture(code->textureViews.at(r).target); @@ -1639,8 +1660,6 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) // don't load masked inputs, won't be assigned a slot if (!ptr && !(info->in[idx].mask & (1 << swz))) return loadImm(NULL, swz == TGSI_SWIZZLE_W ? 1.0f : 0.0f); - if (!ptr && info->in[idx].sn == TGSI_SEMANTIC_FACE) - return mkOp1v(OP_RDSV, TYPE_F32, getSSA(), mkSysVal(SV_FACE, 0)); return interpolate(src, c, shiftAddress(ptr)); } else if (prog->getType() == Program::TYPE_GEOMETRY) { @@ -1681,7 +1700,7 @@ Converter::acquireDst(int d, int c) const int idx = dst.getIndex(0); const int idx2d = dst.is2D() ? 
dst.getIndex(1) : 0; - if (dst.isMasked(c) || f == TGSI_FILE_RESOURCE) + if (dst.isMasked(c)/* || f == TGSI_FILE_RESOURCE*/) return NULL; if (dst.isIndirect(0) || @@ -2799,6 +2818,21 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c)); break; + case TGSI_OPCODE_PK2H: + val0 = getScratch(); + val1 = getScratch(); + mkCvt(OP_CVT, TYPE_F16, val0, TYPE_F32, fetchSrc(0, 0)); + mkCvt(OP_CVT, TYPE_F16, val1, TYPE_F32, fetchSrc(0, 1)); + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkOp3(OP_INSBF, TYPE_U32, dst0[c], val1, mkImm(0x1010), val0); + break; + case TGSI_OPCODE_UP2H: + src0 = fetchSrc(0, 0); + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + geni = mkCvt(OP_CVT, TYPE_F32, dst0[c], TYPE_F16, src0); + geni->subOp = c & 1; + } + break; case TGSI_OPCODE_EMIT: /* export the saved viewport index */ if (viewport != NULL) { @@ -3252,7 +3286,7 @@ Converter::handleUserClipPlanes() for (c = 0; c < 4; ++c) { for (i = 0; i < info->io.genUserClip; ++i) { - Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.ucpCBSlot, + Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.auxCBSlot, TYPE_F32, info->io.ucpBase + i * 16 + c * 4); Value *ucp = mkLoadv(TYPE_F32, sym, NULL); if (c == 0) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index e67bf3eca84..6530078b938 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -1576,6 +1576,17 @@ NVC0LoweringPass::handleRDSV(Instruction *i) ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0)); ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK; break; + case SV_BASEVERTEX: + case SV_BASEINSTANCE: + case SV_DRAWID: + ld = bld.mkLoad(TYPE_U32, i->getDef(0), + bld.mkSymbol(FILE_MEMORY_CONST, + prog->driver->io.auxCBSlot, + TYPE_U32, + 
prog->driver->io.drawInfoBase + + 4 * (sv - SV_BASEVERTEX)), + NULL); + break; default: if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch) vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0)); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index c2842c2186f..f5c590eef10 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -676,23 +676,22 @@ ConstantFolding::expr(Instruction *i, switch (i->op) { case OP_MAD: case OP_FMA: { - i->op = OP_ADD; + ImmediateValue src0, src1 = *i->getSrc(0)->asImm(); - /* Move the immediate to the second arg, otherwise the ADD operation - * won't be emittable - */ - i->setSrc(1, i->getSrc(0)); + // Move the immediate into position 1, where we know it might be + // emittable. However it might not be anyways, as there may be other + // restrictions, so move it into a separate LValue. 
+ bld.setPosition(i, false); + i->op = OP_ADD; + i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0)); i->setSrc(0, i->getSrc(2)); i->src(0).mod = i->src(2).mod; i->setSrc(2, NULL); - ImmediateValue src0; if (i->src(0).getImmediate(src0)) - expr(i, src0, *i->getSrc(1)->asImm()); - if (i->saturate && !prog->getTarget()->isSatSupported(i)) { - bld.setPosition(i, false); - i->setSrc(1, bld.loadImm(NULL, res.data.u32)); - } + expr(i, src0, src1); + else + opnd(i, src1, 1); break; } case OP_PFETCH: diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp index 19637ce33f5..014c652eede 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -295,6 +295,9 @@ TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const case SV_SAMPLE_INDEX: return 0; case SV_SAMPLE_POS: return 0; case SV_SAMPLE_MASK: return 0; + case SV_BASEVERTEX: return 0; + case SV_BASEINSTANCE: return 0; + case SV_DRAWID: return 0; default: return 0xffffffff; } diff --git a/src/gallium/drivers/nouveau/nouveau_compiler.c b/src/gallium/drivers/nouveau/nouveau_compiler.c index 670b0c8b135..cd44aa1e1d9 100644 --- a/src/gallium/drivers/nouveau/nouveau_compiler.c +++ b/src/gallium/drivers/nouveau/nouveau_compiler.c @@ -112,7 +112,7 @@ nouveau_codegen(int chipset, int type, struct tgsi_token tokens[], info.bin.sourceRep = NV50_PROGRAM_IR_TGSI; info.bin.source = tokens; - info.io.ucpCBSlot = 15; + info.io.auxCBSlot = 15; info.io.ucpBase = NV50_CB_AUX_UCP_OFFSET; info.io.resInfoCBSlot = 15; diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video.h b/src/gallium/drivers/nouveau/nouveau_vp3_video.h index 58df5ee847f..809e971a678 100644 --- a/src/gallium/drivers/nouveau/nouveau_vp3_video.h +++ b/src/gallium/drivers/nouveau/nouveau_vp3_video.h @@ -114,6 +114,11 @@ struct nouveau_vp3_decoder { unsigned fence_seq, fw_sizes, 
last_frame_num, tmp_stride, ref_stride; unsigned bsp_idx, vp_idx, ppp_idx; + + /* End of the bsp bo where new data should be appended between one begin/end + * frame. + */ + char *bsp_ptr; }; struct comm { @@ -208,11 +213,15 @@ nouveau_vp3_load_firmware(struct nouveau_vp3_decoder *dec, enum pipe_video_profile profile, unsigned chipset); +void +nouveau_vp3_bsp_begin(struct nouveau_vp3_decoder *dec); + +void +nouveau_vp3_bsp_next(struct nouveau_vp3_decoder *dec, unsigned num_buffers, + const void *const *data, const unsigned *num_bytes); + uint32_t -nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, - struct nouveau_vp3_video_buffer *target, - unsigned comm_seq, unsigned num_buffers, - const void *const *data, const unsigned *num_bytes); +nouveau_vp3_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc); void nouveau_vp3_vp_caps(struct nouveau_vp3_decoder *dec, union pipe_desc desc, diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c b/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c index 692772e49d1..a3d07deeb18 100644 --- a/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c +++ b/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c @@ -230,20 +230,58 @@ nouveau_vp3_fill_picparm_h264_bsp(struct nouveau_vp3_decoder *dec, return caps | 3; } +static inline struct strparm_bsp *strparm_bsp(struct nouveau_vp3_decoder *dec) +{ + unsigned comm_seq = dec->fence_seq; + struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH]; + return (struct strparm_bsp *)(bsp_bo->map + 0x100); +} + +void +nouveau_vp3_bsp_begin(struct nouveau_vp3_decoder *dec) +{ + struct strparm_bsp *str_bsp = strparm_bsp(dec); + + dec->bsp_ptr = (void *)str_bsp; + memset(str_bsp, 0, 0x80); + dec->bsp_ptr += 0x100; + /* Reserved for picparm_vp */ + dec->bsp_ptr += 0x300; + /* Reserved for comm */ +#if !NOUVEAU_VP3_DEBUG_FENCE + memset(dec->bsp_ptr, 0, 0x200); +#endif + dec->bsp_ptr += 0x200; +} + +void +nouveau_vp3_bsp_next(struct 
nouveau_vp3_decoder *dec, unsigned num_buffers, + const void *const *data, const unsigned *num_bytes) +{ +#ifndef NDEBUG + unsigned comm_seq = dec->fence_seq; + struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH]; +#endif + struct strparm_bsp *str_bsp = strparm_bsp(dec); + int i; + + for (i = 0; i < num_buffers; ++i) { + assert(bsp_bo->size >= str_bsp->w0[0] + num_bytes[i]); + memcpy(dec->bsp_ptr, data[i], num_bytes[i]); + dec->bsp_ptr += num_bytes[i]; + str_bsp->w0[0] += num_bytes[i]; + } +} + uint32_t -nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, - struct nouveau_vp3_video_buffer *target, - unsigned comm_seq, unsigned num_buffers, - const void *const *data, const unsigned *num_bytes) +nouveau_vp3_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc) { enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile); + unsigned comm_seq = dec->fence_seq; struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH]; - char *bsp; uint32_t endmarker, caps; - struct strparm_bsp *str_bsp; - int i; - - bsp = bsp_bo->map; + struct strparm_bsp *str_bsp = strparm_bsp(dec); + char *bsp = bsp_bo->map; /* * 0x000..0x100: picparm_bsp * 0x200..0x500: picparm_vp @@ -277,34 +315,21 @@ nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, caps |= 1 << 17; // enable watchdog caps |= 0 << 18; // do not report error to VP, so it can continue decoding what we have caps |= 0 << 19; // if enabled, use crypto crap? 
- bsp += 0x100; - str_bsp = (struct strparm_bsp *)bsp; - memset(str_bsp, 0, 0x80); - str_bsp->w0[0] = 16; + str_bsp = strparm_bsp(dec); str_bsp->w1[0] = 0x1; - bsp += 0x100; - /* Reserved for picparm_vp */ - bsp += 0x300; - /* Reserved for comm */ -#if !NOUVEAU_VP3_DEBUG_FENCE - memset(bsp, 0, 0x200); -#endif - bsp += 0x200; - for (i = 0; i < num_buffers; ++i) { - memcpy(bsp, data[i], num_bytes[i]); - bsp += num_bytes[i]; - str_bsp->w0[0] += num_bytes[i]; - } /* Append end sequence */ - *(uint32_t *)bsp = endmarker; - bsp += 4; - *(uint32_t *)bsp = 0x00000000; - bsp += 4; - *(uint32_t *)bsp = endmarker; - bsp += 4; - *(uint32_t *)bsp = 0x00000000; + *(uint32_t *)dec->bsp_ptr = endmarker; + dec->bsp_ptr += 4; + *(uint32_t *)dec->bsp_ptr = 0x00000000; + dec->bsp_ptr += 4; + *(uint32_t *)dec->bsp_ptr = endmarker; + dec->bsp_ptr += 4; + *(uint32_t *)dec->bsp_ptr = 0x00000000; + str_bsp->w0[0] += 16; + + dec->bsp_ptr = NULL; return caps; } diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c index 098d6e499fa..7b0d0745766 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_draw.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c @@ -208,17 +208,16 @@ nv30_render_release_vertices(struct vbuf_render *render) static const struct { unsigned emit; - unsigned interp; unsigned vp30; unsigned vp40; unsigned ow40; } vroute [] = { - [TGSI_SEMANTIC_POSITION] = { EMIT_4F, INTERP_PERSPECTIVE, 0, 0, 0x00000000 }, - [TGSI_SEMANTIC_COLOR ] = { EMIT_4F, INTERP_LINEAR , 3, 1, 0x00000001 }, - [TGSI_SEMANTIC_BCOLOR ] = { EMIT_4F, INTERP_LINEAR , 1, 3, 0x00000004 }, - [TGSI_SEMANTIC_FOG ] = { EMIT_4F, INTERP_PERSPECTIVE, 5, 5, 0x00000010 }, - [TGSI_SEMANTIC_PSIZE ] = { EMIT_1F_PSIZE, INTERP_POS , 6, 6, 0x00000020 }, - [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 }, + [TGSI_SEMANTIC_POSITION] = { EMIT_4F, 0, 0, 0x00000000 }, + [TGSI_SEMANTIC_COLOR ] = { EMIT_4F, 3, 1, 0x00000001 }, + [TGSI_SEMANTIC_BCOLOR ] = { 
EMIT_4F, 1, 3, 0x00000004 }, + [TGSI_SEMANTIC_FOG ] = { EMIT_4F, 5, 5, 0x00000010 }, + [TGSI_SEMANTIC_PSIZE ] = { EMIT_1F_PSIZE, 6, 6, 0x00000020 }, + [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, 8, 7, 0x00004000 }, }; static bool @@ -247,7 +246,7 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx) if (emit == EMIT_OMIT) return false; - draw_emit_vertex_attr(vinfo, emit, vroute[sem].interp, attrib); + draw_emit_vertex_attr(vinfo, emit, attrib); format = draw_translate_vinfo_format(emit); r->vtxfmt[attrib] = nv30_vtxfmt(&screen->base.base, format)->hw; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index 854f70cf34c..d9c940232c4 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -157,6 +157,8 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_USER_VERTEX_BUFFERS: case PIPE_CAP_COMPUTE: case PIPE_CAP_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: case PIPE_CAP_SAMPLER_VIEW_TARGET: @@ -174,6 +176,11 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 0; case PIPE_CAP_VENDOR_ID: @@ -265,6 +272,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return 0; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: 
return 32; @@ -308,6 +316,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return 0; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h index 2cebcd99423..712d00ed2d3 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_context.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h @@ -134,9 +134,11 @@ struct nv50_context { struct nv50_constbuf constbuf[3][NV50_MAX_PIPE_CONSTBUFS]; uint16_t constbuf_dirty[3]; uint16_t constbuf_valid[3]; + uint16_t constbuf_coherent[3]; struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; unsigned num_vtxbufs; + uint32_t vtxbufs_coherent; struct pipe_index_buffer idxbuf; uint32_t vbo_fifo; /* bitmask of vertex elements to be pushed to FIFO */ uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */ @@ -148,6 +150,7 @@ struct nv50_context { struct pipe_sampler_view *textures[3][PIPE_MAX_SAMPLERS]; unsigned num_textures[3]; + uint32_t textures_coherent[3]; struct nv50_tsc_entry *samplers[3][PIPE_MAX_SAMPLERS]; unsigned num_samplers[3]; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c index a4b8ddfda95..888d62e1c52 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_program.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c @@ -148,7 +148,6 @@ nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info) for (m = 0, i = 0; i < info->numInputs; ++i) { switch (info->in[i].sn) { case TGSI_SEMANTIC_POSITION: - case TGSI_SEMANTIC_FACE: continue; default: m += info->in[i].flat ? 
0 : 1; @@ -166,9 +165,6 @@ nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info) for (c = 0; c < 4; ++c) if (info->in[i].mask & (1 << c)) info->in[i].slot[c] = nintp++; - } else - if (info->in[i].sn == TGSI_SEMANTIC_FACE) { - info->in[i].slot[0] = 255; } else { unsigned j = info->in[i].flat ? m++ : n++; @@ -335,7 +331,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset, info->bin.sourceRep = NV50_PROGRAM_IR_TGSI; info->bin.source = (void *)prog->pipe.tokens; - info->io.ucpCBSlot = 15; + info->io.auxCBSlot = 15; info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET; info->io.genUserClip = prog->vp.clpd_nr; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 272e1d45bff..56c67e0ddfb 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -182,6 +182,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_CLEAR_TEXTURE: case PIPE_CAP_COMPUTE: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP: return 1; /* class_3d >= NVA0_3D_CLASS; */ @@ -212,11 +213,17 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_GATHER_OFFSETS: case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: case PIPE_CAP_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_VERTEXID_NOBASE: case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */ case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 0; case PIPE_CAP_VENDOR_ID: @@ -300,6 +307,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, 
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return 0; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c index de655971b66..cb040439139 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c @@ -664,6 +664,17 @@ nv50_stage_set_sampler_views(struct nv50_context *nv50, int s, if (old) nv50_screen_tic_unlock(nv50->screen, old); + if (views[i] && views[i]->texture) { + struct pipe_resource *res = views[i]->texture; + if (res->target == PIPE_BUFFER && + (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)) + nv50->textures_coherent[s] |= 1 << i; + else + nv50->textures_coherent[s] &= ~(1 << i); + } else { + nv50->textures_coherent[s] &= ~(1 << i); + } + pipe_sampler_view_reference(&nv50->textures[s][i], views[i]); } @@ -847,13 +858,19 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, nv50->constbuf[s][i].u.data = cb->user_buffer; nv50->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000); nv50->constbuf_valid[s] |= 1 << i; + nv50->constbuf_coherent[s] &= ~(1 << i); } else if (res) { nv50->constbuf[s][i].offset = cb->buffer_offset; nv50->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000); nv50->constbuf_valid[s] |= 1 << i; + if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) + nv50->constbuf_coherent[s] |= 1 << i; + else + nv50->constbuf_coherent[s] &= ~(1 << i); } else { nv50->constbuf_valid[s] &= ~(1 << i); + nv50->constbuf_coherent[s] &= ~(1 << i); } nv50->constbuf_dirty[s] |= 1 << i; @@ -1003,6 +1020,7 @@ nv50_set_vertex_buffers(struct pipe_context *pipe, if (!vb) { nv50->vbo_user &= ~(((1ull << count) - 1) << start_slot); nv50->vbo_constant &= ~(((1ull << count) - 1) << start_slot); + nv50->vtxbufs_coherent &= ~(((1ull 
<< count) - 1) << start_slot); return; } @@ -1015,9 +1033,16 @@ nv50_set_vertex_buffers(struct pipe_context *pipe, nv50->vbo_constant |= 1 << dst_index; else nv50->vbo_constant &= ~(1 << dst_index); + nv50->vtxbufs_coherent &= ~(1 << dst_index); } else { nv50->vbo_user &= ~(1 << dst_index); nv50->vbo_constant &= ~(1 << dst_index); + + if (vb[i].buffer && + vb[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) + nv50->vtxbufs_coherent |= (1 << dst_index); + else + nv50->vtxbufs_coherent &= ~(1 << dst_index); } } } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c index 7de2f1f1d0f..60fa2bc06a8 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c @@ -636,8 +636,8 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten, BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1); PUSH_DATA (push, prim); - PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain); nouveau_pushbuf_space(push, 8, 0, 1); + PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain); switch (index_size) { case 4: @@ -765,7 +765,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) struct nv50_context *nv50 = nv50_context(pipe); struct nouveau_pushbuf *push = nv50->base.pushbuf; bool tex_dirty = false; - int i, s; + int s; /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */ nv50->vb_elt_first = info->min_index + info->index_bias; @@ -794,27 +794,9 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) push->kick_notify = nv50_draw_vbo_kick_notify; - /* TODO: Instead of iterating over all the buffer resources looking for - * coherent buffers, keep track of a context-wide count. 
- */ for (s = 0; s < 3 && !nv50->cb_dirty; ++s) { - uint32_t valid = nv50->constbuf_valid[s]; - - while (valid && !nv50->cb_dirty) { - const unsigned i = ffs(valid) - 1; - struct pipe_resource *res; - - valid &= ~(1 << i); - if (nv50->constbuf[s][i].user) - continue; - - res = nv50->constbuf[s][i].u.buf; - if (!res) - continue; - - if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nv50->cb_dirty = true; - } + if (nv50->constbuf_coherent[s]) + nv50->cb_dirty = true; } /* If there are any coherent constbufs, flush the cache */ @@ -825,15 +807,10 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) } for (s = 0; s < 3 && !tex_dirty; ++s) { - for (i = 0; i < nv50->num_textures[s] && !tex_dirty; ++i) { - if (!nv50->textures[s][i] || - nv50->textures[s][i]->texture->target != PIPE_BUFFER) - continue; - if (nv50->textures[s][i]->texture->flags & - PIPE_RESOURCE_FLAG_MAP_COHERENT) - tex_dirty = true; - } + if (nv50->textures_coherent[s]) + tex_dirty = true; } + if (tex_dirty) { BEGIN_NV04(push, NV50_3D(TEX_CACHE_CTL), 1); PUSH_DATA (push, 0x20); @@ -853,12 +830,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) PUSH_DATA (push, info->start_instance); } - for (i = 0; i < nv50->num_vtxbufs && !nv50->base.vbo_dirty; ++i) { - if (!nv50->vtxbuf[i].buffer) - continue; - if (nv50->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nv50->base.vbo_dirty = true; - } + nv50->base.vbo_dirty |= !!nv50->vtxbufs_coherent; if (nv50->base.vbo_dirty) { BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1); diff --git a/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c index dbde1bfcebe..4fe0e05c96b 100644 --- a/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c +++ b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c @@ -77,7 +77,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, bsp_size += (1 << 20) - 1; bsp_size &= ~((1 << 20) - 1); - ret = 
nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_size, NULL, &tmp_bo); + ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_size, NULL, &tmp_bo); if (ret) { debug_printf("reallocating bsp %u -> %u failed with %i\n", bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret); @@ -90,7 +90,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, if (!inter_bo || bsp_bo->size * 4 > inter_bo->size) { struct nouveau_bo *tmp_bo = NULL; - ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, NULL, &tmp_bo); + ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, NULL, &tmp_bo); if (ret) { debug_printf("reallocating inter %u -> %u failed with %i\n", inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret); @@ -106,8 +106,9 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, return -1; } - caps = nouveau_vp3_bsp(dec, desc, target, comm_seq, - num_buffers, data, num_bytes); + nouveau_vp3_bsp_begin(dec); + nouveau_vp3_bsp_next(dec, num_buffers, data, num_bytes); + caps = nouveau_vp3_bsp_end(dec, desc); nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs); diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme index b2060d1fa53..4daa57d47bb 100644 --- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme +++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme @@ -230,27 +230,43 @@ locn_0f_ts: * Forcefully sets VERTEX_ID_BASE to the value of VB_ELEMENT_BASE. 
* * arg = mode - * parm[0] = count - * parm[1] = instance_count - * parm[2] = start - * parm[3] = index_bias - * parm[4] = start_instance + * parm[0] = start_drawid + * parm[1] = numparams + * parm[2 + 5n + 0] = count + * parm[2 + 5n + 1] = instance_count + * parm[2 + 5n + 2] = start + * parm[2 + 5n + 3] = index_bias + * parm[2 + 5n + 4] = start_instance + * + * SCRATCH[0] = saved VB_ELEMENT_BASE + * SCRATCH[1] = saved VB_INSTANCE_BASE */ .section #mme9097_draw_elts_indirect + read $r6 0x50d /* VB_ELEMENT_BASE */ + read $r7 0x50e /* VB_INSTANCE_BASE */ + maddr 0x1d00 + send $r6 /* SCRATCH[0] = VB_ELEMENT_BASE */ + send $r7 /* SCRATCH[1] = VB_INSTANCE_BASE */ + parm $r6 /* start_drawid */ + parm $r7 /* numparams */ +dei_draw_again: parm $r3 /* count */ parm $r2 /* instance_count */ parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */ parm $r4 send $r4 /* index_bias, send start */ + maddr 0x18e3 /* CB_POS */ + send 0x180 /* 256 + 128 */ braz $r2 #dei_end - parm $r5 /* start_instance */ - read $r6 0x50d /* VB_ELEMENT_BASE */ - read $r7 0x50e /* VB_INSTANCE_BASE */ + parm $r5 send $r4 /* start_instance, send index_bias */ + send $r5 /* send start_instance */ + send $r6 /* draw id */ maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */ send $r4 send $r5 maddr 0x446 send $r4 mov $r4 0x1 + mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */ dei_again: maddr 0x586 /* VERTEX_BEGIN_GL */ send $r1 /* mode */ @@ -260,46 +276,218 @@ dei_again: maddrsend 0x585 /* VERTEX_END_GL */ branz $r2 #dei_again mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */ +dei_end: + mov $r7 (add $r7 -1) + branz $r7 #dei_draw_again + mov $r6 (add $r6 1) + read $r6 0xd00 + read $r7 0xd01 maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */ send $r6 send $r7 exit maddr 0x446 send $r6 -dei_end: - exit - nop /* NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT: * * NOTE: Saves and restores VB_INSTANCE_BASE. 
* * arg = mode - * parm[0] = count - * parm[1] = instance_count - * parm[2] = start - * parm[3] = start_instance + * parm[0] = start_drawid + * parm[1] = numparams + * parm[2 + 4n + 0] = count + * parm[2 + 4n + 1] = instance_count + * parm[2 + 4n + 2] = start + * parm[2 + 4n + 3] = start_instance */ .section #mme9097_draw_arrays_indirect + read $r5 0x50e /* VB_INSTANCE_BASE */ + parm $r6 /* start_drawid */ + parm $r7 /* numparams */ +dai_draw_again: parm $r2 /* count */ parm $r3 /* instance_count */ parm $r4 maddr 0x35d /* VERTEX_BUFFER_FIRST, start */ - parm $r4 send $r4 /* start_instance */ braz $r3 #dai_end - read $r6 0x50e /* VB_INSTANCE_BASE */ + parm $r4 send $r4 /* start_instance */ + maddr 0x18e3 /* CB_POS */ + send 0x180 /* 256 + 128 */ + send 0x0 /* send 0 as base_vertex */ + send $r4 /* send start_instance */ + send $r6 /* draw id */ maddr 0x50e /* VB_INSTANCE_BASE */ - mov $r5 0x1 send $r4 + mov $r4 0x1 + mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */ dai_again: maddr 0x586 /* VERTEX_BEGIN_GL */ send $r1 /* mode */ maddr 0x35e /* VERTEX_BUFFER_COUNT */ send $r2 - mov $r3 (sub $r3 $r5) + mov $r3 (sub $r3 $r4) maddrsend 0x585 /* VERTEX_END_GL */ branz $r3 #dai_again - mov $r1 (extrinsrt $r1 $r5 0 1 26) /* set INSTANCE_NEXT */ + mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */ +dai_end: + mov $r7 (add $r7 -1) + branz $r7 #dai_draw_again + mov $r6 (add $r6 1) exit maddr 0x50e /* VB_INSTANCE_BASE to restore */ + send $r5 + +/* NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT + * + * NOTE: Saves and restores VB_ELEMENT,INSTANCE_BASE. + * Forcefully sets VERTEX_ID_BASE to the value of VB_ELEMENT_BASE. 
+ * + * arg = mode + * parm[0] = start_drawid + * parm[1] = numparams + * parm[2] = totaldraws + * parm[3 + 5n + 0] = count + * parm[3 + 5n + 1] = instance_count + * parm[3 + 5n + 2] = start + * parm[3 + 5n + 3] = index_bias + * parm[3 + 5n + 4] = start_instance + * + * SCRATCH[0] = saved VB_ELEMENT_BASE + * SCRATCH[1] = saved VB_INSTANCE_BASE + * SCRATCH[2] = draws left + */ +.section #mme9097_draw_elts_indirect_count + read $r6 0x50d /* VB_ELEMENT_BASE */ + read $r7 0x50e /* VB_INSTANCE_BASE */ + maddr 0x1d00 + send $r6 /* SCRATCH[0] = VB_ELEMENT_BASE */ + send $r7 /* SCRATCH[1] = VB_INSTANCE_BASE */ + parm $r6 /* start_drawid */ + parm $r7 /* numparams */ + parm $r5 /* totaldraws */ + mov $r5 (sub $r5 $r6) /* draws left */ + braz $r5 #deic_runout + mov $r3 (extrinsrt 0x0 $r5 31 1 0) /* extract high bit */ + branz $r3 #deic_runout + send $r5 +deic_draw_again: + parm $r3 /* count */ + parm $r2 /* instance_count */ + parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */ + parm $r4 send $r4 /* index_bias, send start */ + maddr 0x18e3 /* CB_POS */ + send 0x180 /* 256 + 128 */ + braz $r2 #deic_end + parm $r5 send $r4 /* start_instance, send index_bias */ + send $r5 /* send start_instance */ + send $r6 /* draw id */ + maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */ + send $r4 + send $r5 + maddr 0x446 + send $r4 + mov $r4 0x1 + mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */ +deic_again: + maddr 0x586 /* VERTEX_BEGIN_GL */ + send $r1 /* mode */ + maddr 0x5f8 /* INDEX_BATCH_COUNT */ + send $r3 /* count */ + mov $r2 (sub $r2 $r4) + maddrsend 0x585 /* VERTEX_END_GL */ + branz $r2 #deic_again + mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */ +deic_end: + read $r5 0xd02 + mov $r5 (add $r5 -1) + braz $r5 #deic_runout_check + mov $r7 (add $r7 -1) + maddr 0xd02 + send $r5 + branz $r7 #deic_draw_again + mov $r6 (add $r6 1) +deic_restore: + read $r6 0xd00 + read $r7 0xd01 + maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */ send $r6 -dai_end: - exit - nop + send $r7 + 
exit maddr 0x446 + send $r6 +deic_runout: + parm $r2 + parm $r2 + parm $r2 + parm $r2 + parm $r2 + mov $r7 (add $r7 -1) +deic_runout_check: + branz annul $r7 #deic_runout + bra annul #deic_restore + +/* NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT: + * + * NOTE: Saves and restores VB_INSTANCE_BASE. + * + * arg = mode + * parm[0] = start_drawid + * parm[1] = numparams + * parm[2] = totaldraws + * parm[3 + 4n + 0] = count + * parm[3 + 4n + 1] = instance_count + * parm[3 + 4n + 2] = start + * parm[3 + 4n + 3] = start_instance + * + * SCRATCH[0] = VB_INSTANCE_BASE + */ +.section #mme9097_draw_arrays_indirect_count + read $r5 0x50e /* VB_INSTANCE_BASE */ + maddr 0xd00 + parm $r6 send $r5 /* start_drawid, save VB_INSTANCE_BASE */ + parm $r7 /* numparams */ + parm $r5 /* totaldraws */ + mov $r5 (sub $r5 $r6) /* draws left */ + braz $r5 #daic_runout + mov $r3 (extrinsrt 0x0 $r5 31 1 0) /* extract high bit */ + branz annul $r3 #daic_runout +daic_draw_again: + parm $r2 /* count */ + parm $r3 /* instance_count */ + parm $r4 maddr 0x35d /* VERTEX_BUFFER_FIRST, start */ + braz $r3 #daic_end + parm $r4 send $r4 /* start_instance */ + maddr 0x18e3 /* CB_POS */ + send 0x180 /* 256 + 128 */ + send 0x0 /* send 0 as base_vertex */ + send $r4 /* send start_instance */ + send $r6 /* draw id */ + maddr 0x50e /* VB_INSTANCE_BASE */ + send $r4 + mov $r4 0x1 + mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */ +daic_again: + maddr 0x586 /* VERTEX_BEGIN_GL */ + send $r1 /* mode */ + maddr 0x35e /* VERTEX_BUFFER_COUNT */ + send $r2 + mov $r3 (sub $r3 $r4) + maddrsend 0x585 /* VERTEX_END_GL */ + branz $r3 #daic_again + mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */ +daic_end: + mov $r5 (add $r5 -1) + braz $r5 #daic_runout_check + mov $r7 (add $r7 -1) + branz $r7 #daic_draw_again + mov $r6 (add $r6 1) +daic_restore: + read $r5 0xd00 + exit maddr 0x50e /* VB_INSTANCE_BASE to restore */ + send $r5 +daic_runout: + parm $r2 + parm $r2 + parm $r2 + parm $r2 + mov $r7 (add $r7 -1) 
+daic_runout_check: + branz annul $r7 #daic_runout + bra annul #daic_restore diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h index bac9042c2df..bf8625e0584 100644 --- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h +++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h @@ -125,22 +125,33 @@ uint32_t mme9097_tep_select[] = { }; uint32_t mme9097_draw_elts_indirect[] = { + 0x01434615, +/* 0x0007: dei_draw_again */ + 0x01438715, + 0x07400021, + 0x00003041, + 0x00003841, + 0x00000601, +/* 0x0018: dei_again */ + 0x00000701, 0x00000301, +/* 0x0020: dei_end */ 0x00000201, 0x017dc451, -/* 0x000e: dei_again */ 0x00002431, - 0x0005d007, - 0x00000501, -/* 0x001b: dei_end */ - 0x01434615, - 0x01438715, + 0x0638c021, + 0x00600041, + 0x0004d007, + 0x00002531, + 0x00002841, + 0x00003041, 0x05434021, 0x00002041, 0x00002841, 0x01118021, 0x00002041, 0x00004411, + 0xd0400912, 0x01618021, 0x00000841, 0x017e0021, @@ -149,37 +160,175 @@ uint32_t mme9097_draw_elts_indirect[] = { 0x01614071, 0xfffe9017, 0xd0410912, + 0xffffff11, + 0xfff9b817, + 0x00007611, + 0x03400615, + 0x03404715, 0x05434021, 0x00003041, 0x00003841, 0x011180a1, 0x00003041, - 0x00000091, - 0x00000011, }; uint32_t mme9097_draw_arrays_indirect[] = { +/* 0x0003: dai_draw_again */ + 0x01438515, + 0x00000601, + 0x00000701, 0x00000201, +/* 0x0011: dai_again */ 0x00000301, -/* 0x0009: dai_again */ 0x00d74451, +/* 0x0019: dai_end */ + 0x0004d807, 0x00002431, -/* 0x0013: dai_end */ - 0x0003d807, - 0x01438615, + 0x0638c021, + 0x00600041, + 0x00000041, + 0x00002041, + 0x00003041, 0x01438021, - 0x00004511, 0x00002041, + 0x00004411, + 0xd0400912, 0x01618021, 0x00000841, 0x00d78021, 0x00001041, - 0x00055b10, + 0x00051b10, 0x01614071, 0xfffe9817, - 0xd0414912, + 0xd0410912, + 0xffffff11, + 0xfffa7817, + 0x00007611, 0x014380a1, + 0x00002841, +}; + +uint32_t mme9097_draw_elts_indirect_count[] = { + 0x01434615, + 0x01438715, + 0x07400021, +/* 0x000d: 
deic_draw_again */ 0x00003041, - 0x00000091, - 0x00000011, + 0x00003841, + 0x00000601, + 0x00000701, +/* 0x001e: deic_again */ + 0x00000501, + 0x0005ad10, +/* 0x0026: deic_end */ + 0x000b2807, + 0x007f4312, +/* 0x002e: deic_restore */ + 0x000a9817, + 0x00002841, +/* 0x0035: deic_runout */ + 0x00000301, +/* 0x003b: deic_runout_check */ + 0x00000201, + 0x017dc451, + 0x00002431, + 0x0638c021, + 0x00600041, + 0x0004d007, + 0x00002531, + 0x00002841, + 0x00003041, + 0x05434021, + 0x00002041, + 0x00002841, + 0x01118021, + 0x00002041, + 0x00004411, + 0xd0400912, + 0x01618021, + 0x00000841, + 0x017e0021, + 0x00001841, + 0x00051210, + 0x01614071, + 0xfffe9017, + 0xd0410912, + 0x03408515, + 0xffffed11, + 0x0004e807, + 0xffffff11, + 0x03408021, + 0x00002841, + 0xfff87817, + 0x00007611, + 0x03400615, + 0x03404715, + 0x05434021, + 0x00003041, + 0x00003841, + 0x011180a1, + 0x00003041, + 0x00000201, + 0x00000201, + 0x00000201, + 0x00000201, + 0x00000201, + 0xffffff11, + 0xfffeb837, + 0xfffc8027, +}; + +uint32_t mme9097_draw_arrays_indirect_count[] = { + 0x01438515, + 0x03400021, +/* 0x0009: daic_draw_again */ + 0x00002e31, + 0x00000701, + 0x00000501, +/* 0x0017: daic_again */ + 0x0005ad10, + 0x00086807, +/* 0x001f: daic_end */ + 0x007f4312, + 0x0007d837, +/* 0x0024: daic_restore */ +/* 0x0027: daic_runout */ + 0x00000201, + 0x00000301, +/* 0x002c: daic_runout_check */ + 0x00d74451, + 0x0004d807, + 0x00002431, + 0x0638c021, + 0x00600041, + 0x00000041, + 0x00002041, + 0x00003041, + 0x01438021, + 0x00002041, + 0x00004411, + 0xd0400912, + 0x01618021, + 0x00000841, + 0x00d78021, + 0x00001041, + 0x00051b10, + 0x01614071, + 0xfffe9817, + 0xd0410912, + 0xffffed11, + 0x00032807, + 0xffffff11, + 0xfff9f817, + 0x00007611, + 0x03400515, + 0x014380a1, + 0x00002841, + 0x00000201, + 0x00000201, + 0x00000201, + 0x00000201, + 0xffffff11, + 0xfffef837, + 0xfffdc027, }; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index 
39b73ecb0c2..12195489691 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -134,10 +134,12 @@ struct nvc0_context { struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS]; uint16_t constbuf_dirty[6]; uint16_t constbuf_valid[6]; + uint16_t constbuf_coherent[6]; bool cb_dirty; struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; unsigned num_vtxbufs; + uint32_t vtxbufs_coherent; struct pipe_index_buffer idxbuf; uint32_t constant_vbos; uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */ @@ -149,6 +151,7 @@ struct nvc0_context { struct pipe_sampler_view *textures[6][PIPE_MAX_SAMPLERS]; unsigned num_textures[6]; uint32_t textures_dirty[6]; + uint32_t textures_coherent[6]; struct nv50_tsc_entry *samplers[6][PIPE_MAX_SAMPLERS]; unsigned num_samplers[6]; uint16_t samplers_dirty[6]; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h index bf2798a44a0..27c026b8b30 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h @@ -29,4 +29,8 @@ #define NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT 0x00003840 +#define NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT 0x00003848 + +#define NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT 0x00003850 + #endif /* __NVC0_MACROS_H__ */ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c index 67a25acf778..c3b53621630 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c @@ -55,7 +55,6 @@ nvc0_shader_input_address(unsigned sn, unsigned si) case TGSI_SEMANTIC_INSTANCEID: return 0x2f8; case TGSI_SEMANTIC_VERTEXID: return 0x2fc; case TGSI_SEMANTIC_TEXCOORD: return 0x300 + si * 0x10; - case TGSI_SEMANTIC_FACE: return 0x3fc; default: assert(!"invalid TGSI input semantic"); return ~0; @@ -285,8 +284,6 @@ nvc0_tp_get_tess_mode(struct nvc0_program 
*tp, struct nv50_ir_prog_info *info) break; case PIPE_PRIM_TRIANGLES: tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_TRIANGLES; - if (info->prop.tp.winding > 0) - tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW; break; case PIPE_PRIM_QUADS: tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_QUADS; @@ -295,6 +292,10 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info) tp->tp.tess_mode = ~0; return; } + + if (info->prop.tp.winding > 0) + tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW; + if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS) tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED; @@ -533,8 +534,9 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset, info->bin.source = (void *)prog->pipe.tokens; info->io.genUserClip = prog->vp.num_ucps; + info->io.auxCBSlot = 15; info->io.ucpBase = 256; - info->io.ucpCBSlot = 15; + info->io.drawInfoBase = 256 + 128; if (prog->type == PIPE_SHADER_COMPUTE) { if (chipset >= NVISA_GK104_CHIPSET) { @@ -583,6 +585,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset, prog->num_barriers = info->numBarriers; prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS; + prog->vp.need_draw_parameters = info->prop.vp.usesDrawParameters; if (info->io.edgeFlagOut < PIPE_MAX_ATTRIBS) info->out[info->io.edgeFlagOut].mask = 0; /* for headergen */ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h index 9c45e7b3e31..8b8d221edfc 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h @@ -42,6 +42,7 @@ struct nvc0_program { uint8_t num_ucps; /* also set to max if ClipDistance is used */ uint8_t edgeflag; /* attribute index of edgeflag input */ bool need_vertex_id; + bool need_draw_parameters; } vp; struct { uint8_t early_z; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c index a70d524ea85..1bed0162baf 100644 --- 
a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c @@ -470,10 +470,7 @@ nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push, { struct nvc0_hw_query *hq = nvc0_hw_query(q); -#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8)) - PUSH_REFN(push, hq->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART); - nouveau_pushbuf_space(push, 0, 0, 1); nouveau_pushbuf_data(push, hq->bo, hq->offset + result_offset, 4 | NVC0_IB_ENTRY_1_NO_PREFETCH); } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 39954464b9c..33dd17ebeca 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -184,6 +184,11 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: return (class_3d >= NVE4_3D_CLASS) ? 
1 : 0; @@ -206,6 +211,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_VERTEXID_NOBASE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 0; case PIPE_CAP_VENDOR_ID: @@ -311,6 +318,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return 0; case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: return 16; /* would be 32 in linked (OpenGL-style) mode */ @@ -1025,6 +1033,8 @@ nvc0_screen_create(struct nouveau_device *dev) MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_BACK, mme9097_poly_mode_back); MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT, mme9097_draw_arrays_indirect); MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT, mme9097_draw_elts_indirect); + MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mme9097_draw_arrays_indirect_count); + MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count); BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1); PUSH_DATA (push, 1); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c index 5e84ca9e0ea..dc02b011bdf 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c @@ -317,6 +317,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0) if (!targ->clean) nvc0_hw_query_fifo_wait(push, nvc0_query(targ->pq)); + nouveau_pushbuf_space(push, 0, 0, 1); BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5); PUSH_DATA (push, 1); PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index 41a824a97a0..24a6c222dd5 100644 
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -554,6 +554,17 @@ nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s, continue; nvc0->textures_dirty[s] |= 1 << i; + if (views[i] && views[i]->texture) { + struct pipe_resource *res = views[i]->texture; + if (res->target == PIPE_BUFFER && + (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)) + nvc0->textures_coherent[s] |= 1 << i; + else + nvc0->textures_coherent[s] &= ~(1 << i); + } else { + nvc0->textures_coherent[s] &= ~(1 << i); + } + if (old) { nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(s, i)); nvc0_screen_tic_unlock(nvc0->screen, old); @@ -596,6 +607,17 @@ nvc0_stage_set_sampler_views_range(struct nvc0_context *nvc0, const unsigned s, continue; nvc0->textures_dirty[s] |= 1 << i; + if (views[p] && views[p]->texture) { + struct pipe_resource *res = views[p]->texture; + if (res->target == PIPE_BUFFER && + (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)) + nvc0->textures_coherent[s] |= 1 << i; + else + nvc0->textures_coherent[s] &= ~(1 << i); + } else { + nvc0->textures_coherent[s] &= ~(1 << i); + } + if (nvc0->textures[s][i]) { struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]); nouveau_bufctx_reset(bctx, bin + i); @@ -842,14 +864,20 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, nvc0->constbuf[s][i].u.data = cb->user_buffer; nvc0->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000); nvc0->constbuf_valid[s] |= 1 << i; + nvc0->constbuf_coherent[s] &= ~(1 << i); } else if (cb) { nvc0->constbuf[s][i].offset = cb->buffer_offset; nvc0->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000); nvc0->constbuf_valid[s] |= 1 << i; + if (res && res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) + nvc0->constbuf_coherent[s] |= 1 << i; + else + nvc0->constbuf_coherent[s] &= ~(1 << i); } else { nvc0->constbuf_valid[s] &= ~(1 << i); + nvc0->constbuf_coherent[s] &= ~(1 << i); } } @@ -1009,6 +1037,7 @@ 
nvc0_set_vertex_buffers(struct pipe_context *pipe, if (!vb) { nvc0->vbo_user &= ~(((1ull << count) - 1) << start_slot); nvc0->constant_vbos &= ~(((1ull << count) - 1) << start_slot); + nvc0->vtxbufs_coherent &= ~(((1ull << count) - 1) << start_slot); return; } @@ -1021,9 +1050,16 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe, nvc0->constant_vbos |= 1 << dst_index; else nvc0->constant_vbos &= ~(1 << dst_index); + nvc0->vtxbufs_coherent &= ~(1 << dst_index); } else { nvc0->vbo_user &= ~(1 << dst_index); nvc0->constant_vbos &= ~(1 << dst_index); + + if (vb[i].buffer && + vb[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) + nvc0->vtxbufs_coherent |= (1 << dst_index); + else + nvc0->vtxbufs_coherent &= ~(1 << dst_index); } } } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index 54443bdccc0..ad79d1cbb9c 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -787,7 +787,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0, } while (num_instances--) { - PUSH_SPACE(push, 8); + nouveau_pushbuf_space(push, 9, 0, 1); BEGIN_NVC0(push, NVC0_3D(VERTEX_BEGIN_GL), 1); PUSH_DATA (push, mode); BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_BASE), 1); @@ -807,19 +807,31 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nv04_resource *buf = nv04_resource(info->indirect); - unsigned size; - const uint32_t offset = buf->offset + info->indirect_offset; + struct nv04_resource *buf_count = nv04_resource(info->indirect_params); + unsigned size, macro, count = info->indirect_count, drawid = info->drawid; + uint32_t offset = buf->offset + info->indirect_offset; /* must make FIFO wait for engines idle before continuing to process */ - if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr)) + if ((buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr)) || + (buf_count && 
buf_count->fence_wr && + !nouveau_fence_signalled(buf_count->fence_wr))) { IMMED_NVC0(push, SUBC_3D(NV10_SUBCHAN_REF_CNT), 0); + } + + /* Queue things up to let the macros write params to the driver constbuf */ + BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); + PUSH_DATA (push, 512); + PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9)); + PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9)); - PUSH_SPACE(push, 8); if (info->indexed) { assert(nvc0->idxbuf.buffer); assert(nouveau_resource_mapped_by_gpu(nvc0->idxbuf.buffer)); - size = 5 * 4; - BEGIN_1IC0(push, NVC0_3D(MACRO_DRAW_ELEMENTS_INDIRECT), 1 + size / 4); + size = 5; + if (buf_count) + macro = NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT; + else + macro = NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT; } else { if (nvc0->state.index_bias) { /* index_bias is implied 0 if !info->indexed (really ?) */ @@ -827,15 +839,59 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info) IMMED_NVC0(push, NVC0_3D(VERTEX_ID_BASE), 0); nvc0->state.index_bias = 0; } - size = 4 * 4; - BEGIN_1IC0(push, NVC0_3D(MACRO_DRAW_ARRAYS_INDIRECT), 1 + size / 4); - } - PUSH_DATA(push, nvc0_prim_gl(info->mode)); -#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8)) - PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain); - nouveau_pushbuf_space(push, 0, 0, 1); - nouveau_pushbuf_data(push, - buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size); + size = 4; + if (buf_count) + macro = NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT; + else + macro = NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT; + } + + /* If the stride is not the natural stride, we have to stick a separate + * push data reference for each draw. Otherwise it can all go in as one. + * Of course there is a maximum packet size, so we have to break things up + * along those borders as well. 
+ */ + while (count) { + unsigned draws = count, pushes, i; + if (info->indirect_stride == size * 4) { + draws = MIN2(draws, (NV04_PFIFO_MAX_PACKET_LEN - 4) / size); + pushes = 1; + } else { + draws = MIN2(draws, 32); + pushes = draws; + } + + nouveau_pushbuf_space(push, 16, 0, pushes + !!buf_count); + PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain); + if (buf_count) + PUSH_REFN(push, buf_count->bo, NOUVEAU_BO_RD | buf_count->domain); + PUSH_DATA(push, + NVC0_FIFO_PKHDR_1I(0, macro, 3 + !!buf_count + draws * size)); + PUSH_DATA(push, nvc0_prim_gl(info->mode)); + PUSH_DATA(push, drawid); + PUSH_DATA(push, draws); + if (buf_count) { + nouveau_pushbuf_data(push, + buf_count->bo, + buf_count->offset + info->indirect_params_offset, + NVC0_IB_ENTRY_1_NO_PREFETCH | 4); + } + if (pushes == 1) { + nouveau_pushbuf_data(push, + buf->bo, offset, + NVC0_IB_ENTRY_1_NO_PREFETCH | (size * 4 * draws)); + offset += draws * info->indirect_stride; + } else { + for (i = 0; i < pushes; i++) { + nouveau_pushbuf_data(push, + buf->bo, offset, + NVC0_IB_ENTRY_1_NO_PREFETCH | (size * 4)); + offset += info->indirect_stride; + } + } + count -= draws; + drawid += draws; + } } static inline void @@ -864,7 +920,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) { struct nvc0_context *nvc0 = nvc0_context(pipe); struct nouveau_pushbuf *push = nvc0->base.pushbuf; - int i, s; + int s; /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */ nvc0->vb_elt_first = info->min_index + info->index_bias; @@ -901,29 +957,25 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) /* 8 as minimum to avoid immediate double validation of new buffers */ nvc0_state_validate(nvc0, ~0, 8); + if (nvc0->vertprog->vp.need_draw_parameters) { + BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); + PUSH_DATA (push, 512); + PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9)); + PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 
<< 9)); + if (!info->indirect) { + BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3); + PUSH_DATA (push, 256 + 128); + PUSH_DATA (push, info->index_bias); + PUSH_DATA (push, info->start_instance); + PUSH_DATA (push, info->drawid); + } + } + push->kick_notify = nvc0_draw_vbo_kick_notify; - /* TODO: Instead of iterating over all the buffer resources looking for - * coherent buffers, keep track of a context-wide count. - */ for (s = 0; s < 5 && !nvc0->cb_dirty; ++s) { - uint32_t valid = nvc0->constbuf_valid[s]; - - while (valid && !nvc0->cb_dirty) { - const unsigned i = ffs(valid) - 1; - struct pipe_resource *res; - - valid &= ~(1 << i); - if (nvc0->constbuf[s][i].user) - continue; - - res = nvc0->constbuf[s][i].u.buf; - if (!res) - continue; - - if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nvc0->cb_dirty = true; - } + if (nvc0->constbuf_coherent[s]) + nvc0->cb_dirty = true; } if (nvc0->cb_dirty) { @@ -932,14 +984,12 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) } for (s = 0; s < 5; ++s) { + if (!nvc0->textures_coherent[s]) + continue; + for (int i = 0; i < nvc0->num_textures[s]; ++i) { struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]); - struct pipe_resource *res; - if (!tic) - continue; - res = nvc0->textures[s][i]->texture; - if (res->target != PIPE_BUFFER || - !(res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)) + if (!(nvc0->textures_coherent[s] & (1 << i))) continue; BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1); @@ -965,12 +1015,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) PUSH_DATA (push, info->start_instance); } - for (i = 0; i < nvc0->num_vtxbufs && !nvc0->base.vbo_dirty; ++i) { - if (!nvc0->vtxbuf[i].buffer) - continue; - if (nvc0->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nvc0->base.vbo_dirty = true; - } + nvc0->base.vbo_dirty |= !!nvc0->vtxbufs_coherent; if (!nvc0->base.vbo_dirty && nvc0->idxbuf.buffer && nvc0->idxbuf.buffer->flags & 
PIPE_RESOURCE_FLAG_MAP_COHERENT) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c index 48ffac1b715..a9fd1d20942 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_video.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c @@ -26,6 +26,24 @@ #include "util/u_format.h" static void +nvc0_decoder_begin_frame(struct pipe_video_codec *decoder, + struct pipe_video_buffer *target, + struct pipe_picture_desc *picture) +{ + struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder; + uint32_t comm_seq = ++dec->fence_seq; + unsigned ret = 0; + + assert(dec); + assert(target); + assert(target->buffer_format == PIPE_FORMAT_NV12); + + ret = nvc0_decoder_bsp_begin(dec, comm_seq); + + assert(ret == 2); +} + +static void nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder, struct pipe_video_buffer *video_target, struct pipe_picture_desc *picture, @@ -34,8 +52,24 @@ nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder, const unsigned *num_bytes) { struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder; + uint32_t comm_seq = dec->fence_seq; + unsigned ret = 0; + + assert(decoder); + + ret = nvc0_decoder_bsp_next(dec, comm_seq, num_buffers, data, num_bytes); + + assert(ret == 2); +} + +static void +nvc0_decoder_end_frame(struct pipe_video_codec *decoder, + struct pipe_video_buffer *video_target, + struct pipe_picture_desc *picture) +{ + struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder; struct nouveau_vp3_video_buffer *target = (struct nouveau_vp3_video_buffer *)video_target; - uint32_t comm_seq = ++dec->fence_seq; + uint32_t comm_seq = dec->fence_seq; union pipe_desc desc; unsigned vp_caps, is_ref, ret; @@ -43,11 +77,7 @@ nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder, desc.base = picture; - assert(target->base.buffer_format == PIPE_FORMAT_NV12); - - ret = nvc0_decoder_bsp(dec, desc, target, comm_seq, - num_buffers, data, num_bytes, - 
&vp_caps, &is_ref, refs); + ret = nvc0_decoder_bsp_end(dec, desc, target, comm_seq, &vp_caps, &is_ref, refs); /* did we decode bitstream correctly? */ assert(ret == 2); @@ -164,14 +194,19 @@ nvc0_create_decoder(struct pipe_context *context, PUSH_DATA (push[2], dec->ppp->handle); dec->base.context = context; + dec->base.begin_frame = nvc0_decoder_begin_frame; dec->base.decode_bitstream = nvc0_decoder_decode_bitstream; + dec->base.end_frame = nvc0_decoder_end_frame; for (i = 0; i < NOUVEAU_VP3_VIDEO_QDEPTH && !ret; ++i) ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM, 0, 1 << 20, &cfg, &dec->bsp_bo[i]); - if (!ret) + if (!ret) { + /* total fudge factor... just has to be bigger for higher bitrates? */ + unsigned inter_size = align(templ->width * templ->height * 2, 4 << 20); ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM, - 0x100, 4 << 20, &cfg, &dec->inter_bo[0]); + 0x100, inter_size, &cfg, &dec->inter_bo[0]); + } if (!ret) { ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM, 0x100, dec->inter_bo[0]->size, &cfg, diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video.h b/src/gallium/drivers/nouveau/nvc0/nvc0_video.h index 9ee0280f8ea..cf3c942355b 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_video.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video.h @@ -30,12 +30,18 @@ #include "util/u_video.h" extern unsigned -nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, - struct nouveau_vp3_video_buffer *target, - unsigned comm_seq, unsigned num_buffers, - const void *const *data, const unsigned *num_bytes, - unsigned *vp_caps, unsigned *is_ref, - struct nouveau_vp3_video_buffer *refs[16]); +nvc0_decoder_bsp_begin(struct nouveau_vp3_decoder *dec, unsigned comm_seq); + +extern unsigned +nvc0_decoder_bsp_next(struct nouveau_vp3_decoder *dec, + unsigned comm_seq, unsigned num_buffers, + const void *const *data, const unsigned *num_bytes); + +extern unsigned +nvc0_decoder_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc, + 
struct nouveau_vp3_video_buffer *target, + unsigned comm_seq, unsigned *vp_caps, unsigned *is_ref, + struct nouveau_vp3_video_buffer *refs[16]); extern void nvc0_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c index 4392f62c530..c53f946a762 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c @@ -32,40 +32,34 @@ static void dump_comm_bsp(struct comm *comm) #endif unsigned -nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, - struct nouveau_vp3_video_buffer *target, - unsigned comm_seq, unsigned num_buffers, - const void *const *data, const unsigned *num_bytes, - unsigned *vp_caps, unsigned *is_ref, - struct nouveau_vp3_video_buffer *refs[16]) +nvc0_decoder_bsp_begin(struct nouveau_vp3_decoder *dec, unsigned comm_seq) { - struct nouveau_pushbuf *push = dec->pushbuf[0]; - enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile); - uint32_t bsp_addr, comm_addr, inter_addr; - uint32_t slice_size, bucket_size, ring_size, bsp_size; - uint32_t caps, i; - int ret; struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH]; - struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1]; - unsigned fence_extra = 0; - struct nouveau_pushbuf_refn bo_refs[] = { - { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM }, - { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM }, -#if NOUVEAU_VP3_DEBUG_FENCE - { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART }, -#endif - { dec->bitplane_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM }, - }; - int num_refs = ARRAY_SIZE(bo_refs); + unsigned ret = 0; - if (!dec->bitplane_bo) - num_refs--; + ret = nouveau_bo_map(bsp_bo, NOUVEAU_BO_WR, dec->client); + if (ret) { + debug_printf("map failed: %i %s\n", ret, strerror(-ret)); + return -1; + } -#if NOUVEAU_VP3_DEBUG_FENCE - fence_extra = 4; -#endif + 
nouveau_vp3_bsp_begin(dec); + + return 2; +} - bsp_size = NOUVEAU_VP3_BSP_RESERVED_SIZE; +unsigned +nvc0_decoder_bsp_next(struct nouveau_vp3_decoder *dec, + unsigned comm_seq, unsigned num_buffers, + const void *const *data, const unsigned *num_bytes) +{ + struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH]; + struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1]; + uint32_t bsp_size = 0; + uint32_t i = 0; + unsigned ret = 0; + + bsp_size = dec->bsp_ptr - (char *)bsp_bo->map; for (i = 0; i < num_buffers; i++) bsp_size += num_bytes[i]; bsp_size += 256; /* the 4 end markers */ @@ -81,14 +75,29 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, bsp_size += (1 << 20) - 1; bsp_size &= ~((1 << 20) - 1); - ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_size, &cfg, &tmp_bo); + ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_size, &cfg, &tmp_bo); if (ret) { debug_printf("reallocating bsp %u -> %u failed with %i\n", bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret); return -1; } + + ret = nouveau_bo_map(tmp_bo, NOUVEAU_BO_WR, dec->client); + if (ret) { + debug_printf("map failed: %i %s\n", ret, strerror(-ret)); + return -1; + } + + /* Preserve previous buffer. */ + /* TODO: offload this copy to the GPU, as otherwise we're reading and + * writing to VRAM. 
*/ + memcpy(tmp_bo->map, bsp_bo->map, bsp_bo->size); + + /* update position to current chunk */ + dec->bsp_ptr = tmp_bo->map + (dec->bsp_ptr - (char *)bsp_bo->map); + nouveau_bo_ref(NULL, &bsp_bo); - bo_refs[0].bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH] = bsp_bo = tmp_bo; + dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH] = bsp_bo = tmp_bo; } if (!inter_bo || bsp_bo->size * 4 > inter_bo->size) { @@ -98,24 +107,61 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, cfg.nvc0.tile_mode = 0x10; cfg.nvc0.memtype = 0xfe; - ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, &cfg, &tmp_bo); + ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, &cfg, &tmp_bo); if (ret) { debug_printf("reallocating inter %u -> %u failed with %i\n", inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret); return -1; } + + ret = nouveau_bo_map(tmp_bo, NOUVEAU_BO_WR, dec->client); + if (ret) { + debug_printf("map failed: %i %s\n", ret, strerror(-ret)); + return -1; + } + nouveau_bo_ref(NULL, &inter_bo); - bo_refs[1].bo = dec->inter_bo[comm_seq & 1] = inter_bo = tmp_bo; + dec->inter_bo[comm_seq & 1] = inter_bo = tmp_bo; } - ret = nouveau_bo_map(bsp_bo, NOUVEAU_BO_WR, dec->client); - if (ret) { - debug_printf("map failed: %i %s\n", ret, strerror(-ret)); - return -1; - } + nouveau_vp3_bsp_next(dec, num_buffers, data, num_bytes); + + return 2; +} + + +unsigned +nvc0_decoder_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc, + struct nouveau_vp3_video_buffer *target, unsigned comm_seq, + unsigned *vp_caps, unsigned *is_ref, + struct nouveau_vp3_video_buffer *refs[16]) +{ + struct nouveau_pushbuf *push = dec->pushbuf[0]; + enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile); + uint32_t bsp_addr, comm_addr, inter_addr; + uint32_t slice_size, bucket_size, ring_size; + uint32_t caps; + struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % 
NOUVEAU_VP3_VIDEO_QDEPTH]; + struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1]; + unsigned fence_extra = 0; + struct nouveau_pushbuf_refn bo_refs[] = { + { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM }, + { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM }, +#if NOUVEAU_VP3_DEBUG_FENCE + { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART }, +#endif + { dec->bitplane_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM }, + }; + int num_refs = ARRAY_SIZE(bo_refs); + + if (!dec->bitplane_bo) + num_refs--; + +#if NOUVEAU_VP3_DEBUG_FENCE + fence_extra = 4; +#endif - caps = nouveau_vp3_bsp(dec, desc, target, comm_seq, - num_buffers, data, num_bytes); + caps = nouveau_vp3_bsp_end(dec, desc); nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h index 4ea8ca3cfa2..79abe78b77a 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h @@ -68,6 +68,7 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags) #define SUBC_SW(m) 7, (m) #define NVC0_3D_SERIALIZE NV50_GRAPH_SERIALIZE +#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8)) static inline uint32_t NVC0_FIFO_PKHDR_SQ(int subc, int mthd, unsigned size) diff --git a/src/gallium/drivers/r300/compiler/r500_fragprog.c b/src/gallium/drivers/r300/compiler/r500_fragprog.c index 88aad8a054f..4c415afcb05 100644 --- a/src/gallium/drivers/r300/compiler/r500_fragprog.c +++ b/src/gallium/drivers/r300/compiler/r500_fragprog.c @@ -384,7 +384,7 @@ void r500FragmentProgramDump(struct radeon_compiler *c, void *user) case R500_INST_TYPE_OUT: str = "OUT"; break; case R500_INST_TYPE_FC: str = "FC"; break; case R500_INST_TYPE_TEX: str = "TEX"; break; - }; + } fprintf(stderr,"%s %s %s %s %s ", str, inst & R500_INST_TEX_SEM_WAIT ? "TEX_WAIT" : "", inst & R500_INST_LAST ? 
"LAST" : "", diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c index b393769c861..82ba0435118 100644 --- a/src/gallium/drivers/r300/r300_context.c +++ b/src/gallium/drivers/r300/r300_context.c @@ -421,8 +421,8 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen, r300->context.create_video_codec = vl_create_decoder; r300->context.create_video_buffer = vl_video_buffer_create; - r300->uploader = u_upload_create(&r300->context, 256 * 1024, 4, - PIPE_BIND_CUSTOM); + r300->uploader = u_upload_create(&r300->context, 256 * 1024, + PIPE_BIND_CUSTOM, PIPE_USAGE_STREAM); r300->blitter = util_blitter_create(&r300->context); if (r300->blitter == NULL) diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c index b482fa140ed..7eda6753d0d 100644 --- a/src/gallium/drivers/r300/r300_render.c +++ b/src/gallium/drivers/r300/r300_render.c @@ -1010,7 +1010,7 @@ static void r300_render_draw_elements(struct vbuf_render* render, CS_LOCALS(r300); DBG(r300, DBG_DRAW, "r300: render_draw_elements (count: %d)\n", count); - u_upload_data(r300->uploader, 0, count * 2, indices, + u_upload_data(r300->uploader, 0, count * 2, 4, indices, &index_buffer_offset, &index_buffer); if (!index_buffer) { return; diff --git a/src/gallium/drivers/r300/r300_render_translate.c b/src/gallium/drivers/r300/r300_render_translate.c index caeeec05909..7221211deea 100644 --- a/src/gallium/drivers/r300/r300_render_translate.c +++ b/src/gallium/drivers/r300/r300_render_translate.c @@ -37,7 +37,7 @@ void r300_translate_index_buffer(struct r300_context *r300, switch (*index_size) { case 1: *out_buffer = NULL; - u_upload_alloc(r300->uploader, 0, count * 2, + u_upload_alloc(r300->uploader, 0, count * 2, 4, &out_offset, out_buffer, &ptr); util_shorten_ubyte_elts_to_userptr( @@ -51,7 +51,7 @@ void r300_translate_index_buffer(struct r300_context *r300, case 2: if (index_offset) { *out_buffer = NULL; - 
u_upload_alloc(r300->uploader, 0, count * 2, + u_upload_alloc(r300->uploader, 0, count * 2, 4, &out_offset, out_buffer, &ptr); util_rebuild_ushort_elts_to_userptr(&r300->context, ib, @@ -65,7 +65,7 @@ void r300_translate_index_buffer(struct r300_context *r300, case 4: if (index_offset) { *out_buffer = NULL; - u_upload_alloc(r300->uploader, 0, count * 4, + u_upload_alloc(r300->uploader, 0, count * 4, 4, &out_offset, out_buffer, &ptr); util_rebuild_uint_elts_to_userptr(&r300->context, ib, diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index 606e25f915b..d1b59ab4345 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -183,6 +183,8 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_SAMPLE_SHADING: case PIPE_CAP_TEXTURE_GATHER_OFFSETS: case PIPE_CAP_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: case PIPE_CAP_SAMPLER_VIEW_TARGET: @@ -200,6 +202,11 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 0; /* SWTCL-only features. 
*/ @@ -304,6 +311,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return 0; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; @@ -362,6 +370,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return 0; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; diff --git a/src/gallium/drivers/r300/r300_screen_buffer.c b/src/gallium/drivers/r300/r300_screen_buffer.c index 737a6f5e4f8..42c8e3a0fc5 100644 --- a/src/gallium/drivers/r300/r300_screen_buffer.c +++ b/src/gallium/drivers/r300/r300_screen_buffer.c @@ -42,7 +42,7 @@ void r300_upload_index_buffer(struct r300_context *r300, *index_buffer = NULL; u_upload_data(r300->uploader, - 0, count * index_size, + 0, count * index_size, 4, ptr + (*start * index_size), &index_offset, index_buffer); diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c index da472f4d7f4..741e263e7ed 100644 --- a/src/gallium/drivers/r300/r300_state_derived.c +++ b/src/gallium/drivers/r300/r300_state_derived.c @@ -52,7 +52,6 @@ enum r300_rs_col_write_type { static void r300_draw_emit_attrib(struct r300_context* r300, enum attrib_emit emit, - enum interp_mode interp, int index) { struct r300_vertex_shader* vs = r300->vs_state.state; @@ -62,7 +61,7 @@ static void r300_draw_emit_attrib(struct r300_context* r300, output = draw_find_shader_output(r300->draw, info->output_semantic_name[index], info->output_semantic_index[index]); - draw_emit_vertex_attr(&r300->vertex_info, emit, interp, output); + draw_emit_vertex_attr(&r300->vertex_info, emit, output); } 
static void r300_draw_emit_all_attribs(struct r300_context* r300) @@ -73,31 +72,27 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300) /* Position. */ if (vs_outputs->pos != ATTR_UNUSED) { - r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE, - vs_outputs->pos); + r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->pos); } else { assert(0); } /* Point size. */ if (vs_outputs->psize != ATTR_UNUSED) { - r300_draw_emit_attrib(r300, EMIT_1F_PSIZE, INTERP_POS, - vs_outputs->psize); + r300_draw_emit_attrib(r300, EMIT_1F_PSIZE, vs_outputs->psize); } /* Colors. */ for (i = 0; i < ATTR_COLOR_COUNT; i++) { if (vs_outputs->color[i] != ATTR_UNUSED) { - r300_draw_emit_attrib(r300, EMIT_4F, INTERP_LINEAR, - vs_outputs->color[i]); + r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->color[i]); } } /* Back-face colors. */ for (i = 0; i < ATTR_COLOR_COUNT; i++) { if (vs_outputs->bcolor[i] != ATTR_UNUSED) { - r300_draw_emit_attrib(r300, EMIT_4F, INTERP_LINEAR, - vs_outputs->bcolor[i]); + r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->bcolor[i]); } } @@ -108,16 +103,14 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300) for (i = 0; i < ATTR_GENERIC_COUNT && gen_count < 8; i++) { if (vs_outputs->generic[i] != ATTR_UNUSED && !(r300->sprite_coord_enable & (1 << i))) { - r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE, - vs_outputs->generic[i]); + r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->generic[i]); gen_count++; } } /* Fog coordinates. 
*/ if (gen_count < 8 && vs_outputs->fog != ATTR_UNUSED) { - r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE, - vs_outputs->fog); + r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->fog); gen_count++; } @@ -125,8 +118,7 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300) if (r300_fs(r300)->shader->inputs.wpos != ATTR_UNUSED && gen_count < 8) { DBG(r300, DBG_SWTCL, "draw_emit_attrib: WPOS, index: %i\n", vs_outputs->wpos); - r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE, - vs_outputs->wpos); + r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->wpos); } } diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c index d83eb17c280..20945ece155 100644 --- a/src/gallium/drivers/r600/evergreen_compute.c +++ b/src/gallium/drivers/r600/evergreen_compute.c @@ -600,7 +600,7 @@ static void evergreen_launch_grid( ctx->screen->has_compressed_msaa_texturing); bc->type = TGSI_PROCESSOR_COMPUTE; bc->isa = ctx->isa; - r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump); + r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump, &ctx->b.debug); if (dump && !sb_disasm) { r600_bytecode_disasm(bc); diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 1aee7dd2da8..9dfb84965cf 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -1956,7 +1956,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx, if (!gs_ring_buffer) { radeon_set_context_reg_flag(cs, reg_alu_constbuf_size + buffer_index * 4, - ALIGN_DIVUP(cb->buffer_size, 256), pkt_flags); + DIV_ROUND_UP(cb->buffer_size, 256), pkt_flags); radeon_set_context_reg_flag(cs, reg_alu_const_cache + buffer_index * 4, va >> 8, pkt_flags); } diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 1cc30317ba5..8b91372f3ae 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ 
b/src/gallium/drivers/r600/r600_llvm.c @@ -726,7 +726,7 @@ static void tex_fetch_args( * That operand should be passed as a float value in the args array * right after the coord vector. After packing it's not used anymore, * that's why arg_count is not increased */ - coords[4] = lp_build_emit_fetch(bld_base, inst, 1, 0); + coords[4] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X); } if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || @@ -915,14 +915,17 @@ unsigned r600_llvm_compile( enum radeon_family family, struct r600_bytecode *bc, boolean *use_kill, - unsigned dump) + unsigned dump, + struct pipe_debug_callback *debug) { unsigned r; struct radeon_shader_binary binary; const char * gpu_family = r600_get_llvm_processor_name(family); memset(&binary, 0, sizeof(struct radeon_shader_binary)); - r = radeon_llvm_compile(mod, &binary, gpu_family, dump, dump, NULL); + if (dump) + LLVMDumpModule(mod); + r = radeon_llvm_compile(mod, &binary, gpu_family, NULL, debug); r = r600_create_shader(bc, &binary, use_kill); diff --git a/src/gallium/drivers/r600/r600_llvm.h b/src/gallium/drivers/r600/r600_llvm.h index 9b5304d9fcb..f570b739fbe 100644 --- a/src/gallium/drivers/r600/r600_llvm.h +++ b/src/gallium/drivers/r600/r600_llvm.h @@ -7,6 +7,7 @@ #include "radeon/radeon_llvm.h" #include <llvm-c/Core.h> +struct pipe_debug_callback; struct r600_bytecode; struct r600_shader_ctx; struct radeon_llvm_context; @@ -22,7 +23,8 @@ unsigned r600_llvm_compile( enum radeon_family family, struct r600_bytecode *bc, boolean *use_kill, - unsigned dump); + unsigned dump, + struct pipe_debug_callback *debug); unsigned r600_create_shader(struct r600_bytecode *bc, const struct radeon_shader_binary *binary, diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 17006f70601..e61d9286542 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -348,6 +348,13 @@ static int r600_get_param(struct pipe_screen* pscreen, enum 
pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 0; case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: @@ -522,6 +529,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return 0; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: /* due to a bug in the shader compiler, some loops hang diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 31f2a729494..0e4dd16525b 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -946,7 +946,6 @@ static inline uint32_t S_FIXED(float value, uint32_t frac_bits) { return value * (1 << frac_bits); } -#define ALIGN_DIVUP(x, y) (((x) + (y) - 1) / (y)) /* 12.4 fixed-point */ static inline unsigned r600_pack_float_12p4(float x) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index d411b0be50e..df40f94bdcf 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -162,7 +162,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_context *rctx = (struct r600_context *)ctx; struct r600_pipe_shader_selector *sel = shader->selector; int r; - bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens); + bool dump = r600_can_dump_shader(&rctx->screen->b, + tgsi_get_processor_type(sel->tokens)); unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB); unsigned sb_disasm = use_sb || 
(rctx->screen->b.debug_flags & DBG_SB_DISASM); unsigned export_shader; @@ -394,7 +395,7 @@ static int tgsi_last_instruction(unsigned writemask) static int tgsi_is_supported(struct r600_shader_ctx *ctx) { struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction; - int j; + unsigned j; if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) { R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs); @@ -1166,7 +1167,7 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off */ static int evergreen_gpr_count(struct r600_shader_ctx *ctx) { - int i; + unsigned i; int num_baryc; struct tgsi_parse_context parse; @@ -1585,7 +1586,7 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; - int i; + unsigned i; for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { struct tgsi_full_src_register *src = &inst->Src[i]; @@ -1854,7 +1855,7 @@ static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_re static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; - int i; + unsigned i; for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { struct tgsi_full_src_register *src = &inst->Src[i]; @@ -2784,7 +2785,7 @@ static int r600_tess_factor_read(struct r600_shader_ctx *ctx, static int r600_emit_tess_factor(struct r600_shader_ctx *ctx) { - int i; + unsigned i; int stride, outer_comps, inner_comps; int tessinner_idx = -1, tessouter_idx = -1; int r; @@ -3238,7 +3239,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, if (use_llvm) { struct radeon_llvm_context radeon_llvm_ctx; LLVMModuleRef mod; - bool dump = r600_can_dump_shader(&rscreen->b, tokens); + bool dump = r600_can_dump_shader(&rscreen->b, + tgsi_get_processor_type(tokens)); boolean 
use_kill = false; memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx)); @@ -3259,7 +3261,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp; ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers; - if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) { + if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, + dump, &rctx->b.debug)) { radeon_llvm_dispose(&radeon_llvm_ctx); use_llvm = 0; fprintf(stderr, "R600 LLVM backend failed to compile " @@ -4424,7 +4427,7 @@ static int cayman_mul_double_instr(struct r600_shader_ctx *ctx) memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = ctx->inst_info->op; for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { - r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));; + r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1)); } alu.dst.sel = t1; alu.dst.chan = i; @@ -4791,7 +4794,7 @@ static int tgsi_lit(struct r600_shader_ctx *ctx) { int chan; int sel; - int i; + unsigned i; if (ctx->bc->chip_class == CAYMAN) { for (i = 0; i < 3; i++) { @@ -7925,7 +7928,7 @@ static int tgsi_exp(struct r600_shader_ctx *ctx) struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; struct r600_bytecode_alu alu; int r; - int i; + unsigned i; /* result.x = 2^floor(src); */ if (inst->Dst[0].Register.WriteMask & 1) { @@ -8054,7 +8057,7 @@ static int tgsi_log(struct r600_shader_ctx *ctx) struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; struct r600_bytecode_alu alu; int r; - int i; + unsigned i; /* result.x = floor(log2(|src|)); */ if (inst->Dst[0].Register.WriteMask & 1) { @@ -8781,7 +8784,7 @@ static int tgsi_bgnloop(struct r600_shader_ctx *ctx) static int tgsi_endloop(struct r600_shader_ctx *ctx) { - int i; + unsigned i; r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END); diff --git a/src/gallium/drivers/r600/r600_state.c 
b/src/gallium/drivers/r600/r600_state.c index 43b80742cb5..f60e30486a2 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -1768,7 +1768,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx, if (!gs_ring_buffer) { radeon_set_context_reg(cs, reg_alu_constbuf_size + buffer_index * 4, - ALIGN_DIVUP(cb->buffer_size, 256)); + DIV_ROUND_UP(cb->buffer_size, 256)); radeon_set_context_reg(cs, reg_alu_const_cache + buffer_index * 4, offset >> 8); } diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index ca589fa7759..c3346f29811 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -1106,10 +1106,10 @@ static void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint tmpPtr[i] = util_cpu_to_le32(((uint32_t *)ptr)[i]); } - u_upload_data(rctx->b.uploader, 0, size, tmpPtr, &cb->buffer_offset, &cb->buffer); + u_upload_data(rctx->b.uploader, 0, size, 256, tmpPtr, &cb->buffer_offset, &cb->buffer); free(tmpPtr); } else { - u_upload_data(rctx->b.uploader, 0, input->buffer_size, ptr, &cb->buffer_offset, &cb->buffer); + u_upload_data(rctx->b.uploader, 0, input->buffer_size, 256, ptr, &cb->buffer_offset, &cb->buffer); } /* account it in gtt */ rctx->b.gtt += input->buffer_size; @@ -1732,7 +1732,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info } } - u_upload_alloc(rctx->b.uploader, start, count * 2, + u_upload_alloc(rctx->b.uploader, start, count * 2, 256, &out_offset, &out_buffer, &ptr); util_shorten_ubyte_elts_to_userptr( @@ -1753,7 +1753,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info if (ib.user_buffer && (R600_BIG_ENDIAN || info.indirect || info.instance_count > 1 || info.count*ib.index_size > 20)) { - u_upload_data(rctx->b.uploader, 0, info.count * ib.index_size, + u_upload_data(rctx->b.uploader, 0, info.count * 
ib.index_size, 256, ib.user_buffer, &ib.offset, &ib.buffer); ib.user_buffer = NULL; } diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c index 18925277d2d..484f5c8d5b7 100644 --- a/src/gallium/drivers/radeon/r600_buffer_common.c +++ b/src/gallium/drivers/radeon/r600_buffer_common.c @@ -298,7 +298,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, struct r600_resource *staging = NULL; u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT), - &offset, (struct pipe_resource**)&staging, (void**)&data); + 256, &offset, (struct pipe_resource**)&staging, (void**)&data); if (staging) { data += box->x % R600_MAP_BUFFER_ALIGNMENT; diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index 9a5e9878176..52c365e81d0 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -85,7 +85,7 @@ void r600_draw_rectangle(struct blitter_context *blitter, /* Upload vertices. The hw rectangle has only 3 vertices, * I guess the 4th one is derived from the first 3. * The vertex specification should match u_blitter's vertex element state. 
*/ - u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, &offset, &buf, (void**)&vb); + u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, 256, &offset, &buf, (void**)&vb); if (!buf) return; @@ -227,6 +227,17 @@ static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx) return PIPE_UNKNOWN_CONTEXT_RESET; } +static void r600_set_debug_callback(struct pipe_context *ctx, + const struct pipe_debug_callback *cb) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + + if (cb) + rctx->debug = *cb; + else + memset(&rctx->debug, 0, sizeof(rctx->debug)); +} + bool r600_common_context_init(struct r600_common_context *rctx, struct r600_common_screen *rscreen) { @@ -252,6 +263,7 @@ bool r600_common_context_init(struct r600_common_context *rctx, rctx->b.transfer_inline_write = u_default_transfer_inline_write; rctx->b.memory_barrier = r600_memory_barrier; rctx->b.flush = r600_flush_from_st; + rctx->b.set_debug_callback = r600_set_debug_callback; if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) { rctx->b.get_device_reset_status = r600_get_reset_status; @@ -272,9 +284,9 @@ bool r600_common_context_init(struct r600_common_context *rctx, if (!rctx->allocator_so_filled_size) return false; - rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024, 256, + rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024, PIPE_BIND_INDEX_BUFFER | - PIPE_BIND_CONSTANT_BUFFER); + PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM); if (!rctx->uploader) return false; @@ -999,13 +1011,9 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen) } bool r600_can_dump_shader(struct r600_common_screen *rscreen, - const struct tgsi_token *tokens) + unsigned processor) { - /* Compute shader don't have tgsi_tokens */ - if (!tokens) - return (rscreen->debug_flags & DBG_CS) != 0; - - switch (tgsi_get_processor_type(tokens)) { + switch (processor) { case TGSI_PROCESSOR_VERTEX: return (rscreen->debug_flags & DBG_VS) != 0; case 
TGSI_PROCESSOR_TESS_CTRL: diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index c3933b1da98..68b50a9fb0f 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -440,6 +440,8 @@ struct r600_common_context { * the GPU addresses are updated. */ struct list_head texture_buffers; + struct pipe_debug_callback debug; + /* Copy one resource to another using async DMA. */ void (*dma_copy)(struct pipe_context *ctx, struct pipe_resource *dst, @@ -514,7 +516,7 @@ bool r600_common_context_init(struct r600_common_context *rctx, void r600_common_context_cleanup(struct r600_common_context *rctx); void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r); bool r600_can_dump_shader(struct r600_common_screen *rscreen, - const struct tgsi_token *tokens); + unsigned processor); void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst, unsigned offset, unsigned size, unsigned value, bool is_framebuffer); diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c index 61ed9402122..3d0987624a6 100644 --- a/src/gallium/drivers/radeon/radeon_llvm_emit.c +++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c @@ -23,12 +23,15 @@ * Authors: Tom Stellard <[email protected]> * */ + #include "radeon_llvm_emit.h" #include "radeon_elf_util.h" #include "c11/threads.h" #include "gallivm/lp_bld_misc.h" +#include "util/u_debug.h" #include "util/u_memory.h" #include "pipe/p_shader_tokens.h" +#include "pipe/p_state.h" #include <llvm-c/Target.h> #include <llvm-c/TargetMachine.h> @@ -123,16 +126,44 @@ LLVMTargetRef radeon_llvm_get_r600_target(const char *triple) return target; } +struct radeon_llvm_diagnostics { + struct pipe_debug_callback *debug; + unsigned retval; +}; + static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context) { - if 
(LLVMGetDiagInfoSeverity(di) == LLVMDSError) { - unsigned int *diagnosticflag = (unsigned int *)context; - char *diaginfo_message = LLVMGetDiagInfoDescription(di); + struct radeon_llvm_diagnostics *diag = (struct radeon_llvm_diagnostics *)context; + LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di); + char *description = LLVMGetDiagInfoDescription(di); + const char *severity_str = NULL; + + switch (severity) { + case LLVMDSError: + severity_str = "error"; + break; + case LLVMDSWarning: + severity_str = "warning"; + break; + case LLVMDSRemark: + severity_str = "remark"; + break; + case LLVMDSNote: + severity_str = "note"; + break; + default: + severity_str = "unknown"; + } + + pipe_debug_message(diag->debug, SHADER_INFO, + "LLVM diagnostic (%s): %s", severity_str, description); - *diagnosticflag = 1; - fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", diaginfo_message); - LLVMDisposeMessage(diaginfo_message); + if (severity == LLVMDSError) { + diag->retval = 1; + fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", description); } + + LLVMDisposeMessage(description); } /** @@ -141,22 +172,25 @@ static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context) * @returns 0 for success, 1 for failure */ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary, - const char *gpu_family, bool dump_ir, bool dump_asm, - LLVMTargetMachineRef tm) + const char *gpu_family, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug) { - + struct radeon_llvm_diagnostics diag; char cpu[CPU_STRING_LEN]; char fs[FS_STRING_LEN]; char *err; bool dispose_tm = false; LLVMContextRef llvm_ctx; - unsigned rval = 0; LLVMMemoryBufferRef out_buffer; unsigned buffer_size; const char *buffer_data; char triple[TRIPLE_STRING_LEN]; LLVMBool mem_err; + diag.debug = debug; + diag.retval = 0; + if (!tm) { strncpy(triple, "r600--", TRIPLE_STRING_LEN); LLVMTargetRef target = radeon_llvm_get_r600_target(triple); @@ -165,20 +199,17 @@ 
unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar } strncpy(cpu, gpu_family, CPU_STRING_LEN); memset(fs, 0, sizeof(fs)); - if (dump_asm) - strncpy(fs, "+DumpCode", FS_STRING_LEN); + strncpy(fs, "+DumpCode", FS_STRING_LEN); tm = LLVMCreateTargetMachine(target, triple, cpu, fs, LLVMCodeGenLevelDefault, LLVMRelocDefault, LLVMCodeModelDefault); dispose_tm = true; } - if (dump_ir) - LLVMDumpModule(M); + /* Setup Diagnostic Handler*/ llvm_ctx = LLVMGetModuleContext(M); - LLVMContextSetDiagnosticHandler(llvm_ctx, radeonDiagnosticHandler, &rval); - rval = 0; + LLVMContextSetDiagnosticHandler(llvm_ctx, radeonDiagnosticHandler, &diag); /* Compile IR*/ mem_err = LLVMTargetMachineEmitToMemoryBuffer(tm, M, LLVMObjectFile, &err, @@ -187,15 +218,13 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar /* Process Errors/Warnings */ if (mem_err) { fprintf(stderr, "%s: %s", __FUNCTION__, err); + pipe_debug_message(debug, SHADER_INFO, + "LLVM emit error: %s", err); FREE(err); - rval = 1; + diag.retval = 1; goto out; } - if (0 != rval) { - fprintf(stderr, "%s: Processing Diag Flag\n", __FUNCTION__); - } - /* Extract Shader Code*/ buffer_size = LLVMGetBufferSize(out_buffer); buffer_data = LLVMGetBufferStart(out_buffer); @@ -209,5 +238,7 @@ out: if (dispose_tm) { LLVMDisposeTargetMachine(tm); } - return rval; + if (diag.retval != 0) + pipe_debug_message(debug, SHADER_INFO, "LLVM compile failed"); + return diag.retval; } diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.h b/src/gallium/drivers/radeon/radeon_llvm_emit.h index e20aed94c6b..45f05a9e0e1 100644 --- a/src/gallium/drivers/radeon/radeon_llvm_emit.h +++ b/src/gallium/drivers/radeon/radeon_llvm_emit.h @@ -31,6 +31,7 @@ #include <llvm-c/TargetMachine.h> #include <stdbool.h> +struct pipe_debug_callback; struct radeon_shader_binary; void radeon_llvm_shader_type(LLVMValueRef F, unsigned type); @@ -38,7 +39,8 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned 
type); LLVMTargetRef radeon_llvm_get_r600_target(const char *triple); unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary, - const char *gpu_family, bool dump_ir, bool dump_asm, - LLVMTargetMachineRef tm); + const char *gpu_family, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug); #endif /* RADEON_LLVM_EMIT_H */ diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c index 2de237b4716..105a1b2a878 100644 --- a/src/gallium/drivers/radeonsi/cik_sdma.c +++ b/src/gallium/drivers/radeonsi/cik_sdma.c @@ -196,7 +196,7 @@ static void cik_sdma_copy_tile(struct si_context *ctx, (tile_split << 11) | (mt << 8) | (array_mode << 3) | lbpe; cs->buf[cs->cdw++] = y << 16; /* | x */ - cs->buf[cs->cdw++] = 0; /* z */; + cs->buf[cs->cdw++] = 0; /* z */ cs->buf[cs->cdw++] = addr & 0xfffffffc; cs->buf[cs->cdw++] = addr >> 32; cs->buf[cs->cdw++] = (pitch / bpe) - 1; diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 47a74eea0e0..5a08cbfb198 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -67,9 +67,9 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog program->shader.binary.global_symbol_offsets[i]; unsigned scratch_bytes_needed; - si_shader_binary_read_config(sctx->screen, - &program->shader, offset); - scratch_bytes_needed = program->shader.scratch_bytes_per_wave; + si_shader_binary_read_config(&program->shader.binary, + &program->shader.config, offset); + scratch_bytes_needed = program->shader.config.scratch_bytes_per_wave; scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed); } @@ -87,7 +87,7 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog * to the maximum bytes needed, so it can compute the stride * correctly. 
*/ - program->shader.scratch_bytes_per_wave = scratch_bytes; + program->shader.config.scratch_bytes_per_wave = scratch_bytes; /* Patch the shader with the scratch buffer address. */ si_shader_apply_scratch_relocs(sctx, @@ -122,8 +122,12 @@ static void *si_create_compute_state( for (i = 0; i < program->num_kernels; i++) { LLVMModuleRef mod = radeon_llvm_get_kernel_module(program->llvm_ctx, i, code, header->num_bytes); - si_compile_llvm(sctx->screen, &program->kernels[i], sctx->tm, - mod); + si_compile_llvm(sctx->screen, &program->kernels[i].binary, + &program->kernels[i].config, sctx->tm, + mod, &sctx->b.debug, TGSI_PROCESSOR_COMPUTE); + si_shader_dump(sctx->screen, &program->kernels[i], + &sctx->b.debug, TGSI_PROCESSOR_COMPUTE); + si_shader_binary_upload(sctx->screen, &program->kernels[i]); LLVMDisposeModule(mod); } } @@ -136,7 +140,11 @@ static void *si_create_compute_state( * the shader code to the GPU. */ init_scratch_buffer(sctx, program); - si_shader_binary_read(sctx->screen, &program->shader); + si_shader_binary_read_config(&program->shader.binary, + &program->shader.config, 0); + si_shader_dump(sctx->screen, &program->shader, &sctx->b.debug, + TGSI_PROCESSOR_COMPUTE); + si_shader_binary_upload(sctx->screen, &program->shader); #endif program->input_buffer = si_resource_create_custom(sctx->b.b.screen, @@ -259,7 +267,7 @@ static void si_launch_grid( #if HAVE_LLVM >= 0x0306 /* Read the config information */ - si_shader_binary_read_config(sctx->screen, shader, pc); + si_shader_binary_read_config(&shader->binary, &shader->config, pc); #endif /* Upload the kernel arguments */ @@ -280,12 +288,12 @@ static void si_launch_grid( memcpy(kernel_args + (num_work_size_bytes / 4), input, program->input_size); - if (shader->scratch_bytes_per_wave > 0) { + if (shader->config.scratch_bytes_per_wave > 0) { COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; " "Total Scratch: %u bytes\n", num_waves_for_scratch, - shader->scratch_bytes_per_wave, - 
shader->scratch_bytes_per_wave * + shader->config.scratch_bytes_per_wave, + shader->config.scratch_bytes_per_wave * num_waves_for_scratch); radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, @@ -312,7 +320,7 @@ static void si_launch_grid( si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 8, scratch_buffer_va); si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 12, S_008F04_BASE_ADDRESS_HI(scratch_buffer_va >> 32) - | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64)); + | S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64)); si_pm4_set_reg(pm4, R_00B810_COMPUTE_START_X, 0); si_pm4_set_reg(pm4, R_00B814_COMPUTE_START_Y, 0); @@ -360,9 +368,9 @@ static void si_launch_grid( si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8); si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40); - si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->rsrc1); + si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->config.rsrc1); - lds_blocks = shader->lds_size; + lds_blocks = shader->config.lds_size; /* XXX: We are over allocating LDS. For SI, the shader reports LDS in * blocks of 256 bytes, so if there are 4 bytes lds allocated in * the shader and 4 bytes allocated by the state tracker, then @@ -376,10 +384,10 @@ static void si_launch_grid( assert(lds_blocks <= 0xFF); - shader->rsrc2 &= C_00B84C_LDS_SIZE; - shader->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks); + shader->config.rsrc2 &= C_00B84C_LDS_SIZE; + shader->config.rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks); - si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->rsrc2); + si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->config.rsrc2); si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0); si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, @@ -401,7 +409,7 @@ static void si_launch_grid( * COMPUTE_PGM_RSRC2.SCRATCH_EN is enabled. 
*/ S_00B860_WAVES(num_waves_for_scratch) - | S_00B860_WAVESIZE(shader->scratch_bytes_per_wave >> 10)) + | S_00B860_WAVESIZE(shader->config.scratch_bytes_per_wave >> 10)) ; si_pm4_cmd_begin(pm4, PKT3_DISPATCH_DIRECT); diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index b3719dea252..d157a9ffb00 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -109,7 +109,7 @@ static bool si_upload_descriptors(struct si_context *sctx, if (!desc->list_dirty) return true; - u_upload_alloc(sctx->b.uploader, 0, list_size, + u_upload_alloc(sctx->b.uploader, 0, list_size, 256, &desc->buffer_offset, (struct pipe_resource**)&desc->buffer, &ptr); if (!desc->buffer) @@ -391,7 +391,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) * directly through a staging buffer and don't go through * the fine-grained upload path. */ - u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset, + u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, &desc->buffer_offset, (struct pipe_resource**)&desc->buffer, (void**)&ptr); if (!desc->buffer) return false; @@ -465,7 +465,7 @@ void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuf { void *tmp; - u_upload_alloc(sctx->b.uploader, 0, size, const_offset, + u_upload_alloc(sctx->b.uploader, 0, size, 256, const_offset, (struct pipe_resource**)rbuffer, &tmp); if (rbuffer) util_memcpy_cpu_to_le32(tmp, ptr, size); @@ -1011,19 +1011,19 @@ void si_init_all_descriptors(struct si_context *sctx) for (i = 0; i < SI_NUM_SHADERS; i++) { si_init_buffer_resources(&sctx->const_buffers[i], - SI_NUM_CONST_BUFFERS, SI_SGPR_CONST, + SI_NUM_CONST_BUFFERS, SI_SGPR_CONST_BUFFERS, RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); si_init_buffer_resources(&sctx->rw_buffers[i], SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT); 
si_init_descriptors(&sctx->samplers[i].views.desc, - SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS); + SI_SGPR_SAMPLER_VIEWS, 8, SI_NUM_SAMPLER_VIEWS); si_init_descriptors(&sctx->samplers[i].states.desc, - SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES); + SI_SGPR_SAMPLER_STATES, 4, SI_NUM_SAMPLER_STATES); } - si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER, + si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS, 4, SI_NUM_VERTEX_BUFFERS); /* Set pipe_context functions. */ diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index ac13407e2a1..c2ca94339ac 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -340,6 +340,13 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_GATHER_OFFSETS: case PIPE_CAP_VERTEXID_NOBASE: case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 0; case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: @@ -512,6 +519,8 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu return 1; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: + return 0; } return 0; } diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 0e98784d51b..1db3e484915 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -82,11 +82,11 @@ struct si_shader_context int param_es2gs_offset; LLVMTargetMachineRef tm; LLVMValueRef const_md; - LLVMValueRef const_resource[SI_NUM_CONST_BUFFERS]; + LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS]; LLVMValueRef lds; LLVMValueRef 
*constants[SI_NUM_CONST_BUFFERS]; - LLVMValueRef resources[SI_NUM_SAMPLER_VIEWS]; - LLVMValueRef samplers[SI_NUM_SAMPLER_STATES]; + LLVMValueRef sampler_views[SI_NUM_SAMPLER_VIEWS]; + LLVMValueRef sampler_states[SI_NUM_SAMPLER_STATES]; LLVMValueRef so_buffers[4]; LLVMValueRef esgs_ring; LLVMValueRef gsvs_ring[4]; @@ -394,7 +394,7 @@ static void declare_input_vs( LLVMValueRef input; /* Load the T list */ - t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFER); + t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS); t_offset = lp_build_const_int32(gallivm, input_index); @@ -1065,7 +1065,7 @@ static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld; struct gallivm_state *gallivm = &radeon_bld->gallivm; LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST); + LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS); LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF); LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index); @@ -1233,13 +1233,13 @@ static LLVMValueRef fetch_constant( } if (reg->Register.Dimension && reg->Dimension.Indirect) { - LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST); + LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS); LLVMValueRef index; index = get_indirect_index(si_shader_ctx, &reg->DimIndirect, reg->Dimension.Index); bufp = build_indexed_load_const(si_shader_ctx, ptr, index); } else - bufp = si_shader_ctx->const_resource[buf]; + bufp = si_shader_ctx->const_buffers[buf]; addr = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle]; addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg"); @@ -1260,7 +1260,7 @@ static LLVMValueRef 
fetch_constant( addr2 = lp_build_add(&bld_base->uint_bld, addr2, lp_build_const_int32(base->gallivm, idx * 4)); - result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf], + result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_buffers[buf], addr2, bld_base->base.elem_type); result = radeon_llvm_emit_fetch_double(bld_base, @@ -1302,18 +1302,8 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) { int cbuf = target - V_008DFC_SQ_EXP_MRT; - if (cbuf >= 0 && cbuf < 8) { + if (cbuf >= 0 && cbuf < 8) compressed = (si_shader_ctx->shader->key.ps.export_16bpc >> cbuf) & 0x1; - - if (compressed) - si_shader_ctx->shader->spi_shader_col_format |= - V_028714_SPI_SHADER_FP16_ABGR << (4 * cbuf); - else - si_shader_ctx->shader->spi_shader_col_format |= - V_028714_SPI_SHADER_32_ABGR << (4 * cbuf); - - si_shader_ctx->shader->cb_shader_mask |= 0xf << (4 * cbuf); - } } /* Set COMPR flag */ @@ -1333,34 +1323,19 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, LLVMInt32TypeInContext(base->gallivm->context), pack_args, 2, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); - args[chan + 7] = args[chan + 5] = + args[chan + 5] = LLVMBuildBitCast(base->gallivm->builder, packed, LLVMFloatTypeInContext(base->gallivm->context), ""); + args[chan + 7] = base->undef; } } else memcpy(&args[5], values, sizeof(values[0]) * 4); } -/* Load from output pointers and initialize arguments for the shader export intrinsic */ -static void si_llvm_init_export_args_load(struct lp_build_tgsi_context *bld_base, - LLVMValueRef *out_ptr, - unsigned target, - LLVMValueRef *args) -{ - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMValueRef values[4]; - int i; - - for (i = 0; i < 4; i++) - values[i] = LLVMBuildLoad(gallivm->builder, out_ptr[i], ""); - - si_llvm_init_export_args(bld_base, values, target, args); -} - static void si_alpha_test(struct 
lp_build_tgsi_context *bld_base, - LLVMValueRef alpha_ptr) + LLVMValueRef alpha) { struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; @@ -1372,8 +1347,7 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base, LLVMValueRef alpha_pass = lp_build_cmp(&bld_base->base, si_shader_ctx->shader->key.ps.alpha_func, - LLVMBuildLoad(gallivm->builder, alpha_ptr, ""), - alpha_ref); + alpha, alpha_ref); LLVMValueRef arg = lp_build_select(&bld_base->base, alpha_pass, @@ -1390,16 +1364,14 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base, LLVMVoidTypeInContext(gallivm->context), NULL, 0, 0); } - - si_shader_ctx->shader->db_shader_control |= S_02880C_KILL_ENABLE(1); } -static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base, - LLVMValueRef alpha_ptr) +static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base, + LLVMValueRef alpha) { struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMValueRef coverage, alpha; + LLVMValueRef coverage; /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ coverage = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, @@ -1417,9 +1389,7 @@ static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base lp_build_const_float(gallivm, 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), ""); - alpha = LLVMBuildLoad(gallivm->builder, alpha_ptr, ""); - alpha = LLVMBuildFMul(gallivm->builder, alpha, coverage, ""); - LLVMBuildStore(gallivm->builder, alpha, alpha_ptr); + return LLVMBuildFMul(gallivm->builder, alpha, coverage, ""); } static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base, @@ -1432,7 +1402,7 @@ static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base, unsigned chan; unsigned const_chan; LLVMValueRef base_elt; - LLVMValueRef ptr = 
LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST); + LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS); LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm, SI_DRIVER_STATE_CONST_BUF); LLVMValueRef const_resource = build_indexed_load_const(si_shader_ctx, ptr, constbuf_index); @@ -2112,197 +2082,193 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base) FREE(outputs); } -static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) +static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base, + LLVMValueRef depth, LLVMValueRef stencil, + LLVMValueRef samplemask) { - struct si_shader_context * si_shader_ctx = si_shader_context(bld_base); - struct si_shader * shader = si_shader_ctx->shader; - struct lp_build_context * base = &bld_base->base; - struct lp_build_context * uint = &bld_base->uint_bld; - struct tgsi_shader_info *info = &shader->selector->info; - LLVMBuilderRef builder = base->gallivm->builder; + struct si_screen *sscreen = si_shader_context(bld_base)->screen; + struct lp_build_context *base = &bld_base->base; + struct lp_build_context *uint = &bld_base->uint_bld; LLVMValueRef args[9]; - LLVMValueRef last_args[9] = { 0 }; - int depth_index = -1, stencil_index = -1, samplemask_index = -1; - int i; + unsigned mask = 0; - for (i = 0; i < info->num_outputs; i++) { - unsigned semantic_name = info->output_semantic_name[i]; - unsigned semantic_index = info->output_semantic_index[i]; - unsigned target; - LLVMValueRef alpha_ptr; + assert(depth || stencil || samplemask); - /* Select the correct target */ - switch (semantic_name) { - case TGSI_SEMANTIC_POSITION: - depth_index = i; - continue; - case TGSI_SEMANTIC_STENCIL: - stencil_index = i; - continue; - case TGSI_SEMANTIC_SAMPLEMASK: - samplemask_index = i; - continue; - case TGSI_SEMANTIC_COLOR: - target = V_008DFC_SQ_EXP_MRT + semantic_index; - alpha_ptr = 
si_shader_ctx->radeon_bld.soa.outputs[i][3]; + args[1] = uint->one; /* whether the EXEC mask is valid */ + args[2] = uint->one; /* DONE bit */ - if (si_shader_ctx->shader->key.ps.clamp_color) { - for (int j = 0; j < 4; j++) { - LLVMValueRef ptr = si_shader_ctx->radeon_bld.soa.outputs[i][j]; - LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); + /* Specify the target we are exporting */ + args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ); - result = radeon_llvm_saturate(bld_base, result); - LLVMBuildStore(builder, result, ptr); - } - } + args[4] = uint->zero; /* COMP flag */ + args[5] = base->undef; /* R, depth */ + args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */ + args[7] = base->undef; /* B, sample mask */ + args[8] = base->undef; /* A, alpha to mask */ - if (si_shader_ctx->shader->key.ps.alpha_to_one) - LLVMBuildStore(base->gallivm->builder, - base->one, alpha_ptr); + if (depth) { + args[5] = depth; + mask |= 0x1; + } - if (semantic_index == 0 && - si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS) - si_alpha_test(bld_base, alpha_ptr); + if (stencil) { + args[6] = stencil; + mask |= 0x2; + } - if (si_shader_ctx->shader->key.ps.poly_line_smoothing) - si_scale_alpha_by_sample_mask(bld_base, alpha_ptr); + if (samplemask) { + args[7] = samplemask; + mask |= 0x4; + } - break; - default: - target = 0; - fprintf(stderr, - "Warning: SI unhandled fs output type:%d\n", - semantic_name); - } + /* SI (except OLAND) has a bug that it only looks + * at the X writemask component. */ + if (sscreen->b.chip_class == SI && + sscreen->b.family != CHIP_OLAND) + mask |= 0x1; - si_llvm_init_export_args_load(bld_base, - si_shader_ctx->radeon_bld.soa.outputs[i], - target, args); - - if (semantic_name == TGSI_SEMANTIC_COLOR) { - /* If there is an export instruction waiting to be emitted, do so now. 
*/ - if (last_args[0]) { - lp_build_intrinsic(base->gallivm->builder, - "llvm.SI.export", - LLVMVoidTypeInContext(base->gallivm->context), - last_args, 9, 0); - } + /* Specify which components to enable */ + args[0] = lp_build_const_int32(base->gallivm, mask); - /* This instruction will be emitted at the end of the shader. */ - memcpy(last_args, args, sizeof(args)); - - /* Handle FS_COLOR0_WRITES_ALL_CBUFS. */ - if (shader->selector->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && - semantic_index == 0 && - si_shader_ctx->shader->key.ps.last_cbuf > 0) { - for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) { - si_llvm_init_export_args_load(bld_base, - si_shader_ctx->radeon_bld.soa.outputs[i], - V_008DFC_SQ_EXP_MRT + c, args); - lp_build_intrinsic(base->gallivm->builder, - "llvm.SI.export", - LLVMVoidTypeInContext(base->gallivm->context), - args, 9, 0); - } - } - } else { - lp_build_intrinsic(base->gallivm->builder, - "llvm.SI.export", + lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", + LLVMVoidTypeInContext(base->gallivm->context), + args, 9, 0); +} + +static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, + LLVMValueRef *color, unsigned index, + bool is_last) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct lp_build_context *base = &bld_base->base; + LLVMValueRef args[9]; + int i; + + /* Clamp color */ + if (si_shader_ctx->shader->key.ps.clamp_color) + for (i = 0; i < 4; i++) + color[i] = radeon_llvm_saturate(bld_base, color[i]); + + /* Alpha to one */ + if (si_shader_ctx->shader->key.ps.alpha_to_one) + color[3] = base->one; + + /* Alpha test */ + if (index == 0 && + si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS) + si_alpha_test(bld_base, color[3]); + + /* Line & polygon smoothing */ + if (si_shader_ctx->shader->key.ps.poly_line_smoothing) + color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]); + + /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is 
true. */ + if (index == 0 && + si_shader_ctx->shader->key.ps.last_cbuf > 0) { + for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) { + si_llvm_init_export_args(bld_base, color, + V_008DFC_SQ_EXP_MRT + c, args); + lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), args, 9, 0); } } - if (depth_index >= 0 || stencil_index >= 0 || samplemask_index >= 0) { - LLVMValueRef out_ptr; - unsigned mask = 0; - - /* Specify the target we are exporting */ - args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ); - - args[5] = base->zero; /* R, depth */ - args[6] = base->zero; /* G, stencil test value[0:7], stencil op value[8:15] */ - args[7] = base->zero; /* B, sample mask */ - args[8] = base->zero; /* A, alpha to mask */ - - if (depth_index >= 0) { - out_ptr = si_shader_ctx->radeon_bld.soa.outputs[depth_index][2]; - args[5] = LLVMBuildLoad(base->gallivm->builder, out_ptr, ""); - mask |= 0x1; - si_shader_ctx->shader->db_shader_control |= S_02880C_Z_EXPORT_ENABLE(1); - } - - if (stencil_index >= 0) { - out_ptr = si_shader_ctx->radeon_bld.soa.outputs[stencil_index][1]; - args[6] = LLVMBuildLoad(base->gallivm->builder, out_ptr, ""); - mask |= 0x2; - si_shader_ctx->shader->db_shader_control |= - S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(1); - } - - if (samplemask_index >= 0) { - out_ptr = si_shader_ctx->radeon_bld.soa.outputs[samplemask_index][0]; - args[7] = LLVMBuildLoad(base->gallivm->builder, out_ptr, ""); - mask |= 0x4; - si_shader_ctx->shader->db_shader_control |= S_02880C_MASK_EXPORT_ENABLE(1); - } - - /* SI (except OLAND) has a bug that it only looks - * at the X writemask component. 
*/ - if (si_shader_ctx->screen->b.chip_class == SI && - si_shader_ctx->screen->b.family != CHIP_OLAND) - mask |= 0x1; + /* Export */ + si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index, + args); + if (is_last) { + args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */ + args[2] = bld_base->uint_bld.one; /* DONE bit */ + } + lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", + LLVMVoidTypeInContext(base->gallivm->context), + args, 9, 0); +} - if (samplemask_index >= 0) - si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_ABGR; - else if (stencil_index >= 0) - si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_GR; - else - si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_R; +static void si_export_null(struct lp_build_tgsi_context *bld_base) +{ + struct lp_build_context *base = &bld_base->base; + struct lp_build_context *uint = &bld_base->uint_bld; + LLVMValueRef args[9]; - /* Specify which components to enable */ - args[0] = lp_build_const_int32(base->gallivm, mask); + args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */ + args[1] = uint->one; /* whether the EXEC mask is valid */ + args[2] = uint->one; /* DONE bit */ + args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL); + args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */ + args[5] = uint->undef; /* R */ + args[6] = uint->undef; /* G */ + args[7] = uint->undef; /* B */ + args[8] = uint->undef; /* A */ + + lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", + LLVMVoidTypeInContext(base->gallivm->context), + args, 9, 0); +} - args[1] = - args[2] = - args[4] = uint->zero; +static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) +{ + struct si_shader_context * si_shader_ctx = si_shader_context(bld_base); + struct si_shader * shader = si_shader_ctx->shader; + struct lp_build_context * base = &bld_base->base; + struct tgsi_shader_info *info = 
&shader->selector->info; + LLVMBuilderRef builder = base->gallivm->builder; + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + int last_color_export = -1; + int i; - if (last_args[0]) - lp_build_intrinsic(base->gallivm->builder, - "llvm.SI.export", - LLVMVoidTypeInContext(base->gallivm->context), - args, 9, 0); - else - memcpy(last_args, args, sizeof(args)); + /* If there are no outputs, add a dummy export. */ + if (!info->num_outputs) { + si_export_null(bld_base); + return; } - if (!last_args[0]) { - /* Specify which components to enable */ - last_args[0] = lp_build_const_int32(base->gallivm, 0x0); + /* Determine the last export. If MRTZ is present, it's always last. + * Otherwise, find the last color export. + */ + if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) + for (i = 0; i < info->num_outputs; i++) + if (info->output_semantic_name[i] == TGSI_SEMANTIC_COLOR) + last_color_export = i; - /* Specify the target we are exporting */ - last_args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRT); + for (i = 0; i < info->num_outputs; i++) { + unsigned semantic_name = info->output_semantic_name[i]; + unsigned semantic_index = info->output_semantic_index[i]; + unsigned j; + LLVMValueRef color[4] = {}; - /* Set COMPR flag to zero to export data as 32-bit */ - last_args[4] = uint->zero; + /* Select the correct target */ + switch (semantic_name) { + case TGSI_SEMANTIC_POSITION: + depth = LLVMBuildLoad(builder, + si_shader_ctx->radeon_bld.soa.outputs[i][2], ""); + break; + case TGSI_SEMANTIC_STENCIL: + stencil = LLVMBuildLoad(builder, + si_shader_ctx->radeon_bld.soa.outputs[i][1], ""); + break; + case TGSI_SEMANTIC_SAMPLEMASK: + samplemask = LLVMBuildLoad(builder, + si_shader_ctx->radeon_bld.soa.outputs[i][0], ""); + break; + case TGSI_SEMANTIC_COLOR: + for (j = 0; j < 4; j++) + color[j] = LLVMBuildLoad(builder, + si_shader_ctx->radeon_bld.soa.outputs[i][j], ""); - /* dummy bits */ - last_args[5]= uint->zero; - 
last_args[6]= uint->zero; - last_args[7]= uint->zero; - last_args[8]= uint->zero; + si_export_mrt_color(bld_base, color, semantic_index, + last_color_export == i); + break; + default: + fprintf(stderr, + "Warning: SI unhandled fs output type:%d\n", + semantic_name); + } } - /* Specify whether the EXEC mask represents the valid mask */ - last_args[1] = uint->one; - - /* Specify that this is the last export */ - last_args[2] = lp_build_const_int32(base->gallivm, 1); - - lp_build_intrinsic(base->gallivm->builder, - "llvm.SI.export", - LLVMVoidTypeInContext(base->gallivm->context), - last_args, 9, 0); + if (depth || stencil || samplemask) + si_export_mrt_z(bld_base, depth, stencil, samplemask); } static void build_tex_intrinsic(const struct lp_build_tgsi_action * action, @@ -2390,10 +2356,10 @@ static void tex_fetch_ptrs( ind_index = get_indirect_index(si_shader_ctx, &reg->Indirect, reg->Register.Index); - *res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE); + *res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS); *res_ptr = build_indexed_load_const(si_shader_ctx, *res_ptr, ind_index); - *samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER); + *samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_STATES); *samp_ptr = build_indexed_load_const(si_shader_ctx, *samp_ptr, ind_index); if (target == TGSI_TEXTURE_2D_MSAA || @@ -2401,13 +2367,13 @@ static void tex_fetch_ptrs( ind_index = LLVMBuildAdd(gallivm->builder, ind_index, lp_build_const_int32(gallivm, SI_FMASK_TEX_OFFSET), ""); - *fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE); + *fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS); *fmask_ptr = build_indexed_load_const(si_shader_ctx, *fmask_ptr, ind_index); } } else { - *res_ptr = si_shader_ctx->resources[sampler_index]; - *samp_ptr = si_shader_ctx->samplers[sampler_index]; - *fmask_ptr = 
si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index]; + *res_ptr = si_shader_ctx->sampler_views[sampler_index]; + *samp_ptr = si_shader_ctx->sampler_states[sampler_index]; + *fmask_ptr = si_shader_ctx->sampler_views[SI_FMASK_TEX_OFFSET + sampler_index]; } } @@ -2487,7 +2453,7 @@ static void tex_fetch_args( emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); emit_data->args[0] = res; emit_data->args[1] = bld_base->uint_bld.zero; - emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0); + emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X); emit_data->arg_count = 3; return; } @@ -2536,12 +2502,12 @@ static void tex_fetch_args( if (opcode == TGSI_OPCODE_TXB) address[count++] = coords[3]; if (opcode == TGSI_OPCODE_TXB2) - address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0); + address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X); /* Pack depth comparison value */ if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) { if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { - address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0); + address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X); } else { assert(ref_pos >= 0); address[count++] = coords[ref_pos]; @@ -2612,7 +2578,7 @@ static void tex_fetch_args( if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF) address[count++] = coords[3]; else if (opcode == TGSI_OPCODE_TXL2) - address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0); + address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X); if (count > 16) { assert(!"Cannot handle more than 16 texture address parameters"); @@ -3105,10 +3071,10 @@ static void interp_fetch_args( /* offset is in second src, first two channels */ emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst, 1, - 0); + TGSI_CHAN_X); emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst, 1, - 1); + TGSI_CHAN_Y); 
emit_data->arg_count = 2; } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { LLVMValueRef sample_position; @@ -3119,7 +3085,7 @@ static void interp_fetch_args( * and place into first two channels. */ sample_id = lp_build_emit_fetch(bld_base, - emit_data->inst, 1, 0); + emit_data->inst, 1, TGSI_CHAN_X); sample_id = LLVMBuildBitCast(gallivm->builder, sample_id, LLVMInt32TypeInContext(gallivm->context), ""); @@ -3432,15 +3398,15 @@ static void create_function(struct si_shader_context *si_shader_ctx) v16i8 = LLVMVectorType(i8, 16); params[SI_PARAM_RW_BUFFERS] = const_array(v16i8, SI_NUM_RW_BUFFERS); - params[SI_PARAM_CONST] = const_array(v16i8, SI_NUM_CONST_BUFFERS); - params[SI_PARAM_SAMPLER] = const_array(v4i32, SI_NUM_SAMPLER_STATES); - params[SI_PARAM_RESOURCE] = const_array(v8i32, SI_NUM_SAMPLER_VIEWS); - last_array_pointer = SI_PARAM_RESOURCE; + params[SI_PARAM_CONST_BUFFERS] = const_array(v16i8, SI_NUM_CONST_BUFFERS); + params[SI_PARAM_SAMPLER_STATES] = const_array(v4i32, SI_NUM_SAMPLER_STATES); + params[SI_PARAM_SAMPLER_VIEWS] = const_array(v8i32, SI_NUM_SAMPLER_VIEWS); + last_array_pointer = SI_PARAM_SAMPLER_VIEWS; switch (si_shader_ctx->type) { case TGSI_PROCESSOR_VERTEX: - params[SI_PARAM_VERTEX_BUFFER] = const_array(v16i8, SI_NUM_VERTEX_BUFFERS); - last_array_pointer = SI_PARAM_VERTEX_BUFFER; + params[SI_PARAM_VERTEX_BUFFERS] = const_array(v16i8, SI_NUM_VERTEX_BUFFERS); + last_array_pointer = SI_PARAM_VERTEX_BUFFERS; params[SI_PARAM_BASE_VERTEX] = i32; params[SI_PARAM_START_INSTANCE] = i32; num_params = SI_PARAM_START_INSTANCE+1; @@ -3452,8 +3418,8 @@ static void create_function(struct si_shader_context *si_shader_ctx) num_params = SI_PARAM_LS_OUT_LAYOUT+1; } else { if (shader->is_gs_copy_shader) { - last_array_pointer = SI_PARAM_CONST; - num_params = SI_PARAM_CONST+1; + last_array_pointer = SI_PARAM_CONST_BUFFERS; + num_params = SI_PARAM_CONST_BUFFERS+1; } else { params[SI_PARAM_VS_STATE_BITS] = i32; num_params = SI_PARAM_VS_STATE_BITS+1; 
@@ -3610,7 +3576,7 @@ static void preload_constants(struct si_shader_context *si_shader_ctx) struct gallivm_state * gallivm = bld_base->base.gallivm; const struct tgsi_shader_info * info = bld_base->info; unsigned buf; - LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST); + LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS); for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) { unsigned i, num_const = info->const_file_max[buf] + 1; @@ -3622,14 +3588,14 @@ static void preload_constants(struct si_shader_context *si_shader_ctx) si_shader_ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef)); /* Load the resource descriptor */ - si_shader_ctx->const_resource[buf] = + si_shader_ctx->const_buffers[buf] = build_indexed_load_const(si_shader_ctx, ptr, lp_build_const_int32(gallivm, buf)); /* Load the constants, we rely on the code sinking to do the rest */ for (i = 0; i < num_const * 4; ++i) { si_shader_ctx->constants[buf][i] = buffer_load_const(gallivm->builder, - si_shader_ctx->const_resource[buf], + si_shader_ctx->const_buffers[buf], lp_build_const_int32(gallivm, i * 4), bld_base->base.elem_type); } @@ -3650,23 +3616,23 @@ static void preload_samplers(struct si_shader_context *si_shader_ctx) if (num_samplers == 0) return; - res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE); - samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER); + res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS); + samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_STATES); /* Load the resources and samplers, we rely on the code sinking to do the rest */ for (i = 0; i < num_samplers; ++i) { /* Resource */ offset = lp_build_const_int32(gallivm, i); - si_shader_ctx->resources[i] = build_indexed_load_const(si_shader_ctx, res_ptr, offset); + si_shader_ctx->sampler_views[i] = build_indexed_load_const(si_shader_ctx, res_ptr, 
offset); /* Sampler */ offset = lp_build_const_int32(gallivm, i); - si_shader_ctx->samplers[i] = build_indexed_load_const(si_shader_ctx, samp_ptr, offset); + si_shader_ctx->sampler_states[i] = build_indexed_load_const(si_shader_ctx, samp_ptr, offset); /* FMASK resource */ if (info->is_msaa_sampler[i]) { offset = lp_build_const_int32(gallivm, SI_FMASK_TEX_OFFSET + i); - si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + i] = + si_shader_ctx->sampler_views[SI_FMASK_TEX_OFFSET + i] = build_indexed_load_const(si_shader_ctx, res_ptr, offset); } } @@ -3741,20 +3707,19 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx) } } -void si_shader_binary_read_config(const struct si_screen *sscreen, - struct si_shader *shader, - unsigned symbol_offset) +void si_shader_binary_read_config(struct radeon_shader_binary *binary, + struct si_shader_config *conf, + unsigned symbol_offset) { unsigned i; const unsigned char *config = - radeon_shader_binary_config_start(&shader->binary, - symbol_offset); + radeon_shader_binary_config_start(binary, symbol_offset); /* XXX: We may be able to emit some of these values directly rather than * extracting fields to be emitted later. 
*/ - for (i = 0; i < shader->binary.config_size_per_symbol; i+= 8) { + for (i = 0; i < binary->config_size_per_symbol; i+= 8) { unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i)); unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4)); switch (reg) { @@ -3762,25 +3727,25 @@ void si_shader_binary_read_config(const struct si_screen *sscreen, case R_00B128_SPI_SHADER_PGM_RSRC1_VS: case R_00B228_SPI_SHADER_PGM_RSRC1_GS: case R_00B848_COMPUTE_PGM_RSRC1: - shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); - shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); - shader->float_mode = G_00B028_FLOAT_MODE(value); - shader->rsrc1 = value; + conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); + conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); + conf->float_mode = G_00B028_FLOAT_MODE(value); + conf->rsrc1 = value; break; case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: - shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); + conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); break; case R_00B84C_COMPUTE_PGM_RSRC2: - shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value)); - shader->rsrc2 = value; + conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value)); + conf->rsrc2 = value; break; case R_0286CC_SPI_PS_INPUT_ENA: - shader->spi_ps_input_ena = value; + conf->spi_ps_input_ena = value; break; case R_0286E8_SPI_TMPRING_SIZE: case R_00B860_COMPUTE_TMPRING_SIZE: /* WAVESIZE is in units of 256 dwords. 
*/ - shader->scratch_bytes_per_wave = + conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(value) * 256 * 4 * 1; break; default: @@ -3799,7 +3764,7 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx, uint32_t scratch_rsrc_dword0 = scratch_va; uint32_t scratch_rsrc_dword1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) - | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64); + | S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64); for (i = 0 ; i < shader->binary.reloc_count; i++) { const struct radeon_shader_reloc *reloc = @@ -3840,80 +3805,124 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) return 0; } -int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader) +static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary, + struct pipe_debug_callback *debug) { - const struct radeon_shader_binary *binary = &shader->binary; - unsigned i; - int r; - bool dump = r600_can_dump_shader(&sscreen->b, - shader->selector ? shader->selector->tokens : NULL); + char *line, *p; + unsigned i, count; + + if (binary->disasm_string) { + fprintf(stderr, "\nShader Disassembly:\n\n"); + fprintf(stderr, "%s\n", binary->disasm_string); + + if (debug && debug->debug_message) { + /* Very long debug messages are cut off, so send the + * disassembly one line at a time. This causes more + * overhead, but on the plus side it simplifies + * parsing of resulting logs. 
+ */ + pipe_debug_message(debug, SHADER_INFO, + "Shader Disassembly Begin"); - si_shader_binary_read_config(sscreen, shader, 0); - r = si_shader_binary_upload(sscreen, shader); - if (r) - return r; - - if (dump) { - if (!(sscreen->b.debug_flags & DBG_NO_ASM)) { - if (binary->disasm_string) { - fprintf(stderr, "\nShader Disassembly:\n\n"); - fprintf(stderr, "%s\n", binary->disasm_string); - } else { - fprintf(stderr, "SI CODE:\n"); - for (i = 0; i < binary->code_size; i+=4 ) { - fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3], - binary->code[i + 2], binary->code[i + 1], - binary->code[i]); + line = binary->disasm_string; + while (*line) { + p = strchrnul(line, '\n'); + count = p - line; + + if (count) { + pipe_debug_message(debug, SHADER_INFO, + "%.*s", count, line); } + + if (!*p) + break; + line = p + 1; } + + pipe_debug_message(debug, SHADER_INFO, + "Shader Disassembly End"); + } + } else { + fprintf(stderr, "SI CODE:\n"); + for (i = 0; i < binary->code_size; i += 4) { + fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, + binary->code[i + 3], binary->code[i + 2], + binary->code[i + 1], binary->code[i]); } + } +} +static void si_shader_dump_stats(struct si_screen *sscreen, + struct si_shader_config *conf, + unsigned code_size, + struct pipe_debug_callback *debug, + unsigned processor) +{ + if (r600_can_dump_shader(&sscreen->b, processor)) { fprintf(stderr, "*** SHADER STATS ***\n" "SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n" "Scratch: %d bytes per wave\n********************\n", - shader->num_sgprs, shader->num_vgprs, binary->code_size, - shader->lds_size, shader->scratch_bytes_per_wave); + conf->num_sgprs, conf->num_vgprs, code_size, + conf->lds_size, conf->scratch_bytes_per_wave); } - return 0; + + pipe_debug_message(debug, SHADER_INFO, + "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d", + conf->num_sgprs, conf->num_vgprs, code_size, + conf->lds_size, conf->scratch_bytes_per_wave); +} + +void 
si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, + struct pipe_debug_callback *debug, unsigned processor) +{ + if (r600_can_dump_shader(&sscreen->b, processor)) + if (!(sscreen->b.debug_flags & DBG_NO_ASM)) + si_shader_dump_disassembly(&shader->binary, debug); + + si_shader_dump_stats(sscreen, &shader->config, + shader->binary.code_size, debug, processor); } -int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, - LLVMTargetMachineRef tm, LLVMModuleRef mod) +int si_compile_llvm(struct si_screen *sscreen, + struct radeon_shader_binary *binary, + struct si_shader_config *conf, + LLVMTargetMachineRef tm, + LLVMModuleRef mod, + struct pipe_debug_callback *debug, + unsigned processor) { int r = 0; - bool dump_asm = r600_can_dump_shader(&sscreen->b, - shader->selector ? shader->selector->tokens : NULL); - bool dump_ir = dump_asm && !(sscreen->b.debug_flags & DBG_NO_IR); unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations); - if (dump_ir || dump_asm) + if (r600_can_dump_shader(&sscreen->b, processor)) { fprintf(stderr, "radeonsi: Compiling shader %d\n", count); - if (!si_replace_shader(count, &shader->binary)) { - r = radeon_llvm_compile(mod, &shader->binary, - r600_get_llvm_processor_name(sscreen->b.family), dump_ir, dump_asm, tm); + if (!(sscreen->b.debug_flags & DBG_NO_IR)) + LLVMDumpModule(mod); + } + + if (!si_replace_shader(count, binary)) { + r = radeon_llvm_compile(mod, binary, + r600_get_llvm_processor_name(sscreen->b.family), tm, + debug); if (r) return r; } - r = si_shader_binary_read(sscreen, shader); + si_shader_binary_read_config(binary, conf, 0); - FREE(shader->binary.config); - FREE(shader->binary.rodata); - FREE(shader->binary.global_symbol_offsets); - if (shader->scratch_bytes_per_wave == 0) { - FREE(shader->binary.code); - FREE(shader->binary.relocs); - memset(&shader->binary, 0, - offsetof(struct radeon_shader_binary, disasm_string)); - } + FREE(binary->config); + FREE(binary->global_symbol_offsets); 
+ binary->config = NULL; + binary->global_symbol_offsets = NULL; return r; } /* Generate code for the hardware VS shader stage to go with a geometry shader */ static int si_generate_gs_copy_shader(struct si_screen *sscreen, struct si_shader_context *si_shader_ctx, - struct si_shader *gs, bool dump) + struct si_shader *gs, bool dump, + struct pipe_debug_callback *debug) { struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm; struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base; @@ -3979,8 +3988,15 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, if (dump) fprintf(stderr, "Copy Vertex Shader for Geometry Shader:\n\n"); - r = si_compile_llvm(sscreen, si_shader_ctx->shader, - si_shader_ctx->tm, bld_base->base.gallivm->module); + r = si_compile_llvm(sscreen, &si_shader_ctx->shader->binary, + &si_shader_ctx->shader->config, si_shader_ctx->tm, + bld_base->base.gallivm->module, + debug, TGSI_PROCESSOR_GEOMETRY); + if (!r) { + si_shader_dump(sscreen, si_shader_ctx->shader, debug, + TGSI_PROCESSOR_GEOMETRY); + r = si_shader_binary_upload(sscreen, si_shader_ctx->shader); + } radeon_llvm_dispose(&si_shader_ctx->radeon_bld); @@ -4034,7 +4050,8 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) } int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, - struct si_shader *shader) + struct si_shader *shader, + struct pipe_debug_callback *debug) { struct si_shader_selector *sel = shader->selector; struct tgsi_token *tokens = sel->tokens; @@ -4045,11 +4062,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, int r = 0; bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT && shader->key.ps.poly_stipple; - bool dump = r600_can_dump_shader(&sscreen->b, sel->tokens); + bool dump = r600_can_dump_shader(&sscreen->b, sel->info.processor); if (poly_stipple) { tokens = util_pstipple_create_fragment_shader(tokens, NULL, - SI_POLY_STIPPLE_SAMPLER); + 
SI_POLY_STIPPLE_SAMPLER, + TGSI_FILE_INPUT); tgsi_scan_shader(tokens, &stipple_shader_info); } @@ -4070,9 +4088,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, if (sel->type != PIPE_SHADER_COMPUTE) shader->dx10_clamp_mode = true; - if (sel->info.uses_kill) - shader->db_shader_control |= S_02880C_KILL_ENABLE(1); - shader->uses_instanceid = sel->info.uses_instanceid; bld_base->info = poly_stipple ? &stipple_shader_info : &sel->info; bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; @@ -4147,17 +4162,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, case TGSI_PROCESSOR_FRAGMENT: si_shader_ctx.radeon_bld.load_input = declare_input_fs; bld_base->emit_epilogue = si_llvm_emit_fs_epilogue; - - switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) { - case TGSI_FS_DEPTH_LAYOUT_GREATER: - shader->db_shader_control |= - S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z); - break; - case TGSI_FS_DEPTH_LAYOUT_LESS: - shader->db_shader_control |= - S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z); - break; - } break; default: assert(!"Unsupported shader type"); @@ -4188,12 +4192,21 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld); mod = bld_base->base.gallivm->module; - r = si_compile_llvm(sscreen, shader, tm, mod); + r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm, + mod, debug, si_shader_ctx.type); if (r) { fprintf(stderr, "LLVM failed to compile shader\n"); goto out; } + si_shader_dump(sscreen, shader, debug, si_shader_ctx.type); + + r = si_shader_binary_upload(sscreen, shader); + if (r) { + fprintf(stderr, "LLVM failed to upload shader\n"); + goto out; + } + radeon_llvm_dispose(&si_shader_ctx.radeon_bld); if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) { @@ -4202,7 +4215,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, 
shader->gs_copy_shader->key = shader->key; si_shader_ctx.shader = shader->gs_copy_shader; if ((r = si_generate_gs_copy_shader(sscreen, &si_shader_ctx, - shader, dump))) { + shader, dump, debug))) { free(shader->gs_copy_shader); shader->gs_copy_shader = NULL; goto out; @@ -4217,6 +4230,14 @@ out: return r; } +void si_shader_destroy_binary(struct radeon_shader_binary *binary) +{ + FREE(binary->code); + FREE(binary->rodata); + FREE(binary->relocs); + FREE(binary->disasm_string); +} + void si_shader_destroy(struct si_shader *shader) { if (shader->gs_copy_shader) { @@ -4228,8 +4249,5 @@ void si_shader_destroy(struct si_shader *shader) r600_resource_reference(&shader->scratch_bo, NULL); r600_resource_reference(&shader->bo, NULL); - - FREE(shader->binary.code); - FREE(shader->binary.relocs); - FREE(shader->binary.disasm_string); + si_shader_destroy_binary(&shader->binary); } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index b0c8680ecb3..1635358d505 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -76,10 +76,10 @@ struct radeon_shader_binary; struct radeon_shader_reloc; #define SI_SGPR_RW_BUFFERS 0 /* rings (& stream-out, VS only) */ -#define SI_SGPR_CONST 2 -#define SI_SGPR_SAMPLER 4 -#define SI_SGPR_RESOURCE 6 -#define SI_SGPR_VERTEX_BUFFER 8 /* VS only */ +#define SI_SGPR_CONST_BUFFERS 2 +#define SI_SGPR_SAMPLER_STATES 4 +#define SI_SGPR_SAMPLER_VIEWS 6 +#define SI_SGPR_VERTEX_BUFFERS 8 /* VS only */ #define SI_SGPR_BASE_VERTEX 10 /* VS only */ #define SI_SGPR_START_INSTANCE 11 /* VS only */ #define SI_SGPR_VS_STATE_BITS 12 /* VS(VS) only */ @@ -101,12 +101,12 @@ struct radeon_shader_reloc; /* LLVM function parameter indices */ #define SI_PARAM_RW_BUFFERS 0 -#define SI_PARAM_CONST 1 -#define SI_PARAM_SAMPLER 2 -#define SI_PARAM_RESOURCE 3 +#define SI_PARAM_CONST_BUFFERS 1 +#define SI_PARAM_SAMPLER_STATES 2 +#define SI_PARAM_SAMPLER_VIEWS 3 /* VS only parameters */ 
-#define SI_PARAM_VERTEX_BUFFER 4 +#define SI_PARAM_VERTEX_BUFFERS 4 #define SI_PARAM_BASE_VERTEX 5 #define SI_PARAM_START_INSTANCE 6 /* [0] = clamp vertex color */ @@ -201,6 +201,7 @@ struct si_shader_selector { bool forces_persample_interp_for_persp; bool forces_persample_interp_for_linear; + /* GS parameters. */ unsigned esgs_itemsize; unsigned gs_input_verts_per_prim; unsigned gs_output_prim; @@ -210,6 +211,9 @@ struct si_shader_selector { unsigned gsvs_vertex_size; unsigned max_gsvs_emit_size; + /* PS parameters. */ + unsigned db_shader_control; + /* masks of "get_unique_index" bits */ uint64_t outputs_written; uint32_t patch_outputs_written; @@ -258,6 +262,17 @@ union si_shader_key { } tes; /* tessellation evaluation shader */ }; +struct si_shader_config { + unsigned num_sgprs; + unsigned num_vgprs; + unsigned lds_size; + unsigned spi_ps_input_ena; + unsigned float_mode; + unsigned scratch_bytes_per_wave; + unsigned rsrc1; + unsigned rsrc2; +}; + struct si_shader { struct si_shader_selector *selector; struct si_shader *next_variant; @@ -266,18 +281,9 @@ struct si_shader { struct si_pm4_state *pm4; struct r600_resource *bo; struct r600_resource *scratch_bo; - struct radeon_shader_binary binary; - unsigned num_sgprs; - unsigned num_vgprs; - unsigned lds_size; - unsigned spi_ps_input_ena; - unsigned float_mode; - unsigned scratch_bytes_per_wave; - unsigned spi_shader_col_format; - unsigned spi_shader_z_format; - unsigned db_shader_control; - unsigned cb_shader_mask; union si_shader_key key; + struct radeon_shader_binary binary; + struct si_shader_config config; unsigned nparam; unsigned vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS]; @@ -288,9 +294,6 @@ struct si_shader { unsigned nr_param_exports; bool is_gs_copy_shader; bool dx10_clamp_mode; /* convert NaNs to 0 */ - - unsigned rsrc1; - unsigned rsrc2; }; static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx) @@ -327,19 +330,27 @@ static inline bool si_vs_exports_prim_id(struct 
si_shader *shader) /* radeonsi_shader.c */ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, - struct si_shader *shader); + struct si_shader *shader, + struct pipe_debug_callback *debug); void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f); -int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, - LLVMTargetMachineRef tm, LLVMModuleRef mod); +int si_compile_llvm(struct si_screen *sscreen, + struct radeon_shader_binary *binary, + struct si_shader_config *conf, + LLVMTargetMachineRef tm, + LLVMModuleRef mod, + struct pipe_debug_callback *debug, + unsigned processor); void si_shader_destroy(struct si_shader *shader); +void si_shader_destroy_binary(struct radeon_shader_binary *binary); unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index); int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader); -int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader); +void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, + struct pipe_debug_callback *debug, unsigned processor); void si_shader_apply_scratch_relocs(struct si_context *sctx, struct si_shader *shader, uint64_t scratch_va); -void si_shader_binary_read_config(const struct si_screen *sscreen, - struct si_shader *shader, - unsigned symbol_offset); +void si_shader_binary_read_config(struct radeon_shader_binary *binary, + struct si_shader_config *conf, + unsigned symbol_offset); #endif diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index e5500111f43..91ccd073267 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -163,7 +163,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size; lds_size = output_patch0_offset + output_patch_size * *num_patches; - ls_rsrc2 = 
ls->current->rsrc2; + ls_rsrc2 = ls->current->config.rsrc2; if (sctx->b.chip_class >= CIK) { assert(lds_size <= 65536); @@ -178,7 +178,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII) radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2); radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); - radeon_emit(cs, ls->current->rsrc1); + radeon_emit(cs, ls->current->config.rsrc1); radeon_emit(cs, ls_rsrc2); /* Compute userdata SGPRs. */ @@ -818,7 +818,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) si_get_draw_start_count(sctx, info, &start, &count); start_offset = start * ib.index_size; - u_upload_alloc(sctx->b.uploader, start_offset, count * 2, + u_upload_alloc(sctx->b.uploader, start_offset, count * 2, 256, &out_offset, &out_buffer, &ptr); if (!out_buffer) { pipe_resource_reference(&ib.buffer, NULL); @@ -842,7 +842,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) start_offset = start * ib.index_size; u_upload_data(sctx->b.uploader, start_offset, count * ib.index_size, - (char*)ib.user_buffer + start_offset, + 256, (char*)ib.user_buffer + start_offset, &ib.offset, &ib.buffer); if (!ib.buffer) return; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 8700590435f..64adf699604 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -111,7 +111,7 @@ static void si_shader_ls(struct si_shader *shader) vgpr_comp_cnt = shader->uses_instanceid ? 
3 : 1; num_user_sgprs = SI_LS_NUM_USER_SGPR; - num_sgprs = shader->num_sgprs; + num_sgprs = shader->config.num_sgprs; if (num_user_sgprs > num_sgprs) { /* Last 2 reserved SGPRs are used for VCC */ num_sgprs = num_user_sgprs + 2; @@ -121,12 +121,12 @@ static void si_shader_ls(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40); - shader->rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) | + shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B528_SGPRS((num_sgprs - 1) / 8) | S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B528_DX10_CLAMP(shader->dx10_clamp_mode); - shader->rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) | - S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0); + shader->config.rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) | + S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); } static void si_shader_hs(struct si_shader *shader) @@ -143,7 +143,7 @@ static void si_shader_hs(struct si_shader *shader) si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER); num_user_sgprs = SI_TCS_NUM_USER_SGPR; - num_sgprs = shader->num_sgprs; + num_sgprs = shader->config.num_sgprs; /* One SGPR after user SGPRs is pre-loaded with tessellation factor * buffer offset. 
*/ if ((num_user_sgprs + 1) > num_sgprs) { @@ -155,12 +155,12 @@ static void si_shader_hs(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8); si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40); si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS, - S_00B428_VGPRS((shader->num_vgprs - 1) / 4) | + S_00B428_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B428_SGPRS((num_sgprs - 1) / 8) | S_00B428_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, S_00B42C_USER_SGPR(num_user_sgprs) | - S_00B42C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); } static void si_shader_es(struct si_shader *shader) @@ -187,7 +187,7 @@ static void si_shader_es(struct si_shader *shader) } else unreachable("invalid shader selector type"); - num_sgprs = shader->num_sgprs; + num_sgprs = shader->config.num_sgprs; /* One SGPR after user SGPRs is pre-loaded with es2gs_offset */ if ((num_user_sgprs + 1) > num_sgprs) { /* Last 2 reserved SGPRs are used for VCC */ @@ -200,13 +200,13 @@ static void si_shader_es(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40); si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES, - S_00B328_VGPRS((shader->num_vgprs - 1) / 4) | + S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B328_SGPRS((num_sgprs - 1) / 8) | S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B328_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES, S_00B32C_USER_SGPR(num_user_sgprs) | - S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); if (shader->selector->type == PIPE_SHADER_TESS_EVAL) si_set_tesseval_regs(shader, pm4); @@ -272,7 +272,7 @@ static void si_shader_gs(struct si_shader *shader) si_pm4_set_reg(pm4, 
R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40); num_user_sgprs = SI_GS_NUM_USER_SGPR; - num_sgprs = shader->num_sgprs; + num_sgprs = shader->config.num_sgprs; /* Two SGPRs after user SGPRs are pre-loaded with gs2vs_offset, gs_wave_id */ if ((num_user_sgprs + 2) > num_sgprs) { /* Last 2 reserved SGPRs are used for VCC */ @@ -281,12 +281,12 @@ static void si_shader_gs(struct si_shader *shader) assert(num_sgprs <= 104); si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, - S_00B228_VGPRS((shader->num_vgprs - 1) / 4) | + S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_SGPRS((num_sgprs - 1) / 8) | S_00B228_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, S_00B22C_USER_SGPR(num_user_sgprs) | - S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); } static void si_shader_vs(struct si_shader *shader) @@ -329,7 +329,7 @@ static void si_shader_vs(struct si_shader *shader) } else unreachable("invalid shader selector type"); - num_sgprs = shader->num_sgprs; + num_sgprs = shader->config.num_sgprs; if (num_user_sgprs > num_sgprs) { /* Last 2 reserved SGPRs are used for VCC */ num_sgprs = num_user_sgprs + 2; @@ -356,7 +356,7 @@ static void si_shader_vs(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8); si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, va >> 40); si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, - S_00B128_VGPRS((shader->num_vgprs - 1) / 4) | + S_00B128_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B128_SGPRS((num_sgprs - 1) / 8) | S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B128_DX10_CLAMP(shader->dx10_clamp_mode)); @@ -367,7 +367,7 @@ static void si_shader_vs(struct si_shader *shader) S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) | S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) | S_00B12C_SO_EN(!!shader->selector->so.num_outputs) | - 
S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); if (window_space) si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL, S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1)); @@ -387,6 +387,8 @@ static void si_shader_ps(struct si_shader *shader) struct tgsi_shader_info *info = &shader->selector->info; struct si_pm4_state *pm4; unsigned i, spi_ps_in_control; + unsigned spi_shader_col_format = 0, cb_shader_mask = 0; + unsigned colors_written, export_16bpc; unsigned num_sgprs, num_user_sgprs; unsigned spi_baryc_cntl = 0; uint64_t va; @@ -422,19 +424,43 @@ static void si_shader_ps(struct si_shader *shader) } } - has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->spi_ps_input_ena) || - G_0286CC_LINEAR_CENTROID_ENA(shader->spi_ps_input_ena); + /* Find out what SPI_SHADER_COL_FORMAT and CB_SHADER_MASK should be. */ + colors_written = info->colors_written; + export_16bpc = shader->key.ps.export_16bpc; + + if (info->colors_written == 0x1 && + info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) { + colors_written |= (1 << (shader->key.ps.last_cbuf + 1)) - 1; + } + + while (colors_written) { + i = u_bit_scan(&colors_written); + if (export_16bpc & (1 << i)) + spi_shader_col_format |= V_028714_SPI_SHADER_FP16_ABGR << (4 * i); + else + spi_shader_col_format |= V_028714_SPI_SHADER_32_ABGR << (4 * i); + cb_shader_mask |= 0xf << (4 * i); + } + + /* Set interpolation controls. */ + has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena); spi_ps_in_control = S_0286D8_NUM_INTERP(shader->nparam) | S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid); + /* Set registers. 
*/ si_pm4_set_reg(pm4, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl); si_pm4_set_reg(pm4, R_0286D8_SPI_PS_IN_CONTROL, spi_ps_in_control); - si_pm4_set_reg(pm4, R_028710_SPI_SHADER_Z_FORMAT, shader->spi_shader_z_format); - si_pm4_set_reg(pm4, R_028714_SPI_SHADER_COL_FORMAT, - shader->spi_shader_col_format); - si_pm4_set_reg(pm4, R_02823C_CB_SHADER_MASK, shader->cb_shader_mask); + si_pm4_set_reg(pm4, R_028710_SPI_SHADER_Z_FORMAT, + info->writes_samplemask ? V_028710_SPI_SHADER_32_ABGR : + info->writes_stencil ? V_028710_SPI_SHADER_32_GR : + info->writes_z ? V_028710_SPI_SHADER_32_R : + V_028710_SPI_SHADER_ZERO); + + si_pm4_set_reg(pm4, R_028714_SPI_SHADER_COL_FORMAT, spi_shader_col_format); + si_pm4_set_reg(pm4, R_02823C_CB_SHADER_MASK, cb_shader_mask); va = shader->bo->gpu_address; si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER); @@ -442,7 +468,7 @@ static void si_shader_ps(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, va >> 40); num_user_sgprs = SI_PS_NUM_USER_SGPR; - num_sgprs = shader->num_sgprs; + num_sgprs = shader->config.num_sgprs; /* One SGPR after user SGPRs is pre-loaded with {prim_mask, lds_offset} */ if ((num_user_sgprs + 1) > num_sgprs) { /* Last 2 reserved SGPRs are used for VCC */ @@ -451,13 +477,13 @@ static void si_shader_ps(struct si_shader *shader) assert(num_sgprs <= 104); si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS, - S_00B028_VGPRS((shader->num_vgprs - 1) / 4) | + S_00B028_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B028_SGPRS((num_sgprs - 1) / 8) | S_00B028_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS, - S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) | + S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) | S_00B02C_USER_SGPR(num_user_sgprs) | - S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); } static void si_shader_init_pm4_state(struct si_shader *shader) @@ -496,6 
+522,16 @@ static void si_shader_init_pm4_state(struct si_shader *shader) } } +static unsigned si_get_alpha_test_func(struct si_context *sctx) +{ + /* Alpha-test should be disabled if colorbuffer 0 is integer. */ + if (sctx->queued.named.dsa && + !sctx->framebuffer.cb0_is_integer) + return sctx->queued.named.dsa->alpha_func; + + return PIPE_FUNC_ALWAYS; +} + /* Compute the key for the hw shader variant */ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel, @@ -537,8 +573,10 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, case PIPE_SHADER_FRAGMENT: { struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) + if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && + sel->info.colors_written == 0x1) key->ps.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; + key->ps.export_16bpc = sctx->framebuffer.export_16bpc; if (rs) { @@ -562,11 +600,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, key->ps.clamp_color = rs->clamp_fragment_color; } - key->ps.alpha_func = PIPE_FUNC_ALWAYS; - /* Alpha-test should be disabled if colorbuffer 0 is integer. 
*/ - if (sctx->queued.named.dsa && - !sctx->framebuffer.cb0_is_integer) - key->ps.alpha_func = sctx->queued.named.dsa->alpha_func; + key->ps.alpha_func = si_get_alpha_test_func(sctx); break; } default: @@ -616,7 +650,7 @@ static int si_shader_select(struct pipe_context *ctx, shader->selector = sel; shader->key = key; - r = si_shader_create(sctx->screen, sctx->tm, shader); + r = si_shader_create(sctx->screen, sctx->tm, shader, &sctx->b.debug); if (unlikely(r)) { R600_ERR("Failed to build shader variant (type=%u) %d\n", sel->type, r); @@ -731,6 +765,25 @@ static void *si_create_shader_selector(struct pipe_context *ctx, break; } + /* DB_SHADER_CONTROL */ + sel->db_shader_control = + S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) | + S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) | + S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) | + S_02880C_KILL_ENABLE(sel->info.uses_kill); + + switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) { + case TGSI_FS_DEPTH_LAYOUT_GREATER: + sel->db_shader_control |= + S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z); + break; + case TGSI_FS_DEPTH_LAYOUT_LESS: + sel->db_shader_control |= + S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z); + break; + } + + /* Pre-compilation. 
*/ if (sscreen->b.debug_flags & DBG_PRECOMPILE) { struct si_shader_ctx_state state = {sel}; @@ -987,7 +1040,7 @@ static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom if (!ps) return; - input_ena = ps->spi_ps_input_ena; + input_ena = ps->config.spi_ps_input_ena; /* we need to enable at least one of them, otherwise we hang the GPU */ assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) || @@ -1216,7 +1269,7 @@ static int si_update_scratch_buffer(struct si_context *sctx, return 0; /* This shader doesn't need a scratch buffer */ - if (shader->scratch_bytes_per_wave == 0) + if (shader->config.scratch_bytes_per_wave == 0) return 0; /* This shader is already configured to use the current @@ -1248,7 +1301,7 @@ static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx) static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader) { - return shader ? shader->scratch_bytes_per_wave : 0; + return shader ? shader->config.scratch_bytes_per_wave : 0; } static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx) @@ -1549,6 +1602,10 @@ bool si_update_shaders(struct si_context *sctx) si_update_vgt_shader_config(sctx); if (sctx->ps_shader.cso) { + unsigned db_shader_control = + sctx->ps_shader.cso->db_shader_control | + S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS); + r = si_shader_select(ctx, &sctx->ps_shader); if (r) return false; @@ -1568,8 +1625,8 @@ bool si_update_shaders(struct si_context *sctx) si_mark_atom_dirty(sctx, &sctx->spi_ps_input); } - if (sctx->ps_db_shader_control != sctx->ps_shader.current->db_shader_control) { - sctx->ps_db_shader_control = sctx->ps_shader.current->db_shader_control; + if (sctx->ps_db_shader_control != db_shader_control) { + sctx->ps_db_shader_control = db_shader_control; si_mark_atom_dirty(sctx, &sctx->db_render_state); } diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h index 8e5e24217a5..d5c4aaae638 
100644 --- a/src/gallium/drivers/softpipe/sp_context.h +++ b/src/gallium/drivers/softpipe/sp_context.h @@ -37,6 +37,7 @@ #include "draw/draw_vertex.h" #include "sp_quad_pipe.h" +#include "sp_setup.h" /** Do polygon stipple in the draw module? */ @@ -117,17 +118,17 @@ struct softpipe_context { unsigned const_buffer_size[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS]; /** Vertex format */ + struct sp_setup_info setup_info; struct vertex_info vertex_info; - struct vertex_info vertex_info_vbuf; /** Which vertex shader output slot contains point size */ - int psize_slot; + int8_t psize_slot; /** Which vertex shader output slot contains viewport index */ - int viewport_index_slot; + int8_t viewport_index_slot; /** Which vertex shader output slot contains layer */ - int layer_slot; + int8_t layer_slot; /** The reduced version of the primitive supplied by the state tracker */ unsigned reduced_api_prim; diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c index f8a3eacdb37..95d1ac1514f 100644 --- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c +++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c @@ -161,7 +161,7 @@ sp_vbuf_draw_elements(struct vbuf_render *vbr, const ushort *indices, uint nr) { struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); struct softpipe_context *softpipe = cvbr->softpipe; - const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float); + const unsigned stride = softpipe->vertex_info.size * sizeof(float); const void *vertex_buffer = cvbr->vertex_buffer; struct setup_context *setup = cvbr->setup; const boolean flatshade_first = softpipe->rasterizer->flatshade_first; @@ -358,7 +358,7 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr) struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); struct softpipe_context *softpipe = cvbr->softpipe; struct setup_context *setup = cvbr->setup; - const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float); + const 
unsigned stride = softpipe->vertex_info.size * sizeof(float); const void *vertex_buffer = (void *) get_vert(cvbr->vertex_buffer, start, stride); const boolean flatshade_first = softpipe->rasterizer->flatshade_first; diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c index 76105b4c0ec..c28d28d5f5d 100644 --- a/src/gallium/drivers/softpipe/sp_query.c +++ b/src/gallium/drivers/softpipe/sp_query.c @@ -223,7 +223,7 @@ softpipe_get_query_result(struct pipe_context *pipe, break; case PIPE_QUERY_PIPELINE_STATISTICS: memcpy(vresult, &sq->stats, - sizeof(struct pipe_query_data_pipeline_statistics));; + sizeof(struct pipe_query_data_pipeline_statistics)); break; case PIPE_QUERY_GPU_FINISHED: vresult->b = TRUE; diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 9939720e259..29e392b94e8 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -251,6 +251,13 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c index ac2d97825ce..ffe49260b9a 100644 --- a/src/gallium/drivers/softpipe/sp_setup.c +++ b/src/gallium/drivers/softpipe/sp_setup.c @@ -38,7 +38,6 @@ #include "sp_setup.h" #include "sp_state.h" #include "draw/draw_context.h" -#include "draw/draw_vertex.h" #include "pipe/p_shader_tokens.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -599,10 +598,12 @@ 
setup_tri_coefficients(struct setup_context *setup) { struct softpipe_context *softpipe = setup->softpipe; const struct tgsi_shader_info *fsInfo = &setup->softpipe->fs_variant->info; - const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe); + const struct sp_setup_info *sinfo = &softpipe->setup_info; uint fragSlot; float v[3]; + assert(sinfo->valid); + /* z and w are done by linear interpolation: */ v[0] = setup->vmin[0][2]; @@ -618,15 +619,16 @@ setup_tri_coefficients(struct setup_context *setup) /* setup interpolation for all the remaining attributes: */ for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) { - const uint vertSlot = vinfo->attrib[fragSlot].src_index; + const uint vertSlot = sinfo->attrib[fragSlot].src_index; uint j; - switch (vinfo->attrib[fragSlot].interp_mode) { - case INTERP_CONSTANT: - for (j = 0; j < TGSI_NUM_CHANNELS; j++) + switch (sinfo->attrib[fragSlot].interp) { + case SP_INTERP_CONSTANT: + for (j = 0; j < TGSI_NUM_CHANNELS; j++) { const_coeff(setup, &setup->coef[fragSlot], vertSlot, j); + } break; - case INTERP_LINEAR: + case SP_INTERP_LINEAR: for (j = 0; j < TGSI_NUM_CHANNELS; j++) { tri_apply_cylindrical_wrap(setup->vmin[vertSlot][j], setup->vmid[vertSlot][j], @@ -636,7 +638,7 @@ setup_tri_coefficients(struct setup_context *setup) tri_linear_coeff(setup, &setup->coef[fragSlot], j, v); } break; - case INTERP_PERSPECTIVE: + case SP_INTERP_PERSPECTIVE: for (j = 0; j < TGSI_NUM_CHANNELS; j++) { tri_apply_cylindrical_wrap(setup->vmin[vertSlot][j], setup->vmid[vertSlot][j], @@ -646,7 +648,7 @@ setup_tri_coefficients(struct setup_context *setup) tri_persp_coeff(setup, &setup->coef[fragSlot], j, v); } break; - case INTERP_POS: + case SP_INTERP_POS: setup_fragcoord_coeff(setup, fragSlot); break; default: @@ -966,11 +968,13 @@ setup_line_coefficients(struct setup_context *setup, { struct softpipe_context *softpipe = setup->softpipe; const struct tgsi_shader_info *fsInfo = &setup->softpipe->fs_variant->info; - const struct 
vertex_info *vinfo = softpipe_get_vertex_info(softpipe); + const struct sp_setup_info *sinfo = &softpipe->setup_info; uint fragSlot; float area; float v[2]; + assert(sinfo->valid); + /* use setup->vmin, vmax to point to vertices */ if (softpipe->rasterizer->flatshade_first) setup->vprovoke = v0; @@ -1001,15 +1005,15 @@ setup_line_coefficients(struct setup_context *setup, /* setup interpolation for all the remaining attributes: */ for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) { - const uint vertSlot = vinfo->attrib[fragSlot].src_index; + const uint vertSlot = sinfo->attrib[fragSlot].src_index; uint j; - switch (vinfo->attrib[fragSlot].interp_mode) { - case INTERP_CONSTANT: + switch (sinfo->attrib[fragSlot].interp) { + case SP_INTERP_CONSTANT: for (j = 0; j < TGSI_NUM_CHANNELS; j++) const_coeff(setup, &setup->coef[fragSlot], vertSlot, j); break; - case INTERP_LINEAR: + case SP_INTERP_LINEAR: for (j = 0; j < TGSI_NUM_CHANNELS; j++) { line_apply_cylindrical_wrap(setup->vmin[vertSlot][j], setup->vmax[vertSlot][j], @@ -1018,7 +1022,7 @@ setup_line_coefficients(struct setup_context *setup, line_linear_coeff(setup, &setup->coef[fragSlot], j, v); } break; - case INTERP_PERSPECTIVE: + case SP_INTERP_PERSPECTIVE: for (j = 0; j < TGSI_NUM_CHANNELS; j++) { line_apply_cylindrical_wrap(setup->vmin[vertSlot][j], setup->vmax[vertSlot][j], @@ -1027,7 +1031,7 @@ setup_line_coefficients(struct setup_context *setup, line_persp_coeff(setup, &setup->coef[fragSlot], j, v); } break; - case INTERP_POS: + case SP_INTERP_POS: setup_fragcoord_coeff(setup, fragSlot); break; default: @@ -1236,7 +1240,7 @@ sp_setup_point(struct setup_context *setup, const boolean round = (boolean) setup->softpipe->rasterizer->point_smooth; const float x = v0[0][0]; /* Note: data[0] is always position */ const float y = v0[0][1]; - const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe); + const struct sp_setup_info *sinfo = &softpipe->setup_info; uint fragSlot; uint layer = 0; 
unsigned viewport_index = 0; @@ -1245,6 +1249,8 @@ sp_setup_point(struct setup_context *setup, print_vertex(setup, v0); #endif + assert(sinfo->valid); + if (setup->softpipe->no_rast || setup->softpipe->rasterizer->rasterizer_discard) return; @@ -1285,22 +1291,22 @@ sp_setup_point(struct setup_context *setup, const_coeff(setup, &setup->posCoef, 0, 3); for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) { - const uint vertSlot = vinfo->attrib[fragSlot].src_index; + const uint vertSlot = sinfo->attrib[fragSlot].src_index; uint j; - switch (vinfo->attrib[fragSlot].interp_mode) { - case INTERP_CONSTANT: + switch (sinfo->attrib[fragSlot].interp) { + case SP_INTERP_CONSTANT: /* fall-through */ - case INTERP_LINEAR: + case SP_INTERP_LINEAR: for (j = 0; j < TGSI_NUM_CHANNELS; j++) const_coeff(setup, &setup->coef[fragSlot], vertSlot, j); break; - case INTERP_PERSPECTIVE: + case SP_INTERP_PERSPECTIVE: for (j = 0; j < TGSI_NUM_CHANNELS; j++) point_persp_coeff(setup, setup->vprovoke, &setup->coef[fragSlot], vertSlot, j); break; - case INTERP_POS: + case SP_INTERP_POS: setup_fragcoord_coeff(setup, fragSlot); break; default: diff --git a/src/gallium/drivers/softpipe/sp_setup.h b/src/gallium/drivers/softpipe/sp_setup.h index 191494acbb8..a54dc5dad0c 100644 --- a/src/gallium/drivers/softpipe/sp_setup.h +++ b/src/gallium/drivers/softpipe/sp_setup.h @@ -30,11 +30,30 @@ struct setup_context; struct softpipe_context; +/** + * Attribute interpolation mode + */ +enum sp_interp_mode { + SP_INTERP_POS, /**< special case for frag position */ + SP_INTERP_CONSTANT, + SP_INTERP_LINEAR, + SP_INTERP_PERSPECTIVE +}; + + +struct sp_setup_info { + unsigned valid; + struct { + unsigned interp:8; /**< SP_INTERP_X */ + int src_index:8; + } attrib[PIPE_MAX_SHADER_OUTPUTS]; +}; + void -sp_setup_tri( struct setup_context *setup, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4] ); +sp_setup_tri(struct setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + 
const float (*v2)[4]); void sp_setup_line(struct setup_context *setup, diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h index c35534c931d..16a2897f526 100644 --- a/src/gallium/drivers/softpipe/sp_state.h +++ b/src/gallium/drivers/softpipe/sp_state.h @@ -175,9 +175,6 @@ softpipe_unmap_texture_surfaces(struct softpipe_context *sp); struct vertex_info * -softpipe_get_vertex_info(struct softpipe_context *softpipe); - -struct vertex_info * softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe); diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c index 7e998af1325..d4d03f1be50 100644 --- a/src/gallium/drivers/softpipe/sp_state_derived.c +++ b/src/gallium/drivers/softpipe/sp_state_derived.c @@ -48,7 +48,7 @@ static void invalidate_vertex_layout(struct softpipe_context *softpipe) { - softpipe->vertex_info.num_attribs = 0; + softpipe->setup_info.valid = 0; } @@ -57,50 +57,64 @@ invalidate_vertex_layout(struct softpipe_context *softpipe) * (simple float[][4]) used by the 'draw' module into vertices for * rasterization. * - * This function validates the vertex layout and returns a pointer to a - * vertex_info object. + * This function validates the vertex layout. 
*/ -struct vertex_info * -softpipe_get_vertex_info(struct softpipe_context *softpipe) +static void +softpipe_compute_vertex_info(struct softpipe_context *softpipe) { - struct vertex_info *vinfo = &softpipe->vertex_info; - int vs_index; + struct sp_setup_info *sinfo = &softpipe->setup_info; - if (vinfo->num_attribs == 0) { - /* compute vertex layout now */ + if (sinfo->valid == 0) { const struct tgsi_shader_info *fsInfo = &softpipe->fs_variant->info; - struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf; - const uint num = draw_num_shader_outputs(softpipe->draw); + struct vertex_info *vinfo = &softpipe->vertex_info; uint i; - - /* Tell draw_vbuf to simply emit the whole post-xform vertex - * as-is. No longer any need to try and emit draw vertex_header - * info. + int vs_index; + /* + * This doesn't quite work right (wrt face injection, prim id, + * wide points) - hit a couple assertions, misrenderings plus + * memory corruption. Albeit could fix (the former two) by calling + * this "more often" (rasterizer changes etc.). (The latter would + * need to be included in draw_prepare_shader_outputs, but it looks + * like that would potentially allocate quite some unused additional + * vertex outputs.) + * draw_prepare_shader_outputs(softpipe->draw); */ - vinfo_vbuf->num_attribs = 0; - for (i = 0; i < num; i++) { - draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i); - } - draw_compute_vertex_size(vinfo_vbuf); /* - * Loop over fragment shader inputs, searching for the matching output - * from the vertex shader. + * Those can't actually be 0 (because pos is always at 0). + * But use ints anyway to avoid confusion (in vs outputs, they + * can very well be at pos 0). */ + softpipe->viewport_index_slot = -1; + softpipe->layer_slot = -1; + softpipe->psize_slot = -1; + vinfo->num_attribs = 0; + + /* + * Put position always first (setup needs it there). 
+ */ + vs_index = draw_find_shader_output(softpipe->draw, + TGSI_SEMANTIC_POSITION, 0); + + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); + + /* + * Match FS inputs against VS outputs, emitting the necessary + * attributes. + */ for (i = 0; i < fsInfo->num_inputs; i++) { - int src; - enum interp_mode interp = INTERP_LINEAR; + enum sp_interp_mode interp = SP_INTERP_LINEAR; switch (fsInfo->input_interpolate[i]) { case TGSI_INTERPOLATE_CONSTANT: - interp = INTERP_CONSTANT; + interp = SP_INTERP_CONSTANT; break; case TGSI_INTERPOLATE_LINEAR: - interp = INTERP_LINEAR; + interp = SP_INTERP_LINEAR; break; case TGSI_INTERPOLATE_PERSPECTIVE: - interp = INTERP_PERSPECTIVE; + interp = SP_INTERP_PERSPECTIVE; break; case TGSI_INTERPOLATE_COLOR: assert(fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR); @@ -111,88 +125,121 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe) switch (fsInfo->input_semantic_name[i]) { case TGSI_SEMANTIC_POSITION: - interp = INTERP_POS; + interp = SP_INTERP_POS; break; case TGSI_SEMANTIC_COLOR: if (fsInfo->input_interpolate[i] == TGSI_INTERPOLATE_COLOR) { if (softpipe->rasterizer->flatshade) - interp = INTERP_CONSTANT; + interp = SP_INTERP_CONSTANT; else - interp = INTERP_PERSPECTIVE; + interp = SP_INTERP_PERSPECTIVE; } break; } - /* this includes texcoords and varying vars */ - src = draw_find_shader_output(softpipe->draw, - fsInfo->input_semantic_name[i], - fsInfo->input_semantic_index[i]); - if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR && src == -1) - /* try and find a bcolor */ - src = draw_find_shader_output(softpipe->draw, - TGSI_SEMANTIC_BCOLOR, fsInfo->input_semantic_index[i]); + /* + * Search for each input in current vs output: + */ + vs_index = draw_find_shader_output(softpipe->draw, + fsInfo->input_semantic_name[i], + fsInfo->input_semantic_index[i]); + + if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR && + vs_index == -1) { + /* + * try and find a bcolor. 
+ * Note that if there's both front and back color, draw will + * have copied back to front color already. + */ + vs_index = draw_find_shader_output(softpipe->draw, + TGSI_SEMANTIC_BCOLOR, + fsInfo->input_semantic_index[i]); + } - draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src); + sinfo->attrib[i].interp = interp; + /* extremely pointless index map */ + sinfo->attrib[i].src_index = i + 1; + /* + * For vp index and layer, if the fs requires them but the vs doesn't + * provide them, draw (vbuf) will give us the required 0 (slot -1). + * (This means in this case we'll also use those slots in setup, which + * isn't necessary but they'll contain the correct (0) value.) + */ + if (fsInfo->input_semantic_name[i] == + TGSI_SEMANTIC_VIEWPORT_INDEX) { + softpipe->viewport_index_slot = (int)vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); + } else if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_LAYER) { + softpipe->layer_slot = (int)vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); + /* + * Note that we'd actually want to skip position (as we won't use + * the attribute in the fs) but can't. The reason is that we don't + * actually have a input/output map for setup (even though it looks + * like we do...). Could adjust for this though even without a map. + */ + } else { + /* + * Note that we'd actually want to skip position (as we won't use + * the attribute in the fs) but can't. The reason is that we don't + * actually have a input/output map for setup (even though it looks + * like we do...). Could adjust for this though even without a map. + */ + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); + } } - /* Figure out if we need pointsize as well. */ + /* Figure out if we need pointsize as well. 
+ */ vs_index = draw_find_shader_output(softpipe->draw, TGSI_SEMANTIC_PSIZE, 0); if (vs_index >= 0) { - softpipe->psize_slot = vinfo->num_attribs; - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + softpipe->psize_slot = (int)vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); } - /* Figure out if we need viewport index */ - vs_index = draw_find_shader_output(softpipe->draw, - TGSI_SEMANTIC_VIEWPORT_INDEX, - 0); - if (vs_index >= 0) { - softpipe->viewport_index_slot = vinfo->num_attribs; - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); - } else { - softpipe->viewport_index_slot = 0; + /* Figure out if we need viewport index (if it wasn't already in fs input) */ + if (softpipe->viewport_index_slot < 0) { + vs_index = draw_find_shader_output(softpipe->draw, + TGSI_SEMANTIC_VIEWPORT_INDEX, + 0); + if (vs_index >= 0) { + softpipe->viewport_index_slot =(int)vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); + } } - /* Figure out if we need layer */ - vs_index = draw_find_shader_output(softpipe->draw, - TGSI_SEMANTIC_LAYER, - 0); - if (vs_index >= 0) { - softpipe->layer_slot = vinfo->num_attribs; - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); - } else { - softpipe->layer_slot = 0; + /* Figure out if we need layer (if it wasn't already in fs input) */ + if (softpipe->layer_slot < 0) { + vs_index = draw_find_shader_output(softpipe->draw, + TGSI_SEMANTIC_LAYER, + 0); + if (vs_index >= 0) { + softpipe->layer_slot = (int)vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index); + } } draw_compute_vertex_size(vinfo); + softpipe->setup_info.valid = 1; } - - return vinfo; + return; } /** * Called from vbuf module. * - * Note that there's actually two different vertex layouts in softpipe. - * - * The normal one is computed in softpipe_get_vertex_info() above and is - * used by the point/line/tri "setup" code. 
- * - * The other one (this one) is only used by the vbuf module (which is - * not normally used by default but used in testing). For the vbuf module, - * we basically want to pass-through the draw module's vertex layout as-is. - * When the softpipe vbuf code begins drawing, the normal vertex layout - * will come into play again. + * This will trigger validation of the vertex layout (and also compute + * the required information for setup). */ struct vertex_info * softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe) { - (void) softpipe_get_vertex_info(softpipe); - return &softpipe->vertex_info_vbuf; + softpipe_compute_vertex_info(softpipe); + return &softpipe->vertex_info; } diff --git a/src/gallium/drivers/softpipe/sp_state_shader.c b/src/gallium/drivers/softpipe/sp_state_shader.c index dce0404de5b..f0d66a53ec6 100644 --- a/src/gallium/drivers/softpipe/sp_state_shader.c +++ b/src/gallium/drivers/softpipe/sp_state_shader.c @@ -64,7 +64,8 @@ create_fs_variant(struct softpipe_context *softpipe, /* get new shader that implements polygon stippling */ var->tokens = util_pstipple_create_fragment_shader(curfs->tokens, - &var->stipple_sampler_unit, 0); + &var->stipple_sampler_unit, 0, + TGSI_FILE_INPUT); } else #endif diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c index 10442cb46e7..e45b3e72aeb 100644 --- a/src/gallium/drivers/svga/svga_cmd.c +++ b/src/gallium/drivers/svga/svga_cmd.c @@ -337,7 +337,7 @@ SVGA3D_DefineSurface2D(struct svga_winsys_context *swc, // IN mipSizes[0].height = height; mipSizes[0].depth = 1; - swc->commit(swc);; + swc->commit(swc); return PIPE_OK; } @@ -372,7 +372,7 @@ SVGA3D_DestroySurface(struct svga_winsys_context *swc, swc->surface_relocation(swc, &cmd->sid, NULL, sid, SVGA_RELOC_WRITE | SVGA_RELOC_INTERNAL); - swc->commit(swc);; + swc->commit(swc); return PIPE_OK; } @@ -473,6 +473,7 @@ SVGA3D_SurfaceDMA(struct svga_winsys_context *swc, pSuffix->flags = flags; swc->commit(swc); + swc->hints |= 
SVGA_HINT_FLAG_CAN_PRE_FLUSH; return PIPE_OK; } @@ -543,6 +544,7 @@ SVGA3D_BufferDMA(struct svga_winsys_context *swc, pSuffix->flags = flags; swc->commit(swc); + swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; return PIPE_OK; } @@ -1016,7 +1018,7 @@ SVGA3D_BeginDrawPrimitives(struct svga_winsys_context *swc, *decls = declArray; *ranges = rangeArray; - swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED; + swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; return PIPE_OK; } @@ -1720,6 +1722,7 @@ SVGA3D_UpdateGBImage(struct svga_winsys_context *swc, cmd->box = *box; swc->commit(swc); + swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; return PIPE_OK; } @@ -1746,6 +1749,7 @@ SVGA3D_UpdateGBSurface(struct svga_winsys_context *swc, SVGA_RELOC_WRITE | SVGA_RELOC_INTERNAL); swc->commit(swc); + swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; return PIPE_OK; } @@ -1775,6 +1779,7 @@ SVGA3D_ReadbackGBImage(struct svga_winsys_context *swc, cmd->image.mipmap = mipLevel; swc->commit(swc); + swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; return PIPE_OK; } @@ -1801,6 +1806,7 @@ SVGA3D_ReadbackGBSurface(struct svga_winsys_context *swc, SVGA_RELOC_READ | SVGA_RELOC_INTERNAL); swc->commit(swc); + swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; return PIPE_OK; } @@ -1829,6 +1835,7 @@ SVGA3D_ReadbackGBImagePartial(struct svga_winsys_context *swc, cmd->invertBox = invertBox; swc->commit(swc); + swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; return PIPE_OK; } diff --git a/src/gallium/drivers/svga/svga_cmd_vgpu10.c b/src/gallium/drivers/svga/svga_cmd_vgpu10.c index 5c121089f91..4cd9d5b9d1e 100644 --- a/src/gallium/drivers/svga/svga_cmd_vgpu10.c +++ b/src/gallium/drivers/svga/svga_cmd_vgpu10.c @@ -535,7 +535,7 @@ SVGA3D_vgpu10_Draw(struct svga_winsys_context *swc, SVGA3D_COPY_BASIC_2(vertexCount, startVertexLocation); - swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED; + swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; swc->commit(swc); return PIPE_OK; } @@ -551,7 +551,7 @@ SVGA3D_vgpu10_DrawIndexed(struct svga_winsys_context *swc, 
SVGA3D_COPY_BASIC_3(indexCount, startIndexLocation, baseVertexLocation); - swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED; + swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; swc->commit(swc); return PIPE_OK; } @@ -568,7 +568,7 @@ SVGA3D_vgpu10_DrawInstanced(struct svga_winsys_context *swc, SVGA3D_COPY_BASIC_4(vertexCountPerInstance, instanceCount, startVertexLocation, startInstanceLocation); - swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED; + swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; swc->commit(swc); return PIPE_OK; } @@ -588,7 +588,7 @@ SVGA3D_vgpu10_DrawIndexedInstanced(struct svga_winsys_context *swc, startInstanceLocation); - swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED; + swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; swc->commit(swc); return PIPE_OK; } @@ -598,7 +598,7 @@ SVGA3D_vgpu10_DrawAuto(struct svga_winsys_context *swc) { SVGA3D_CREATE_COMMAND(DrawAuto, DRAW_AUTO); - swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED; + swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; swc->commit(swc); return PIPE_OK; } diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c index d407785ddd9..b10eb45e548 100644 --- a/src/gallium/drivers/svga/svga_context.c +++ b/src/gallium/drivers/svga/svga_context.c @@ -46,7 +46,6 @@ #include "svga_winsys.h" #define CONST0_UPLOAD_DEFAULT_SIZE 65536 -#define CONST0_UPLOAD_ALIGNMENT 256 DEBUG_GET_ONCE_BOOL_OPTION(no_swtnl, "SVGA_NO_SWTNL", FALSE) DEBUG_GET_ONCE_BOOL_OPTION(force_swtnl, "SVGA_FORCE_SWTNL", FALSE); @@ -220,8 +219,8 @@ struct pipe_context *svga_context_create(struct pipe_screen *screen, svga->const0_upload = u_upload_create(&svga->pipe, CONST0_UPLOAD_DEFAULT_SIZE, - CONST0_UPLOAD_ALIGNMENT, - PIPE_BIND_CONSTANT_BUFFER); + PIPE_BIND_CONSTANT_BUFFER, + PIPE_USAGE_STREAM); if (!svga->const0_upload) goto cleanup; diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h index 78e346a92b9..e4f29b8497e 100644 --- a/src/gallium/drivers/svga/svga_context.h +++ 
b/src/gallium/drivers/svga/svga_context.h @@ -74,6 +74,8 @@ */ #define SVGA_MAX_CONST_BUF_SIZE (4096 * 4 * sizeof(int)) +#define CONST0_UPLOAD_ALIGNMENT 256 + struct draw_vertex_shader; struct draw_fragment_shader; struct svga_shader_variant; @@ -312,7 +314,7 @@ struct svga_hw_view_state struct svga_sampler_view *v; unsigned min_lod; unsigned max_lod; - int dirty; + boolean dirty; }; /* Updated by calling svga_update_state( SVGA_STATE_HW_DRAW ) @@ -343,6 +345,11 @@ struct svga_hw_draw_state SVGA3dElementLayoutId layout_id; SVGA3dPrimitiveType topology; + /** Vertex buffer state */ + SVGA3dVertexBuffer vbuffers[PIPE_MAX_ATTRIBS]; + struct svga_winsys_surface *vbuffer_handles[PIPE_MAX_ATTRIBS]; + unsigned num_vbuffers; + struct svga_winsys_surface *ib; /**< index buffer for drawing */ SVGA3dSurfaceFormat ib_format; unsigned ib_offset; diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c index 2d3631d6f9c..80526ed4d15 100644 --- a/src/gallium/drivers/svga/svga_draw.c +++ b/src/gallium/drivers/svga/svga_draw.c @@ -517,11 +517,27 @@ draw_vgpu10(struct svga_hwtnl *hwtnl, buffers[i].offset = hwtnl->cmd.vbufs[i].buffer_offset; } if (vbuf_count > 0) { - ret = SVGA3D_vgpu10_SetVertexBuffers(svga->swc, vbuf_count, - 0, /* startBuffer */ - buffers, vb_handle); - if (ret != PIPE_OK) - return ret; + /* If we haven't yet emitted a drawing command or if any + * vertex buffer state is changing, issue that state now. 
+ */ + if (((hwtnl->cmd.swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) == 0) || + vbuf_count != svga->state.hw_draw.num_vbuffers || + memcmp(buffers, svga->state.hw_draw.vbuffers, + vbuf_count * sizeof(buffers[0])) || + memcmp(vb_handle, svga->state.hw_draw.vbuffer_handles, + vbuf_count * sizeof(vb_handle[0]))) { + ret = SVGA3D_vgpu10_SetVertexBuffers(svga->swc, vbuf_count, + 0, /* startBuffer */ + buffers, vb_handle); + if (ret != PIPE_OK) + return ret; + + svga->state.hw_draw.num_vbuffers = vbuf_count; + memcpy(svga->state.hw_draw.vbuffers, buffers, + vbuf_count * sizeof(buffers[0])); + memcpy(svga->state.hw_draw.vbuffer_handles, vb_handle, + vbuf_count * sizeof(vb_handle[0])); + } } } diff --git a/src/gallium/drivers/svga/svga_pipe_rasterizer.c b/src/gallium/drivers/svga/svga_pipe_rasterizer.c index fa1744fc33e..8e0db539574 100644 --- a/src/gallium/drivers/svga/svga_pipe_rasterizer.c +++ b/src/gallium/drivers/svga/svga_pipe_rasterizer.c @@ -368,13 +368,16 @@ static void svga_bind_rasterizer_state( struct pipe_context *pipe, struct svga_context *svga = svga_context(pipe); struct svga_rasterizer_state *raster = (struct svga_rasterizer_state *)state; + if (!raster || + !svga->curr.rast || + raster->templ.poly_stipple_enable != + svga->curr.rast->templ.poly_stipple_enable) { + svga->dirty |= SVGA_NEW_STIPPLE; + } + svga->curr.rast = raster; svga->dirty |= SVGA_NEW_RAST; - - if (raster && raster->templ.poly_stipple_enable) { - svga->dirty |= SVGA_NEW_STIPPLE; - } } static void diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c index 95241176510..3e778f0a087 100644 --- a/src/gallium/drivers/svga/svga_pipe_sampler.c +++ b/src/gallium/drivers/svga/svga_pipe_sampler.c @@ -287,6 +287,7 @@ svga_bind_sampler_states(struct pipe_context *pipe, { struct svga_context *svga = svga_context(pipe); unsigned i; + boolean any_change = FALSE; assert(shader < PIPE_SHADER_TYPES); assert(start + num <= PIPE_MAX_SAMPLERS); @@ -295,8 
+296,15 @@ svga_bind_sampler_states(struct pipe_context *pipe, if (!svga_have_vgpu10(svga) && shader != PIPE_SHADER_FRAGMENT) return; - for (i = 0; i < num; i++) + for (i = 0; i < num; i++) { + if (svga->curr.sampler[shader][start + i] != samplers[i]) + any_change = TRUE; svga->curr.sampler[shader][start + i] = samplers[i]; + } + + if (!any_change) { + return; + } /* find highest non-null sampler[] entry */ { @@ -405,6 +413,7 @@ svga_set_sampler_views(struct pipe_context *pipe, unsigned flag_1d = 0; unsigned flag_srgb = 0; uint i; + boolean any_change = FALSE; assert(shader < PIPE_SHADER_TYPES); assert(start + num <= Elements(svga->curr.sampler_views[shader])); @@ -422,6 +431,7 @@ svga_set_sampler_views(struct pipe_context *pipe, pipe_sampler_view_release(pipe, &svga->curr.sampler_views[shader][start + i]); pipe_sampler_view_reference(&svga->curr.sampler_views[shader][start + i], views[i]); + any_change = TRUE; } if (!views[i]) @@ -434,6 +444,10 @@ svga_set_sampler_views(struct pipe_context *pipe, flag_1d |= 1 << (start + i); } + if (!any_change) { + return; + } + /* find highest non-null sampler_views[] entry */ { unsigned j = MAX2(svga->curr.num_sampler_views[shader], start + num); diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c index 8c5cff5abc1..7f7ceab0aa5 100644 --- a/src/gallium/drivers/svga/svga_resource_buffer_upload.c +++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c @@ -221,7 +221,7 @@ svga_buffer_upload_gb_command(struct svga_context *svga, struct svga_3d_update_gb_image *whole_update_cmd = NULL; uint32 numBoxes = sbuf->map.num_ranges; struct pipe_resource *dummy; - unsigned int i; + unsigned i; assert(numBoxes); assert(sbuf->dma.updates == NULL); @@ -308,6 +308,7 @@ svga_buffer_upload_gb_command(struct svga_context *svga, pipe_resource_reference(&dummy, &sbuf->b.b); SVGA_FIFOCommitAll(swc); + swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; sbuf->dma.flags.discard = 
FALSE; return PIPE_OK; @@ -381,6 +382,7 @@ svga_buffer_upload_command(struct svga_context *svga, SVGA_FIFOCommitAll(swc); + swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; sbuf->dma.flags.discard = FALSE; return PIPE_OK; diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index fca501bc47d..0f41e4ea254 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -342,6 +342,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TEXTURE_GATHER_OFFSETS: case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: case PIPE_CAP_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: case PIPE_CAP_SAMPLER_VIEW_TARGET: @@ -349,6 +351,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_VERTEXID_NOBASE: case PIPE_CAP_POLYGON_OFFSET_CLAMP: case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 0; case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: return 64; @@ -384,6 +388,9 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: return 0; } @@ -457,6 +464,7 @@ vgpu9_get_shader_param(struct pipe_screen *screen, unsigned shader, case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return 0; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; @@ -515,6 +523,7 @@ vgpu9_get_shader_param(struct pipe_screen *screen, unsigned shader, case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case 
PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return 0; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; @@ -606,6 +615,7 @@ vgpu10_get_shader_param(struct pipe_screen *screen, unsigned shader, case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return 0; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c index 2cf41134bd6..8ab1693088a 100644 --- a/src/gallium/drivers/svga/svga_state_constants.c +++ b/src/gallium/drivers/svga/svga_state_constants.c @@ -613,7 +613,8 @@ emit_constbuf_vgpu10(struct svga_context *svga, unsigned shader) */ new_buf_size = align(new_buf_size, 16); - u_upload_alloc(svga->const0_upload, 0, new_buf_size, &offset, + u_upload_alloc(svga->const0_upload, 0, new_buf_size, + CONST0_UPLOAD_ALIGNMENT, &offset, &dst_buffer, &dst_map); if (!dst_map) { if (src_map) diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c index e392778c2fb..bac91669be1 100644 --- a/src/gallium/drivers/svga/svga_state_fs.c +++ b/src/gallium/drivers/svga/svga_state_fs.c @@ -452,6 +452,7 @@ struct svga_tracked_state svga_hw_fs = SVGA_NEW_TEXTURE_BINDING | SVGA_NEW_NEED_SWTNL | SVGA_NEW_RAST | + SVGA_NEW_STIPPLE | SVGA_NEW_REDUCED_PRIMITIVE | SVGA_NEW_SAMPLER | SVGA_NEW_FRAME_BUFFER | diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c index 24574c1bf85..a103dab25fe 100644 --- a/src/gallium/drivers/svga/svga_state_vs.c +++ b/src/gallium/drivers/svga/svga_state_vs.c @@ -173,8 +173,11 @@ make_vs_key(struct svga_context *svga, struct svga_compile_key *key) return; } + /* SVGA_NEW_PRESCALE */ key->vs.need_prescale = svga->state.hw_clear.prescale.enabled && 
(svga->curr.gs == NULL); + + /* SVGA_NEW_RAST */ key->vs.allow_psiz = svga->curr.rast->templ.point_size_per_vertex; /* SVGA_NEW_FS */ diff --git a/src/gallium/drivers/svga/svga_swtnl_state.c b/src/gallium/drivers/svga/svga_swtnl_state.c index 79dc0bf580c..4d21f4f0e60 100644 --- a/src/gallium/drivers/svga/svga_swtnl_state.c +++ b/src/gallium/drivers/svga/svga_swtnl_state.c @@ -220,8 +220,6 @@ svga_swtnl_update_vdecl( struct svga_context *svga ) struct draw_context *draw = svga->swtnl.draw; struct vertex_info *vinfo = &svga_render->vertex_info; SVGA3dVertexDecl vdecl[PIPE_MAX_ATTRIBS]; - const enum interp_mode colorInterp = - svga->curr.rast->templ.flatshade ? INTERP_CONSTANT : INTERP_LINEAR; struct svga_fragment_shader *fs = svga->curr.fs; int offset = 0; int nr_decls = 0; @@ -236,7 +234,7 @@ svga_swtnl_update_vdecl( struct svga_context *svga ) /* always add position */ src = draw_find_shader_output(draw, TGSI_SEMANTIC_POSITION, 0); - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src); + draw_emit_vertex_attr(vinfo, EMIT_4F, src); vinfo->attrib[0].emit = EMIT_4F; vdecl[0].array.offset = offset; vdecl[0].identity.method = SVGA3D_DECLMETHOD_DEFAULT; @@ -257,14 +255,14 @@ svga_swtnl_update_vdecl( struct svga_context *svga ) switch (sem_name) { case TGSI_SEMANTIC_COLOR: - draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src); + draw_emit_vertex_attr(vinfo, EMIT_4F, src); vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_COLOR; vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT4; offset += 16; nr_decls++; break; case TGSI_SEMANTIC_GENERIC: - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); + draw_emit_vertex_attr(vinfo, EMIT_4F, src); vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_TEXCOORD; vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT4; vdecl[nr_decls].identity.usageIndex = @@ -273,7 +271,7 @@ svga_swtnl_update_vdecl( struct svga_context *svga ) nr_decls++; break; case TGSI_SEMANTIC_FOG: - draw_emit_vertex_attr(vinfo, EMIT_1F, 
INTERP_PERSPECTIVE, src); + draw_emit_vertex_attr(vinfo, EMIT_1F, src); vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_TEXCOORD; vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT1; assert(vdecl[nr_decls].identity.usageIndex == 0); diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c index dbb90f7654e..489e68f88e8 100644 --- a/src/gallium/drivers/svga/svga_tgsi_insn.c +++ b/src/gallium/drivers/svga/svga_tgsi_insn.c @@ -166,7 +166,7 @@ scalar(struct src_register src, unsigned comp) static boolean svga_arl_needs_adjustment( const struct svga_shader_emitter *emit ) { - int i; + unsigned i; for (i = 0; i < emit->num_arl_consts; ++i) { if (emit->arl_consts[i].arl_num == emit->current_arl) @@ -179,7 +179,7 @@ svga_arl_needs_adjustment( const struct svga_shader_emitter *emit ) static int svga_arl_adjustment( const struct svga_shader_emitter *emit ) { - int i; + unsigned i; for (i = 0; i < emit->num_arl_consts; ++i) { if (emit->arl_consts[i].arl_num == emit->current_arl) @@ -1175,7 +1175,7 @@ emit_div(struct svga_shader_emitter *emit, const struct src_register src1 = translate_src_register(emit, &insn->Src[1] ); SVGA3dShaderDestToken temp = get_temp( emit ); - int i; + unsigned i; /* For each enabled element, perform a RCP instruction. 
Note that * RCP is scalar in SVGA3D: @@ -1822,7 +1822,7 @@ emit_tex_swizzle(struct svga_shader_emitter *emit, const unsigned swizzleIn[4] = {swizzle_x, swizzle_y, swizzle_z, swizzle_w}; unsigned srcSwizzle[4]; unsigned srcWritemask = 0x0, zeroWritemask = 0x0, oneWritemask = 0x0; - int i; + unsigned i; /* build writemasks and srcSwizzle terms */ for (i = 0; i < 4; i++) { @@ -3371,7 +3371,7 @@ emit_light_twoside(struct svga_shader_emitter *emit) struct src_register back[2]; SVGA3dShaderDestToken color[2]; int count = emit->internal_color_count; - int i; + unsigned i; SVGA3dShaderInstToken if_token; if (count == 0) @@ -3698,7 +3698,7 @@ static boolean pre_parse_add_indirect( struct svga_shader_emitter *emit, int num, int current_arl) { - int i; + unsigned i; assert(num < 0); for (i = 0; i < emit->num_arl_consts; ++i) { @@ -3844,7 +3844,8 @@ svga_shader_emit_instructions(struct svga_shader_emitter *emit, if (emit->unit == PIPE_SHADER_FRAGMENT && emit->key.fs.pstipple) { unsigned unit; - new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0); + new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0, + TGSI_FILE_INPUT); if (new_tokens) { /* Setup texture state for stipple */ diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c index c979f4a8a56..1223e446055 100644 --- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c +++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c @@ -2298,11 +2298,13 @@ emit_vgpu10_declaration(struct svga_shader_emitter_v10 *emit, emit->num_samplers = MAX2(emit->num_samplers, decl->Range.Last + 1); return TRUE; +#if 0 case TGSI_FILE_RESOURCE: /*opcode0.opcodeType = VGPU10_OPCODE_DCL_RESOURCE;*/ /* XXX more, VGPU10_RETURN_TYPE_FLOAT */ assert(!"TGSI_FILE_RESOURCE not handled yet"); return FALSE; +#endif case TGSI_FILE_ADDRESS: emit->num_address_regs = MAX2(emit->num_address_regs, @@ -6170,6 +6172,11 @@ emit_vertex_attrib_instructions(struct svga_shader_emitter_v10 *emit) while 
(adjust_mask) { unsigned index = u_bit_scan(&adjust_mask); + + /* skip the instruction if this vertex attribute is not being used */ + if (emit->info.input_usage_mask[index] == 0) + continue; + unsigned tmp = emit->vs.adjusted_input[index]; struct tgsi_full_src_register input_src = make_src_reg(TGSI_FILE_INPUT, index); @@ -6604,7 +6611,8 @@ transform_fs_pstipple(struct svga_shader_emitter_v10 *emit, tgsi_dump(tokens,0); } - new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0); + new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0, + TGSI_FILE_INPUT); emit->fs.pstipple_sampler_unit = unit; diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h index 3129e46ed06..562c6690fc1 100644 --- a/src/gallium/drivers/svga/svga_winsys.h +++ b/src/gallium/drivers/svga/svga_winsys.h @@ -85,7 +85,7 @@ struct winsys_handle; #define SVGA_QUERY_FLAG_SET (1 << 0) #define SVGA_QUERY_FLAG_REF (1 << 1) -#define SVGA_HINT_FLAG_DRAW_EMITTED (1 << 0) +#define SVGA_HINT_FLAG_CAN_PRE_FLUSH (1 << 0) /* Can preemptively flush */ /** Opaque surface handle */ struct svga_winsys_surface; diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c index 312b006f96e..a0888f23265 100644 --- a/src/gallium/drivers/vc4/vc4_context.c +++ b/src/gallium/drivers/vc4/vc4_context.c @@ -254,8 +254,9 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) if (!vc4->primconvert) goto fail; - vc4->uploader = u_upload_create(pctx, 16 * 1024, 4, - PIPE_BIND_INDEX_BUFFER); + vc4->uploader = u_upload_create(pctx, 16 * 1024, + PIPE_BIND_INDEX_BUFFER, + PIPE_USAGE_STREAM); vc4_debug |= saved_shaderdb_flag; diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c index c00855698b8..9b0b540d3fc 100644 --- a/src/gallium/drivers/vc4/vc4_draw.c +++ b/src/gallium/drivers/vc4/vc4_draw.c @@ -319,7 +319,7 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info 
*info) if (vc4->indexbuf.user_buffer) { prsc = NULL; u_upload_data(vc4->uploader, 0, - info->count * index_size, + info->count * index_size, 4, vc4->indexbuf.user_buffer, &offset, &prsc); } else { diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c index aea2b9dbe87..b8ce377ff6b 100644 --- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c +++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c @@ -145,43 +145,6 @@ qir_opt_algebraic(struct vc4_compile *c) list_for_each_entry(struct qinst, inst, &c->instructions, link) { switch (inst->op) { - case QOP_SEL_X_Y_ZS: - case QOP_SEL_X_Y_ZC: - case QOP_SEL_X_Y_NS: - case QOP_SEL_X_Y_NC: - case QOP_SEL_X_Y_CS: - case QOP_SEL_X_Y_CC: - if (is_zero(c, inst->src[1])) { - /* Replace references to a 0 uniform value - * with the SEL_X_0 equivalent. - */ - dump_from(c, inst); - inst->op -= (QOP_SEL_X_Y_ZS - QOP_SEL_X_0_ZS); - inst->src[1] = c->undef; - progress = true; - dump_to(c, inst); - break; - } - - if (is_zero(c, inst->src[0])) { - /* Replace references to a 0 uniform value - * with the SEL_X_0 equivalent, flipping the - * condition being evaluated since the operand - * order is flipped. 
- */ - dump_from(c, inst); - inst->op -= QOP_SEL_X_Y_ZS; - inst->op ^= 1; - inst->op += QOP_SEL_X_0_ZS; - inst->src[0] = inst->src[1]; - inst->src[1] = c->undef; - progress = true; - dump_to(c, inst); - break; - } - - break; - case QOP_FMIN: if (is_1f(c, inst->src[1]) && inst->src[0].pack >= QPU_UNPACK_8D_REP && diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index da0d21111a0..3e402d048ba 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -89,7 +89,7 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) range->dst_offset = c->next_ubo_dst_offset; c->next_ubo_dst_offset += range->size; c->num_ubo_ranges++; - }; + } offset -= range->src_offset; @@ -204,27 +204,6 @@ ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr, return r; }; -static struct qreg -get_swizzled_channel(struct vc4_compile *c, - struct qreg *srcs, int swiz) -{ - switch (swiz) { - default: - case UTIL_FORMAT_SWIZZLE_NONE: - fprintf(stderr, "warning: unknown swizzle\n"); - /* FALLTHROUGH */ - case UTIL_FORMAT_SWIZZLE_0: - return qir_uniform_f(c, 0.0); - case UTIL_FORMAT_SWIZZLE_1: - return qir_uniform_f(c, 1.0); - case UTIL_FORMAT_SWIZZLE_X: - case UTIL_FORMAT_SWIZZLE_Y: - case UTIL_FORMAT_SWIZZLE_Z: - case UTIL_FORMAT_SWIZZLE_W: - return srcs[swiz]; - } -} - static inline struct qreg qir_SAT(struct vc4_compile *c, struct qreg val) { @@ -275,7 +254,7 @@ qir_srgb_decode(struct vc4_compile *c, struct qreg srgb) qir_uniform_f(c, 2.4)); qir_SF(c, qir_FSUB(c, srgb, qir_uniform_f(c, 0.04045))); - return qir_SEL_X_Y_NS(c, low, high); + return qir_SEL(c, QPU_COND_NS, low, high); } static struct qreg @@ -338,30 +317,20 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr) struct qreg tex = qir_TEX_RESULT(c); c->num_texture_samples++; - struct qreg texture_output[4]; + struct qreg *dest = ntq_get_dest(c, &instr->dest); enum pipe_format format = c->key->tex[unit].format; if 
(util_format_is_depth_or_stencil(format)) { struct qreg scaled = ntq_scale_depth_texture(c, tex); for (int i = 0; i < 4; i++) - texture_output[i] = scaled; + dest[i] = scaled; } else { - struct qreg tex_result_unpacked[4]; for (int i = 0; i < 4; i++) - tex_result_unpacked[i] = qir_UNPACK_8_F(c, tex, i); - - const uint8_t *format_swiz = - vc4_get_format_swizzle(c->key->tex[unit].format); - for (int i = 0; i < 4; i++) { - texture_output[i] = - get_swizzled_channel(c, tex_result_unpacked, - format_swiz[i]); - } + dest[i] = qir_UNPACK_8_F(c, tex, i); } - struct qreg *dest = ntq_get_dest(c, &instr->dest); for (int i = 0; i < 4; i++) { - dest[i] = get_swizzled_channel(c, texture_output, - c->key->tex[unit].swizzle[i]); + if (c->tex_srgb_decode[unit] & (1 << i)) + dest[i] = qir_srgb_decode(c, dest[i]); } } @@ -470,12 +439,13 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) enum pipe_format format = c->key->tex[unit].format; - struct qreg unpacked[4]; + struct qreg *dest = ntq_get_dest(c, &instr->dest); if (util_format_is_depth_or_stencil(format)) { struct qreg normalized = ntq_scale_depth_texture(c, tex); struct qreg depth_output; - struct qreg one = qir_uniform_f(c, 1.0f); + struct qreg u0 = qir_uniform_f(c, 0.0f); + struct qreg u1 = qir_uniform_f(c, 1.0f); if (c->key->tex[unit].compare_mode) { if (has_proj) compare = qir_FMUL(c, compare, proj); @@ -485,31 +455,31 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) depth_output = qir_uniform_f(c, 0.0f); break; case PIPE_FUNC_ALWAYS: - depth_output = one; + depth_output = u1; break; case PIPE_FUNC_EQUAL: qir_SF(c, qir_FSUB(c, compare, normalized)); - depth_output = qir_SEL_X_0_ZS(c, one); + depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0); break; case PIPE_FUNC_NOTEQUAL: qir_SF(c, qir_FSUB(c, compare, normalized)); - depth_output = qir_SEL_X_0_ZC(c, one); + depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0); break; case PIPE_FUNC_GREATER: qir_SF(c, qir_FSUB(c, compare, normalized)); - depth_output = 
qir_SEL_X_0_NC(c, one); + depth_output = qir_SEL(c, QPU_COND_NC, u1, u0); break; case PIPE_FUNC_GEQUAL: qir_SF(c, qir_FSUB(c, normalized, compare)); - depth_output = qir_SEL_X_0_NS(c, one); + depth_output = qir_SEL(c, QPU_COND_NS, u1, u0); break; case PIPE_FUNC_LESS: qir_SF(c, qir_FSUB(c, compare, normalized)); - depth_output = qir_SEL_X_0_NS(c, one); + depth_output = qir_SEL(c, QPU_COND_NS, u1, u0); break; case PIPE_FUNC_LEQUAL: qir_SF(c, qir_FSUB(c, normalized, compare)); - depth_output = qir_SEL_X_0_NC(c, one); + depth_output = qir_SEL(c, QPU_COND_NC, u1, u0); break; } } else { @@ -517,29 +487,15 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) } for (int i = 0; i < 4; i++) - unpacked[i] = depth_output; + dest[i] = depth_output; } else { for (int i = 0; i < 4; i++) - unpacked[i] = qir_UNPACK_8_F(c, tex, i); - } - - const uint8_t *format_swiz = vc4_get_format_swizzle(format); - struct qreg texture_output[4]; - for (int i = 0; i < 4; i++) { - texture_output[i] = get_swizzled_channel(c, unpacked, - format_swiz[i]); - } - - if (util_format_is_srgb(format)) { - for (int i = 0; i < 3; i++) - texture_output[i] = qir_srgb_decode(c, - texture_output[i]); + dest[i] = qir_UNPACK_8_F(c, tex, i); } - struct qreg *dest = ntq_get_dest(c, &instr->dest); for (int i = 0; i < 4; i++) { - dest[i] = get_swizzled_channel(c, texture_output, - c->key->tex[unit].swizzle[i]); + if (c->tex_srgb_decode[unit] & (1 << i)) + dest[i] = qir_srgb_decode(c, dest[i]); } } @@ -553,9 +509,8 @@ ntq_ffract(struct vc4_compile *c, struct qreg src) struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src)); struct qreg diff = qir_FSUB(c, src, trunc); qir_SF(c, diff); - return qir_SEL_X_Y_NS(c, - qir_FADD(c, diff, qir_uniform_f(c, 1.0)), - diff); + return qir_SEL(c, QPU_COND_NS, + qir_FADD(c, diff, qir_uniform_f(c, 1.0)), diff); } /** @@ -572,9 +527,8 @@ ntq_ffloor(struct vc4_compile *c, struct qreg src) */ qir_SF(c, qir_FSUB(c, src, trunc)); - return qir_SEL_X_Y_NS(c, - qir_FSUB(c, trunc, 
qir_uniform_f(c, 1.0)), - trunc); + return qir_SEL(c, QPU_COND_NS, + qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)), trunc); } /** @@ -591,9 +545,8 @@ ntq_fceil(struct vc4_compile *c, struct qreg src) */ qir_SF(c, qir_FSUB(c, trunc, src)); - return qir_SEL_X_Y_NS(c, - qir_FADD(c, trunc, qir_uniform_f(c, 1.0)), - trunc); + return qir_SEL(c, QPU_COND_NS, + qir_FADD(c, trunc, qir_uniform_f(c, 1.0)), trunc); } static struct qreg @@ -668,10 +621,13 @@ ntq_fcos(struct vc4_compile *c, struct qreg src) static struct qreg ntq_fsign(struct vc4_compile *c, struct qreg src) { + struct qreg t = qir_get_temp(c); + qir_SF(c, src); - return qir_SEL_X_Y_NC(c, - qir_SEL_X_0_ZC(c, qir_uniform_f(c, 1.0)), - qir_uniform_f(c, -1.0)); + qir_MOV_dest(c, t, qir_uniform_f(c, 0.0)); + qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC; + qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS; + return t; } static void @@ -888,6 +844,100 @@ ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset, return qir_UNPACK_8_I(c, base, offset_bit / 8); } +/** + * If compare_instr is a valid comparison instruction, emits the + * compare_instr's comparison and returns the sel_instr's return value based + * on the compare_instr's result. 
+ */ +static bool +ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest, + nir_alu_instr *compare_instr, + nir_alu_instr *sel_instr) +{ + enum qpu_cond cond; + + switch (compare_instr->op) { + case nir_op_feq: + case nir_op_ieq: + case nir_op_seq: + cond = QPU_COND_ZS; + break; + case nir_op_fne: + case nir_op_ine: + case nir_op_sne: + cond = QPU_COND_ZC; + break; + case nir_op_fge: + case nir_op_ige: + case nir_op_uge: + case nir_op_sge: + cond = QPU_COND_NC; + break; + case nir_op_flt: + case nir_op_ilt: + case nir_op_slt: + cond = QPU_COND_NS; + break; + default: + return false; + } + + struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0); + struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1); + + if (nir_op_infos[compare_instr->op].input_types[0] == nir_type_float) + qir_SF(c, qir_FSUB(c, src0, src1)); + else + qir_SF(c, qir_SUB(c, src0, src1)); + + switch (sel_instr->op) { + case nir_op_seq: + case nir_op_sne: + case nir_op_sge: + case nir_op_slt: + *dest = qir_SEL(c, cond, + qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0)); + break; + + case nir_op_bcsel: + *dest = qir_SEL(c, cond, + ntq_get_alu_src(c, sel_instr, 1), + ntq_get_alu_src(c, sel_instr, 2)); + break; + + default: + *dest = qir_SEL(c, cond, + qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0)); + break; + } + + return true; +} + +/** + * Attempts to fold a comparison generating a boolean result into the + * condition code for selecting between two values, instead of comparing the + * boolean result against 0 to generate the condition code. 
+ */ +static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr, + struct qreg *src) +{ + if (!instr->src[0].src.is_ssa) + goto out; + nir_alu_instr *compare = + nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); + if (!compare) + goto out; + + struct qreg dest; + if (ntq_emit_comparison(c, &dest, compare, instr)) + return dest; + +out: + qir_SF(c, src[0]); + return qir_SEL(c, QPU_COND_NS, src[1], src[2]); +} + static void ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) { @@ -974,7 +1024,9 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) case nir_op_i2b: case nir_op_f2b: qir_SF(c, src[0]); - *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0)); + *dest = qir_SEL(c, QPU_COND_ZC, + qir_uniform_ui(c, ~0), + qir_uniform_ui(c, 0)); break; case nir_op_iadd: @@ -1016,65 +1068,29 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) break; case nir_op_seq: - qir_SF(c, qir_FSUB(c, src[0], src[1])); - *dest = qir_SEL_X_0_ZS(c, qir_uniform_f(c, 1.0)); - break; case nir_op_sne: - qir_SF(c, qir_FSUB(c, src[0], src[1])); - *dest = qir_SEL_X_0_ZC(c, qir_uniform_f(c, 1.0)); - break; case nir_op_sge: - qir_SF(c, qir_FSUB(c, src[0], src[1])); - *dest = qir_SEL_X_0_NC(c, qir_uniform_f(c, 1.0)); - break; case nir_op_slt: - qir_SF(c, qir_FSUB(c, src[0], src[1])); - *dest = qir_SEL_X_0_NS(c, qir_uniform_f(c, 1.0)); - break; case nir_op_feq: - qir_SF(c, qir_FSUB(c, src[0], src[1])); - *dest = qir_SEL_X_0_ZS(c, qir_uniform_ui(c, ~0)); - break; case nir_op_fne: - qir_SF(c, qir_FSUB(c, src[0], src[1])); - *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0)); - break; case nir_op_fge: - qir_SF(c, qir_FSUB(c, src[0], src[1])); - *dest = qir_SEL_X_0_NC(c, qir_uniform_ui(c, ~0)); - break; case nir_op_flt: - qir_SF(c, qir_FSUB(c, src[0], src[1])); - *dest = qir_SEL_X_0_NS(c, qir_uniform_ui(c, ~0)); - break; case nir_op_ieq: - qir_SF(c, qir_SUB(c, src[0], src[1])); - *dest = qir_SEL_X_0_ZS(c, qir_uniform_ui(c, ~0)); - break; case nir_op_ine: - 
qir_SF(c, qir_SUB(c, src[0], src[1])); - *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0)); - break; case nir_op_ige: - qir_SF(c, qir_SUB(c, src[0], src[1])); - *dest = qir_SEL_X_0_NC(c, qir_uniform_ui(c, ~0)); - break; case nir_op_uge: - qir_SF(c, qir_SUB(c, src[0], src[1])); - *dest = qir_SEL_X_0_CC(c, qir_uniform_ui(c, ~0)); - break; case nir_op_ilt: - qir_SF(c, qir_SUB(c, src[0], src[1])); - *dest = qir_SEL_X_0_NS(c, qir_uniform_ui(c, ~0)); + if (!ntq_emit_comparison(c, dest, instr, instr)) { + fprintf(stderr, "Bad comparison instruction\n"); + } break; case nir_op_bcsel: - qir_SF(c, src[0]); - *dest = qir_SEL_X_Y_NS(c, src[1], src[2]); + *dest = ntq_emit_bcsel(c, instr, src); break; case nir_op_fcsel: qir_SF(c, src[0]); - *dest = qir_SEL_X_Y_ZC(c, src[1], src[2]); + *dest = qir_SEL(c, QPU_COND_ZC, src[1], src[2]); break; case nir_op_frcp: @@ -1789,6 +1805,56 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, if (stage == QSTAGE_FRAG) vc4_nir_lower_blend(c); + struct nir_lower_tex_options tex_options = { + /* We would need to implement txs, but we don't want the + * int/float conversions + */ + .lower_rect = false, + + /* We want to use this, but we don't want to newton-raphson + * its rcp. + */ + .lower_txp = false, + + /* Apply swizzles to all samplers. */ + .swizzle_result = ~0, + }; + + /* Lower the format swizzle and ARB_texture_swizzle-style swizzle. + * The format swizzling applies before sRGB decode, and + * ARB_texture_swizzle is the last thing before returning the sample. 
+ */ + for (int i = 0; i < ARRAY_SIZE(key->tex); i++) { + enum pipe_format format = c->key->tex[i].format; + + if (!format) + continue; + + const uint8_t *format_swizzle = vc4_get_format_swizzle(format); + + for (int j = 0; j < 4; j++) { + uint8_t arb_swiz = c->key->tex[i].swizzle[j]; + + if (arb_swiz <= 3) { + tex_options.swizzles[i][j] = + format_swizzle[arb_swiz]; + } else { + tex_options.swizzles[i][j] = arb_swiz; + } + + /* If ARB_texture_swizzle is reading from the R, G, or + * B channels of an sRGB texture, then we need to + * apply sRGB decode to this channel at sample time. + */ + if (arb_swiz < 3 && util_format_is_srgb(format)) { + c->tex_srgb_decode[i] |= (1 << j); + } + + } + } + + nir_lower_tex(c->s, &tex_options); + if (c->fs_key && c->fs_key->light_twoside) nir_lower_two_sided_color(c->s); diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index c6916c48e7e..efbb69b71a7 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -65,19 +65,6 @@ static const struct qir_op_info qir_op_info[] = { [QOP_XOR] = { "xor", 1, 2 }, [QOP_NOT] = { "not", 1, 1 }, - [QOP_SEL_X_0_NS] = { "fsel_x_0_ns", 1, 1, false, true }, - [QOP_SEL_X_0_NC] = { "fsel_x_0_nc", 1, 1, false, true }, - [QOP_SEL_X_0_ZS] = { "fsel_x_0_zs", 1, 1, false, true }, - [QOP_SEL_X_0_ZC] = { "fsel_x_0_zc", 1, 1, false, true }, - [QOP_SEL_X_0_CS] = { "fsel_x_0_cs", 1, 1, false, true }, - [QOP_SEL_X_0_CC] = { "fsel_x_0_cc", 1, 1, false, true }, - [QOP_SEL_X_Y_NS] = { "fsel_x_y_ns", 1, 2, false, true }, - [QOP_SEL_X_Y_NC] = { "fsel_x_y_nc", 1, 2, false, true }, - [QOP_SEL_X_Y_ZS] = { "fsel_x_y_zs", 1, 2, false, true }, - [QOP_SEL_X_Y_ZC] = { "fsel_x_y_zc", 1, 2, false, true }, - [QOP_SEL_X_Y_CS] = { "fsel_x_y_cs", 1, 2, false, true }, - [QOP_SEL_X_Y_CC] = { "fsel_x_y_cc", 1, 2, false, true }, - [QOP_RCP] = { "rcp", 1, 1, false, true }, [QOP_RSQ] = { "rsq", 1, 1, false, true }, [QOP_EXP2] = { "exp2", 1, 2, false, true }, @@ -219,23 +206,8 
@@ qir_is_tex(struct qinst *inst) bool qir_depends_on_flags(struct qinst *inst) { - switch (inst->op) { - case QOP_SEL_X_0_NS: - case QOP_SEL_X_0_NC: - case QOP_SEL_X_0_ZS: - case QOP_SEL_X_0_ZC: - case QOP_SEL_X_0_CS: - case QOP_SEL_X_0_CC: - case QOP_SEL_X_Y_NS: - case QOP_SEL_X_Y_NC: - case QOP_SEL_X_Y_ZS: - case QOP_SEL_X_Y_ZC: - case QOP_SEL_X_Y_CS: - case QOP_SEL_X_Y_CC: - return true; - default: - return false; - } + return (inst->cond != QPU_COND_ALWAYS && + inst->cond != QPU_COND_NEVER); } bool @@ -292,8 +264,19 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) void qir_dump_inst(struct vc4_compile *c, struct qinst *inst) { - fprintf(stderr, "%s%s ", + static const char *conditions[] = { + [QPU_COND_ALWAYS] = "", + [QPU_COND_NEVER] = ".never", + [QPU_COND_ZS] = ".zs", + [QPU_COND_ZC] = ".zc", + [QPU_COND_NS] = ".ns", + [QPU_COND_NC] = ".nc", + [QPU_COND_CS] = ".cs", + [QPU_COND_CC] = ".cc", + }; + fprintf(stderr, "%s%s%s ", qir_get_op_name(inst->op), + conditions[inst->cond], inst->sf ? 
".sf" : ""); qir_print_reg(c, inst->dst, true); @@ -352,6 +335,7 @@ qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1) inst->src = calloc(2, sizeof(inst->src[0])); inst->src[0] = src0; inst->src[1] = src1; + inst->cond = QPU_COND_ALWAYS; return inst; } @@ -503,9 +487,9 @@ qir_SF(struct vc4_compile *c, struct qreg src) if (!list_empty(&c->instructions)) last_inst = (struct qinst *)c->instructions.prev; - if (!last_inst || - last_inst->dst.file != src.file || - last_inst->dst.index != src.index || + if (src.file != QFILE_TEMP || + !c->defs[src.index] || + last_inst != c->defs[src.index] || qir_is_multi_instruction(last_inst)) { src = qir_MOV(c, src); last_inst = (struct qinst *)c->instructions.prev; diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index b0fbb4c1db2..4ab4d35d0ca 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -93,23 +93,6 @@ enum qop { QOP_XOR, QOP_NOT, - /* Note: Orderings of these compares must be the same as in - * qpu_defines.h. Selects the src[0] if the ns flag bit is set, - * otherwise 0. */ - QOP_SEL_X_0_ZS, - QOP_SEL_X_0_ZC, - QOP_SEL_X_0_NS, - QOP_SEL_X_0_NC, - QOP_SEL_X_0_CS, - QOP_SEL_X_0_CC, - /* Selects the src[0] if the ns flag bit is set, otherwise src[1]. */ - QOP_SEL_X_Y_ZS, - QOP_SEL_X_Y_ZC, - QOP_SEL_X_Y_NS, - QOP_SEL_X_Y_NC, - QOP_SEL_X_Y_CS, - QOP_SEL_X_Y_CC, - QOP_FTOI, QOP_ITOF, QOP_RCP, @@ -170,6 +153,7 @@ struct qinst { struct qreg dst; struct qreg *src; bool sf; + uint8_t cond; }; enum qstage { @@ -385,6 +369,11 @@ struct vc4_compile { uint8_t vattr_sizes[8]; + /* Bitfield for whether a given channel of a sampler needs sRGB + * decode. + */ + uint8_t tex_srgb_decode[VC4_MAX_TEXTURE_SAMPLERS]; + /** * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads. 
* @@ -463,9 +452,11 @@ void qir_schedule_instructions(struct vc4_compile *c); void qir_reorder_uniforms(struct vc4_compile *c); void qir_emit(struct vc4_compile *c, struct qinst *inst); -static inline void qir_emit_nodef(struct vc4_compile *c, struct qinst *inst) +static inline struct qinst * +qir_emit_nodef(struct vc4_compile *c, struct qinst *inst) { list_addtail(&inst->link, &c->instructions); + return inst; } struct qreg qir_get_temp(struct vc4_compile *c); @@ -536,11 +527,12 @@ qir_##name(struct vc4_compile *c, struct qreg a) \ qir_emit(c, qir_inst(QOP_##name, t, a, c->undef)); \ return t; \ } \ -static inline void \ +static inline struct qinst * \ qir_##name##_dest(struct vc4_compile *c, struct qreg dest, \ struct qreg a) \ { \ - qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, c->undef)); \ + return qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, \ + c->undef)); \ } #define QIR_ALU2(name) \ @@ -592,18 +584,6 @@ QIR_ALU2(V8MAX) QIR_ALU2(V8ADDS) QIR_ALU2(V8SUBS) QIR_ALU2(MUL24) -QIR_ALU1(SEL_X_0_ZS) -QIR_ALU1(SEL_X_0_ZC) -QIR_ALU1(SEL_X_0_NS) -QIR_ALU1(SEL_X_0_NC) -QIR_ALU1(SEL_X_0_CS) -QIR_ALU1(SEL_X_0_CC) -QIR_ALU2(SEL_X_Y_ZS) -QIR_ALU2(SEL_X_Y_ZC) -QIR_ALU2(SEL_X_Y_NS) -QIR_ALU2(SEL_X_Y_NC) -QIR_ALU2(SEL_X_Y_CS) -QIR_ALU2(SEL_X_Y_CC) QIR_ALU2(FMIN) QIR_ALU2(FMAX) QIR_ALU2(FMINABS) @@ -648,6 +628,17 @@ QIR_NODST_1(TLB_STENCIL_SETUP) QIR_NODST_1(MS_MASK) static inline struct qreg +qir_SEL(struct vc4_compile *c, uint8_t cond, struct qreg src0, struct qreg src1) +{ + struct qreg t = qir_get_temp(c); + struct qinst *a = qir_MOV_dest(c, t, src0); + struct qinst *b = qir_MOV_dest(c, t, src1); + a->cond = cond; + b->cond = cond ^ 1; + return t; +} + +static inline struct qreg qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i) { struct qreg t = qir_FMOV(c, src); diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c index d20815f055e..2f280c54523 100644 --- a/src/gallium/drivers/vc4/vc4_qir_schedule.c +++ 
b/src/gallium/drivers/vc4/vc4_qir_schedule.c @@ -250,12 +250,11 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n) else if (inst->dst.file == QFILE_TEMP) add_write_dep(dir, &state->last_temp_write[inst->dst.index], n); + if (qir_depends_on_flags(inst)) + add_dep(dir, state->last_sf, n); + if (inst->sf) add_write_dep(dir, &state->last_sf, n); - - if (qir_depends_on_flags(inst)) { - add_dep(dir, state->last_sf, n); - } } static void diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index cb4e0cfcc3f..b06702afea2 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -64,6 +64,12 @@ set_last_cond_add(struct vc4_compile *c, uint32_t cond) *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond); } +static void +set_last_cond_mul(struct vc4_compile *c, uint32_t cond) +{ + *last_inst(c) = qpu_set_cond_mul(*last_inst(c), cond); +} + /** * Some special registers can be read from either file, which lets us resolve * raddr conflicts without extra MOVs. 
@@ -306,42 +312,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) break; } - switch (qinst->op) { - case QOP_SEL_X_0_ZS: - case QOP_SEL_X_0_ZC: - case QOP_SEL_X_0_NS: - case QOP_SEL_X_0_NC: - case QOP_SEL_X_0_CS: - case QOP_SEL_X_0_CC: - queue(c, qpu_a_MOV(dst, src[0]) | unpack); - set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS + - QPU_COND_ZS); - - queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0())); - set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^ - 1) + QPU_COND_ZS); - break; - - case QOP_SEL_X_Y_ZS: - case QOP_SEL_X_Y_ZC: - case QOP_SEL_X_Y_NS: - case QOP_SEL_X_Y_NC: - case QOP_SEL_X_Y_CS: - case QOP_SEL_X_Y_CC: - queue(c, qpu_a_MOV(dst, src[0])); - if (qinst->src[0].pack) - *(last_inst(c)) |= unpack; - set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS + - QPU_COND_ZS); - - queue(c, qpu_a_MOV(dst, src[1])); - if (qinst->src[1].pack) - *(last_inst(c)) |= unpack; - set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^ - 1) + QPU_COND_ZS); - - break; + bool handled_qinst_cond = true; + switch (qinst->op) { case QOP_RCP: case QOP_RSQ: case QOP_EXP2: @@ -497,16 +470,22 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) queue(c, qpu_m_alu2(translate[qinst->op].op, dst, src[0], src[1]) | unpack); + set_last_cond_mul(c, qinst->cond); } else { queue(c, qpu_a_alu2(translate[qinst->op].op, dst, src[0], src[1]) | unpack); + set_last_cond_add(c, qinst->cond); } + handled_qinst_cond = true; set_last_dst_pack(c, qinst); break; } + assert(qinst->cond == QPU_COND_ALWAYS || + handled_qinst_cond); + if (qinst->sf) { assert(!qir_is_multi_instruction(qinst)); *last_inst(c) |= QPU_SF; diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c index 9e6678a0625..036da329987 100644 --- a/src/gallium/drivers/vc4/vc4_resource.c +++ b/src/gallium/drivers/vc4/vc4_resource.c @@ -921,7 +921,7 @@ vc4_get_shadow_index_buffer(struct pipe_context *pctx, void *data; struct pipe_resource *shadow_rsc = NULL; - 
u_upload_alloc(vc4->uploader, 0, count * 2, + u_upload_alloc(vc4->uploader, 0, count * 2, 4, shadow_offset, &shadow_rsc, &data); uint16_t *dst = data; diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index 8ddf0865d21..0e289432bbe 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -100,6 +100,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TWO_SIDED_STENCIL: case PIPE_CAP_USER_INDEX_BUFFERS: case PIPE_CAP_TEXTURE_MULTISAMPLE: + case PIPE_CAP_TEXTURE_SWIZZLE: return 1; /* lying for GL 2.0 */ @@ -128,7 +129,6 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: case PIPE_CAP_CUBE_MAP_ARRAY: case PIPE_CAP_TEXTURE_MIRROR_CLAMP: - case PIPE_CAP_TEXTURE_SWIZZLE: case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: case PIPE_CAP_SEAMLESS_CUBE_MAP: @@ -171,6 +171,8 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_TEXEL_OFFSET: case PIPE_CAP_MAX_VERTEX_STREAMS: case PIPE_CAP_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: case PIPE_CAP_SAMPLER_VIEW_TARGET: @@ -180,15 +182,20 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: - case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: - case PIPE_CAP_TEXTURE_FLOAT_LINEAR: - case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: - case PIPE_CAP_DEPTH_BOUNDS_TEST: - case PIPE_CAP_TGSI_TXQS: - case PIPE_CAP_FORCE_PERSAMPLE_INTERP: - case PIPE_CAP_SHAREABLE_SHADERS: - case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: - case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: 
+ case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: + case PIPE_CAP_TGSI_TXQS: + case PIPE_CAP_FORCE_PERSAMPLE_INTERP: + case PIPE_CAP_SHAREABLE_SHADERS: + case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 0; /* Stream output. */ @@ -345,6 +352,8 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: + return 0; default: fprintf(stderr, "unknown shader param %d\n", param); return 0; diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c index 527f7637cb6..c322503d816 100644 --- a/src/gallium/drivers/virgl/virgl_context.c +++ b/src/gallium/drivers/virgl/virgl_context.c @@ -605,7 +605,7 @@ static void virgl_draw_vbo(struct pipe_context *ctx, ib.offset = vctx->index_buffer.offset + info.start * ib.index_size; if (ib.user_buffer) { - u_upload_data(vctx->uploader, 0, info.count * ib.index_size, + u_upload_data(vctx->uploader, 0, info.count * ib.index_size, 256, ib.user_buffer, &ib.offset, &ib.buffer); ib.user_buffer = NULL; } @@ -948,8 +948,8 @@ struct pipe_context *virgl_context_create(struct pipe_screen *pscreen, 16, UTIL_SLAB_SINGLETHREADED); vctx->primconvert = util_primconvert_create(&vctx->base, rs->caps.caps.v1.prim_mask); - vctx->uploader = u_upload_create(&vctx->base, 1024 * 1024, 256, - PIPE_BIND_INDEX_BUFFER); + vctx->uploader = u_upload_create(&vctx->base, 1024 * 1024, + PIPE_BIND_INDEX_BUFFER, PIPE_USAGE_STREAM); if (!vctx->uploader) goto fail; diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c index 26a4f7736e3..e8d82b37c0f 100644 --- 
a/src/gallium/drivers/virgl/virgl_screen.c +++ b/src/gallium/drivers/virgl/virgl_screen.c @@ -201,6 +201,8 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: case PIPE_CAP_MAX_VERTEX_STREAMS: case PIPE_CAP_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: @@ -219,6 +221,11 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 0; case PIPE_CAP_VENDOR_ID: return 0x1af4; |