diff options
author | Jason Ekstrand <[email protected]> | 2015-10-19 11:15:32 -0700 |
---|---|---|
committer | Jason Ekstrand <[email protected]> | 2015-10-19 14:14:21 -0700 |
commit | 958fc04dc51a2561c8598f42df59e3d9139e56a7 (patch) | |
tree | b6acf05aa073e97ae8e58647bf05c2c3e816f041 /src | |
parent | 995d9c4ac7fb046e01196cec308ebe10002a28da (diff) | |
parent | de862f03accb12b044ced60cb98f47a055457223 (diff) |
Merge remote-tracking branch 'mesa-public/master' into vulkan
Diffstat (limited to 'src')
318 files changed, 10004 insertions, 6872 deletions
diff --git a/src/Makefile.am b/src/Makefile.am index 13cfaa5b367..da638a811fb 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -66,7 +66,6 @@ AM_CPPFLAGS = \ noinst_LTLIBRARIES = libglsl_util.la libglsl_util_la_SOURCES = \ - glsl/shader_enums.c \ mesa/main/imports.c \ mesa/program/prog_hash_table.c \ mesa/program/symbol_table.c \ diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources index 1fa36416b8e..9df4e265b5b 100644 --- a/src/gallium/auxiliary/Makefile.sources +++ b/src/gallium/auxiliary/Makefile.sources @@ -137,6 +137,8 @@ C_SOURCES := \ tgsi/tgsi_dump.h \ tgsi/tgsi_exec.c \ tgsi/tgsi_exec.h \ + tgsi/tgsi_emulate.c \ + tgsi/tgsi_emulate.h \ tgsi/tgsi_info.c \ tgsi/tgsi_info.h \ tgsi/tgsi_iterate.c \ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c index c4ae30461cb..c88dfbf974a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c @@ -129,7 +129,8 @@ lp_build_emit_llvm_unary( unsigned tgsi_opcode, LLVMValueRef arg0) { - struct lp_build_emit_data emit_data; + struct lp_build_emit_data emit_data = {{0}}; + emit_data.info = tgsi_get_opcode_info(tgsi_opcode); emit_data.arg_count = 1; emit_data.args[0] = arg0; return lp_build_emit_llvm(bld_base, tgsi_opcode, &emit_data); @@ -142,7 +143,8 @@ lp_build_emit_llvm_binary( LLVMValueRef arg0, LLVMValueRef arg1) { - struct lp_build_emit_data emit_data; + struct lp_build_emit_data emit_data = {{0}}; + emit_data.info = tgsi_get_opcode_info(tgsi_opcode); emit_data.arg_count = 2; emit_data.args[0] = arg0; emit_data.args[1] = arg1; @@ -157,7 +159,8 @@ lp_build_emit_llvm_ternary( LLVMValueRef arg1, LLVMValueRef arg2) { - struct lp_build_emit_data emit_data; + struct lp_build_emit_data emit_data = {{0}}; + emit_data.info = tgsi_get_opcode_info(tgsi_opcode); emit_data.arg_count = 3; emit_data.args[0] = arg0; emit_data.args[1] = arg1; diff --git 
a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c index 0ad78b0ace2..3d5e2cb316b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c @@ -538,12 +538,19 @@ lrp_emit( struct lp_build_tgsi_context * bld_base, struct lp_build_emit_data * emit_data) { - LLVMValueRef tmp; - tmp = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_SUB, - emit_data->args[1], - emit_data->args[2]); - emit_data->output[emit_data->chan] = lp_build_emit_llvm_ternary(bld_base, - TGSI_OPCODE_MAD, emit_data->args[0], tmp, emit_data->args[2]); + struct lp_build_context *bld = &bld_base->base; + LLVMValueRef inv, a, b; + + /* This uses the correct version: (1 - t)*a + t*b + * + * An alternative version is "a + t*(b-a)". The problem is this version + * doesn't return "b" for t = 1, because "a + (b-a)" isn't equal to "b" + * because of the floating-point rounding. + */ + inv = lp_build_sub(bld, bld_base->base.one, emit_data->args[0]); + a = lp_build_mul(bld, emit_data->args[1], emit_data->args[0]); + b = lp_build_mul(bld, emit_data->args[2], inv); + emit_data->output[emit_data->chan] = lp_build_add(bld, a, b); } /* TGSI_OPCODE_MAD */ diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c index 95eed2698bc..ffe30b8fa79 100644 --- a/src/gallium/auxiliary/hud/hud_context.c +++ b/src/gallium/auxiliary/hud/hud_context.c @@ -987,6 +987,9 @@ hud_parse_env_var(struct hud_context *hud, const char *env) case ',': env++; + if (!pane) + break; + y += height + hud->font.glyph_height * (pane->num_graphs + 2); height = 100; diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c index cf43ef2506f..0539cfc16a1 100644 --- a/src/gallium/auxiliary/nir/tgsi_to_nir.c +++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c @@ -27,7 +27,7 @@ #include "glsl/nir/nir_control_flow.h" #include "glsl/nir/nir_builder.h" #include "glsl/list.h" 
-#include "glsl/shader_enums.h" +#include "glsl/nir/shader_enums.h" #include "nir/tgsi_to_nir.h" #include "tgsi/tgsi_parse.h" diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c index 8ceb5b47584..5d80cca5b0e 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump.c +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c @@ -648,6 +648,7 @@ tgsi_dump_instruction( ctx.indent = 0; ctx.dump_printf = dump_ctx_printf; ctx.indentation = 0; + ctx.file = NULL; iter_instruction( &ctx.iter, (struct tgsi_full_instruction *)inst ); } diff --git a/src/gallium/auxiliary/tgsi/tgsi_emulate.c b/src/gallium/auxiliary/tgsi/tgsi_emulate.c new file mode 100644 index 00000000000..59d2e4c95b1 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_emulate.c @@ -0,0 +1,169 @@ +/* + * Copyright (C) 2015 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include "tgsi/tgsi_transform.h" +#include "tgsi/tgsi_scan.h" +#include "tgsi/tgsi_dump.h" +#include "util/u_debug.h" + +#include "tgsi_emulate.h" + +struct tgsi_emulation_context { + struct tgsi_transform_context base; + struct tgsi_shader_info info; + unsigned flags; + bool first_instruction_emitted; +}; + +static inline struct tgsi_emulation_context * +tgsi_emulation_context(struct tgsi_transform_context *tctx) +{ + return (struct tgsi_emulation_context *)tctx; +} + +static void +transform_decl(struct tgsi_transform_context *tctx, + struct tgsi_full_declaration *decl) +{ + struct tgsi_emulation_context *ctx = tgsi_emulation_context(tctx); + + if (ctx->flags & TGSI_EMU_FORCE_PERSAMPLE_INTERP && + decl->Declaration.File == TGSI_FILE_INPUT) { + assert(decl->Declaration.Interpolate); + decl->Interp.Location = TGSI_INTERPOLATE_LOC_SAMPLE; + } + + tctx->emit_declaration(tctx, decl); +} + +static void +passthrough_edgeflag(struct tgsi_transform_context *tctx) +{ + struct tgsi_emulation_context *ctx = tgsi_emulation_context(tctx); + struct tgsi_full_declaration decl; + struct tgsi_full_instruction new_inst; + + /* Input */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_INPUT; + decl.Range.First = decl.Range.Last = ctx->info.num_inputs; + tctx->emit_declaration(tctx, &decl); + + /* Output */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_OUTPUT; + decl.Declaration.Semantic = true; + decl.Range.First = decl.Range.Last = ctx->info.num_outputs; + decl.Semantic.Name = TGSI_SEMANTIC_EDGEFLAG; + decl.Semantic.Index = 0; + tctx->emit_declaration(tctx, &decl); + + /* MOV */ + new_inst = tgsi_default_full_instruction(); + new_inst.Instruction.Opcode = TGSI_OPCODE_MOV; + + new_inst.Instruction.NumDstRegs = 1; + new_inst.Dst[0].Register.File = TGSI_FILE_OUTPUT; + new_inst.Dst[0].Register.Index = ctx->info.num_outputs; + new_inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + + 
new_inst.Instruction.NumSrcRegs = 1; + new_inst.Src[0].Register.File = TGSI_FILE_INPUT; + new_inst.Src[0].Register.Index = ctx->info.num_inputs; + new_inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X; + new_inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_X; + new_inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_X; + new_inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_X; + + tctx->emit_instruction(tctx, &new_inst); +} + +static void +transform_instr(struct tgsi_transform_context *tctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_emulation_context *ctx = tgsi_emulation_context(tctx); + + /* Pass through edgeflags. */ + if (!ctx->first_instruction_emitted) { + ctx->first_instruction_emitted = true; + + if (ctx->flags & TGSI_EMU_PASSTHROUGH_EDGEFLAG) + passthrough_edgeflag(tctx); + } + + /* Clamp color outputs. */ + if (ctx->flags & TGSI_EMU_CLAMP_COLOR_OUTPUTS) { + int i; + for (i = 0; i < inst->Instruction.NumDstRegs; i++) { + unsigned semantic; + + if (inst->Dst[i].Register.File != TGSI_FILE_OUTPUT || + inst->Dst[i].Register.Indirect) + continue; + + semantic = + ctx->info.output_semantic_name[inst->Dst[i].Register.Index]; + + if (semantic == TGSI_SEMANTIC_COLOR || + semantic == TGSI_SEMANTIC_BCOLOR) + inst->Instruction.Saturate = true; + } + } + + tctx->emit_instruction(tctx, inst); +} + +const struct tgsi_token * +tgsi_emulate(const struct tgsi_token *tokens, unsigned flags) +{ + struct tgsi_emulation_context ctx; + struct tgsi_token *newtoks; + int newlen; + + if (!(flags & (TGSI_EMU_CLAMP_COLOR_OUTPUTS | + TGSI_EMU_PASSTHROUGH_EDGEFLAG | + TGSI_EMU_FORCE_PERSAMPLE_INTERP))) + return NULL; + + memset(&ctx, 0, sizeof(ctx)); + ctx.flags = flags; + tgsi_scan_shader(tokens, &ctx.info); + + if (flags & TGSI_EMU_FORCE_PERSAMPLE_INTERP) + ctx.base.transform_declaration = transform_decl; + + if (flags & (TGSI_EMU_CLAMP_COLOR_OUTPUTS | + TGSI_EMU_PASSTHROUGH_EDGEFLAG)) + ctx.base.transform_instruction = transform_instr; + + newlen = tgsi_num_tokens(tokens) + 20; + newtoks 
= tgsi_alloc_tokens(newlen); + if (!newtoks) + return NULL; + + tgsi_transform_shader(tokens, newtoks, newlen, &ctx.base); + return newtoks; +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_emulate.h b/src/gallium/auxiliary/tgsi/tgsi_emulate.h new file mode 100644 index 00000000000..425cec72ee1 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_emulate.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2015 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef TGSI_GL_EMULATION_H_ +#define TGSI_GL_EMULATION_H_ + +#include "pipe/p_shader_tokens.h" + +#define TGSI_EMU_CLAMP_COLOR_OUTPUTS (1 << 0) +#define TGSI_EMU_PASSTHROUGH_EDGEFLAG (1 << 1) +#define TGSI_EMU_FORCE_PERSAMPLE_INTERP (1 << 2) + +const struct tgsi_token * +tgsi_emulate(const struct tgsi_token *tokens, unsigned flags); + +#endif /* TGSI_GL_EMULATION_H_ */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c index d76dddbf7d9..b84a1753eeb 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.c +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c @@ -409,6 +409,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens, info->writes_edgeflag = TRUE; } } + } else if (file == TGSI_FILE_SAMPLER) { + info->samplers_declared |= 1 << reg; } } } diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h index 3ceb55717ee..d60ccabda6d 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.h +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h @@ -64,6 +64,7 @@ struct tgsi_shader_info uint file_count[TGSI_FILE_COUNT]; /**< number of declared registers */ int file_max[TGSI_FILE_COUNT]; /**< highest index of declared registers */ int const_file_max[PIPE_MAX_CONSTANT_BUFFERS]; + unsigned samplers_declared; /**< bitmask of declared samplers */ ubyte input_array_first[PIPE_MAX_SHADER_INPUTS]; ubyte input_array_last[PIPE_MAX_SHADER_INPUTS]; diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c index 3d213195090..f2f518130fb 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c +++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c @@ -35,6 +35,7 @@ #include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_sanity.h" #include "util/u_debug.h" +#include "util/u_inlines.h" #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_bitmask.h" @@ -1830,29 +1831,6 @@ void ureg_free_tokens( const struct tgsi_token *tokens ) } -static inline unsigned -pipe_shader_from_tgsi_processor(unsigned 
processor) -{ - switch (processor) { - case TGSI_PROCESSOR_VERTEX: - return PIPE_SHADER_VERTEX; - case TGSI_PROCESSOR_TESS_CTRL: - return PIPE_SHADER_TESS_CTRL; - case TGSI_PROCESSOR_TESS_EVAL: - return PIPE_SHADER_TESS_EVAL; - case TGSI_PROCESSOR_GEOMETRY: - return PIPE_SHADER_GEOMETRY; - case TGSI_PROCESSOR_FRAGMENT: - return PIPE_SHADER_FRAGMENT; - case TGSI_PROCESSOR_COMPUTE: - return PIPE_SHADER_COMPUTE; - default: - assert(0); - return PIPE_SHADER_VERTEX; - } -} - - struct ureg_program * ureg_create(unsigned processor) { @@ -1872,7 +1850,7 @@ ureg_create_with_screen(unsigned processor, struct pipe_screen *screen) ureg->supports_any_inout_decl_range = screen && screen->get_shader_param(screen, - pipe_shader_from_tgsi_processor(processor), + util_pipe_shader_from_tgsi_processor(processor), PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE) != 0; for (i = 0; i < Elements(ureg->properties); i++) diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c index 5fe9e33e208..7388a499c74 100644 --- a/src/gallium/auxiliary/util/u_debug.c +++ b/src/gallium/auxiliary/util/u_debug.c @@ -276,7 +276,7 @@ debug_get_flags_option(const char *name, for (; flags->name; ++flags) namealign = MAX2(namealign, strlen(flags->name)); for (flags = orig; flags->name; ++flags) - _debug_printf("| %*s [0x%0*"PRIu64"]%s%s\n", namealign, flags->name, + _debug_printf("| %*s [0x%0*"PRIx64"]%s%s\n", namealign, flags->name, (int)sizeof(uint64_t)*CHAR_BIT/4, flags->value, flags->desc ? " " : "", flags->desc ? 
flags->desc : ""); } @@ -291,9 +291,9 @@ debug_get_flags_option(const char *name, if (debug_get_option_should_print()) { if (str) { - debug_printf("%s: %s = 0x%"PRIu64" (%s)\n", __FUNCTION__, name, result, str); + debug_printf("%s: %s = 0x%"PRIx64" (%s)\n", __FUNCTION__, name, result, str); } else { - debug_printf("%s: %s = 0x%"PRIu64"\n", __FUNCTION__, name, result); + debug_printf("%s: %s = 0x%"PRIx64"\n", __FUNCTION__, name, result); } } diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h index bb99a02ce49..384e267b593 100644 --- a/src/gallium/auxiliary/util/u_inlines.h +++ b/src/gallium/auxiliary/util/u_inlines.h @@ -651,6 +651,28 @@ util_max_layer(const struct pipe_resource *r, unsigned level) } } +static inline unsigned +util_pipe_shader_from_tgsi_processor(unsigned processor) +{ + switch (processor) { + case TGSI_PROCESSOR_VERTEX: + return PIPE_SHADER_VERTEX; + case TGSI_PROCESSOR_TESS_CTRL: + return PIPE_SHADER_TESS_CTRL; + case TGSI_PROCESSOR_TESS_EVAL: + return PIPE_SHADER_TESS_EVAL; + case TGSI_PROCESSOR_GEOMETRY: + return PIPE_SHADER_GEOMETRY; + case TGSI_PROCESSOR_FRAGMENT: + return PIPE_SHADER_FRAGMENT; + case TGSI_PROCESSOR_COMPUTE: + return PIPE_SHADER_COMPUTE; + default: + assert(0); + return PIPE_SHADER_VERTEX; + } +} + #ifdef __cplusplus } #endif diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c index 3d2193c3bf5..b31ada138b8 100644 --- a/src/gallium/auxiliary/util/u_vbuf.c +++ b/src/gallium/auxiliary/util/u_vbuf.c @@ -544,6 +544,7 @@ u_vbuf_translate_find_free_vb_slots(struct u_vbuf *mgr, index = ffs(unused_vb_mask) - 1; fallback_vbs[type] = index; + unused_vb_mask &= ~(1 << index); /*printf("found slot=%i for type=%i\n", index, type);*/ } } diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am index dff95ba5270..3de8e0fd5ad 100644 --- a/src/gallium/drivers/freedreno/Makefile.am +++ 
b/src/gallium/drivers/freedreno/Makefile.am @@ -19,7 +19,7 @@ libfreedreno_la_SOURCES = \ noinst_PROGRAMS = ir3_compiler -# XXX: Required due to the C++ sources in libnir/libglsl_util +# XXX: Required due to the C++ sources in libnir nodist_EXTRA_ir3_compiler_SOURCES = dummy.cpp ir3_compiler_SOURCES = \ ir3/ir3_cmdline.c @@ -28,7 +28,6 @@ ir3_compiler_LDADD = \ libfreedreno.la \ $(top_builddir)/src/gallium/auxiliary/libgallium.la \ $(top_builddir)/src/glsl/libnir.la \ - $(top_builddir)/src/libglsl_util.la \ $(top_builddir)/src/util/libmesautil.la \ $(GALLIUM_COMMON_LIB_DEPS) \ $(FREEDRENO_LIBS) diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index 6153d92dc21..411f5b76329 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -798,11 +798,7 @@ fd3_emit_restore(struct fd_context *ctx) OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0) | A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0)); - OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2); - OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0)); - OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) | - A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) | - A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE); + fd3_emit_cache_flush(ctx, ring); OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); OUT_RING(ring, 0x00000000); /* GRAS_CL_CLIP_CNTL */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h index 795654706a7..42483f6c39b 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h @@ -90,4 +90,15 @@ void fd3_emit_restore(struct fd_context *ctx); void fd3_emit_init(struct pipe_context *pctx); +static inline void +fd3_emit_cache_flush(struct fd_context *ctx, struct fd_ringbuffer *ring) +{ + fd_wfi(ctx, ring); + OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2); + OUT_RING(ring, 
A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0)); + OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) | + A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) | + A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE); +} + #endif /* FD3_EMIT_H */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c index 9a5b45e2fcb..21fb59e450d 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c @@ -558,6 +558,8 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, fui(x1)); OUT_RING(ring, fui(y1)); + fd3_emit_cache_flush(ctx, ring); + for (i = 0; i < 4; i++) { OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1); OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY) | diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c index 6831a58749c..7bf3343f43a 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/src/gallium/drivers/freedreno/freedreno_draw.c @@ -187,6 +187,9 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) for (i = 0; i < ctx->streamout.num_targets; i++) ctx->streamout.offsets[i] += prims; + if (fd_mesa_debug & FD_DBG_DDRAW) + ctx->dirty = 0xffffffff; + /* if an app (or, well, piglit test) does many thousands of draws * without flush (or anything which implicitly flushes, like * changing render targets), we can exceed the ringbuffer size. 
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 0d0100590d6..b64f78ca32b 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -61,7 +61,7 @@ static const struct debug_named_value debug_options[] = { {"msgs", FD_DBG_MSGS, "Print debug messages"}, {"disasm", FD_DBG_DISASM, "Dump TGSI and adreno shader disassembly"}, {"dclear", FD_DBG_DCLEAR, "Mark all state dirty after clear"}, - {"flush", FD_DBG_FLUSH, "Force flush after every draw"}, + {"ddraw", FD_DBG_DDRAW, "Mark all state dirty after draw"}, {"noscis", FD_DBG_NOSCIS, "Disable scissor optimization"}, {"direct", FD_DBG_DIRECT, "Force inline (SS_DIRECT) state loads"}, {"nobypass", FD_DBG_NOBYPASS, "Disable GMEM bypass"}, @@ -70,6 +70,7 @@ static const struct debug_named_value debug_options[] = { {"optmsgs", FD_DBG_OPTMSGS,"Enable optimizer debug messages"}, {"glsl120", FD_DBG_GLSL120,"Temporary flag to force GLSL 1.20 (rather than 1.30) on a3xx+"}, {"shaderdb", FD_DBG_SHADERDB, "Enable shaderdb output"}, + {"flush", FD_DBG_FLUSH, "Force flush after every draw"}, DEBUG_NAMED_VALUE_END }; diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h index 7129a1bddd1..0d2418e1e00 100644 --- a/src/gallium/drivers/freedreno/freedreno_util.h +++ b/src/gallium/drivers/freedreno/freedreno_util.h @@ -63,7 +63,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op); #define FD_DBG_MSGS 0x0001 #define FD_DBG_DISASM 0x0002 #define FD_DBG_DCLEAR 0x0004 -#define FD_DBG_FLUSH 0x0008 +#define FD_DBG_DDRAW 0x0008 #define FD_DBG_NOSCIS 0x0010 #define FD_DBG_DIRECT 0x0020 #define FD_DBG_NOBYPASS 0x0040 @@ -72,6 +72,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op); #define FD_DBG_OPTMSGS 0x0200 #define FD_DBG_GLSL120 0x0400 #define FD_DBG_SHADERDB 0x0800 +#define FD_DBG_FLUSH 0x1000 extern int fd_mesa_debug; extern bool fd_binning_enabled; diff 
--git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 7eddbdd3825..8c9234b3847 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -36,7 +36,6 @@ #include "tgsi/tgsi_strings.h" #include "nir/tgsi_to_nir.h" -#include "glsl/shader_enums.h" #include "freedreno_util.h" diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.h b/src/gallium/drivers/freedreno/ir3/ir3_nir.h index f3d3075e6a6..9950782dc38 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_nir.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.h @@ -30,6 +30,7 @@ #define IR3_NIR_H_ #include "glsl/nir/nir.h" +#include "glsl/nir/shader_enums.h" bool ir3_nir_lower_if_else(nir_shader *shader); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h index 6dc0ce1133f..7e2c27d9765 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -30,7 +30,7 @@ #define IR3_SHADER_H_ #include "pipe/p_state.h" -#include "glsl/shader_enums.h" +#include "glsl/nir/shader_enums.h" #include "ir3.h" #include "disasm.h" diff --git a/src/gallium/drivers/ilo/core/ilo_builder.c b/src/gallium/drivers/ilo/core/ilo_builder.c index 4e05a3aca1e..9d5195129b7 100644 --- a/src/gallium/drivers/ilo/core/ilo_builder.c +++ b/src/gallium/drivers/ilo/core/ilo_builder.c @@ -25,6 +25,8 @@ * Chia-I Wu <[email protected]> */ +#include "util/u_memory.h" + #include "ilo_builder.h" #include "ilo_builder_render.h" /* for ilo_builder_batch_patch_sba() */ diff --git a/src/gallium/drivers/ilo/core/ilo_core.h b/src/gallium/drivers/ilo/core/ilo_core.h index da7db90a54b..cbc568c4cd0 100644 --- a/src/gallium/drivers/ilo/core/ilo_core.h +++ b/src/gallium/drivers/ilo/core/ilo_core.h @@ -30,8 +30,6 @@ #include "pipe/p_compiler.h" -#include "util/u_debug.h" #include "util/u_math.h" -#include "util/u_memory.h" #endif 
/* ILO_CORE_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_debug.h b/src/gallium/drivers/ilo/core/ilo_debug.h index 9833233d796..532a2aa7ed6 100644 --- a/src/gallium/drivers/ilo/core/ilo_debug.h +++ b/src/gallium/drivers/ilo/core/ilo_debug.h @@ -28,6 +28,8 @@ #ifndef ILO_DEBUG_H #define ILO_DEBUG_H +#include "util/u_debug.h" + #include "ilo_core.h" /* enable debug flags affecting hot pathes only with debug builds */ diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c index fa547ac5c36..6eefc8f46d2 100644 --- a/src/gallium/drivers/ilo/core/ilo_image.c +++ b/src/gallium/drivers/ilo/core/ilo_image.c @@ -286,8 +286,8 @@ image_get_gen6_tiling(const struct ilo_dev *dev, info->bind_surface_dp_typed)) return GEN6_TILING_NONE; - if (estimated_size <= 64 || - estimated_size > info->prefer_linear_threshold) + if (estimated_size <= 64 || (info->prefer_linear_threshold && + estimated_size > info->prefer_linear_threshold)) return GEN6_TILING_NONE; if (estimated_size <= 2048) diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h index 646ed6f5727..546e0ff7739 100644 --- a/src/gallium/drivers/ilo/core/ilo_image.h +++ b/src/gallium/drivers/ilo/core/ilo_image.h @@ -102,7 +102,7 @@ struct ilo_image_info { /* * prefer GEN6_TILING_NONE when the (estimated) image size exceeds the - * threshold + * threshold; ignored when zero */ uint32_t prefer_linear_threshold; diff --git a/src/gallium/drivers/ilo/core/ilo_state_cc.c b/src/gallium/drivers/ilo/core/ilo_state_cc.c index 83ee8de979c..1f2456e19ea 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_cc.c +++ b/src/gallium/drivers/ilo/core/ilo_state_cc.c @@ -694,10 +694,10 @@ cc_set_gen8_3DSTATE_PS_BLEND(struct ilo_state_cc *cc, cc_get_gen6_effective_rt(dev, info, 0, &rt0); /* 0x0 is reserved for blend factors and we have to set them all */ - dw1 |= rt0.a_src << GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR__SHIFT | - rt0.a_dst << 
GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR__SHIFT | - rt0.rgb_src << GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR__SHIFT | - rt0.rgb_dst << GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR__SHIFT; + dw1 |= rt0.a_src << GEN8_PS_BLEND_DW1_RT0_SRC_ALPHA_FACTOR__SHIFT | + rt0.a_dst << GEN8_PS_BLEND_DW1_RT0_DST_ALPHA_FACTOR__SHIFT | + rt0.rgb_src << GEN8_PS_BLEND_DW1_RT0_SRC_COLOR_FACTOR__SHIFT | + rt0.rgb_dst << GEN8_PS_BLEND_DW1_RT0_DST_COLOR_FACTOR__SHIFT; for (i = 0; i < blend->rt_count; i++) { if (blend->rt[i].argb_write_disables != 0xf) { @@ -707,10 +707,10 @@ cc_set_gen8_3DSTATE_PS_BLEND(struct ilo_state_cc *cc, } if (rt0.blend_enable) { - dw1 |= GEN8_PS_BLEND_DW1_BLEND_ENABLE; + dw1 |= GEN8_PS_BLEND_DW1_RT0_BLEND_ENABLE; if (rt0.a_src != rt0.rgb_src || rt0.a_dst != rt0.rgb_dst) - dw1 |= GEN8_PS_BLEND_DW1_INDEPENDENT_ALPHA_ENABLE; + dw1 |= GEN8_PS_BLEND_DW1_RT0_INDEPENDENT_ALPHA_ENABLE; } } diff --git a/src/gallium/drivers/ilo/core/ilo_state_raster.c b/src/gallium/drivers/ilo/core/ilo_state_raster.c index ed64a1f0d3c..a694f71bbbf 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_raster.c +++ b/src/gallium/drivers/ilo/core/ilo_state_raster.c @@ -512,7 +512,7 @@ raster_set_gen8_3DSTATE_RASTER(struct ilo_state_raster *rs, /* where should line_msaa_enable be set? 
*/ if (setup->msaa_enable) - dw1 |= GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE; + dw1 |= GEN8_RASTER_DW1_DX_MULTISAMPLE_ENABLE; if (tri->depth_offset_solid) dw1 |= GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID; @@ -574,10 +574,6 @@ get_gen6_sample_count(const struct ilo_dev *dev, uint8_t sample_count) c = GEN7_NUMSAMPLES_8; min_gen = ILO_GEN(7); break; - case 16: - c = GEN8_NUMSAMPLES_16; - min_gen = ILO_GEN(8); - break; default: assert(!"unexpected sample count"); c = GEN6_NUMSAMPLES_1; @@ -792,17 +788,17 @@ raster_set_gen8_3DSTATE_WM(struct ilo_state_raster *rs, if (ilo_dev_gen(dev) < ILO_GEN(8)) { switch (scan->earlyz_op) { case ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR: - dw1 |= GEN7_WM_DW1_DEPTH_CLEAR; + dw1 |= GEN7_WM_DW1_LEGACY_DEPTH_CLEAR; break; case ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE: - dw1 |= GEN7_WM_DW1_DEPTH_RESOLVE; + dw1 |= GEN7_WM_DW1_LEGACY_DEPTH_RESOLVE; break; case ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE: - dw1 |= GEN7_WM_DW1_HIZ_RESOLVE; + dw1 |= GEN7_WM_DW1_LEGACY_HIZ_RESOLVE; break; default: if (scan->earlyz_stencil_clear) - dw1 |= GEN7_WM_DW1_DEPTH_CLEAR; + dw1 |= GEN7_WM_DW1_LEGACY_DEPTH_CLEAR; break; } } diff --git a/src/gallium/drivers/ilo/core/ilo_state_sbe.c b/src/gallium/drivers/ilo/core/ilo_state_sbe.c index 5d1d400acdd..1b4ca0683c9 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_sbe.c +++ b/src/gallium/drivers/ilo/core/ilo_state_sbe.c @@ -239,8 +239,8 @@ sbe_set_gen8_3DSTATE_SBE(struct ilo_state_sbe *sbe, vue_read_len << GEN7_SBE_DW1_URB_READ_LEN__SHIFT; if (ilo_dev_gen(dev) >= ILO_GEN(8)) { - dw1 |= GEN8_SBE_DW1_USE_URB_READ_LEN | - GEN8_SBE_DW1_USE_URB_READ_OFFSET | + dw1 |= GEN8_SBE_DW1_FORCE_URB_READ_LEN | + GEN8_SBE_DW1_FORCE_URB_READ_OFFSET | vue_read_offset << GEN8_SBE_DW1_URB_READ_OFFSET__SHIFT; } else { dw1 |= vue_read_offset << GEN7_SBE_DW1_URB_READ_OFFSET__SHIFT; @@ -286,10 +286,10 @@ sbe_set_gen8_3DSTATE_SBE_SWIZ(struct ilo_state_sbe *sbe, swizzle->attr << GEN8_SBE_SWIZ_SRC_ATTR__SHIFT; if (swizzle->force_zeros) { - swiz[i] |= 
GEN8_SBE_SWIZ_OVERRIDE_W | - GEN8_SBE_SWIZ_OVERRIDE_Z | - GEN8_SBE_SWIZ_OVERRIDE_Y | - GEN8_SBE_SWIZ_OVERRIDE_X | + swiz[i] |= GEN8_SBE_SWIZ_CONST_OVERRIDE_W | + GEN8_SBE_SWIZ_CONST_OVERRIDE_Z | + GEN8_SBE_SWIZ_CONST_OVERRIDE_Y | + GEN8_SBE_SWIZ_CONST_OVERRIDE_X | GEN8_SBE_SWIZ_CONST_0000; } } diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c index f4d801e9b56..ceeb68a460e 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c +++ b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c @@ -592,7 +592,12 @@ ps_set_gen8_3DSTATE_PS(struct ilo_state_ps *ps, ILO_DEV_ASSERT(dev, 8, 8); - dw3 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT | + /* + * Set VME here for correct computation of LODs and others. Not sure why + * it is needed now. + */ + dw3 = GEN6_THREADDISP_VME | + ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT | ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT; if (false) diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.c b/src/gallium/drivers/ilo/core/ilo_state_surface.c index 40fe15f316f..27c37535fc8 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_surface.c +++ b/src/gallium/drivers/ilo/core/ilo_state_surface.c @@ -814,10 +814,6 @@ surface_get_gen6_image_sample_count(const struct ilo_dev *dev, *sample_count = GEN7_NUMSAMPLES_8; min_gen = ILO_GEN(7); break; - case 16: - *sample_count = GEN8_NUMSAMPLES_16; - min_gen = ILO_GEN(8); - break; default: assert(!"invalid sample count"); *sample_count = GEN6_NUMSAMPLES_1; diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.c b/src/gallium/drivers/ilo/core/ilo_state_vf.c index 9faf835fef2..8f091e21a27 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_vf.c +++ b/src/gallium/drivers/ilo/core/ilo_state_vf.c @@ -369,14 +369,14 @@ vf_params_set_gen8_3DSTATE_VF_SGVS(struct ilo_state_vf *vf, if (params->prepend_instanceid) { dw1 |= GEN8_SGVS_DW1_IID_ENABLE | - 1 << GEN8_SGVS_DW1_IID_VE_COMP__SHIFT 
| - attr << GEN8_SGVS_DW1_IID_VE_INDEX__SHIFT; + 1 << GEN8_SGVS_DW1_IID_COMP__SHIFT | + attr << GEN8_SGVS_DW1_IID_OFFSET__SHIFT; } if (params->prepend_vertexid) { dw1 |= GEN8_SGVS_DW1_VID_ENABLE | - 0 << GEN8_SGVS_DW1_VID_VE_COMP__SHIFT | - attr << GEN8_SGVS_DW1_VID_VE_INDEX__SHIFT; + 0 << GEN8_SGVS_DW1_VID_COMP__SHIFT | + attr << GEN8_SGVS_DW1_VID_OFFSET__SHIFT; } STATIC_ASSERT(ARRAY_SIZE(vf->sgvs) >= 1); diff --git a/src/gallium/drivers/ilo/genhw/gen_eu_message.xml.h b/src/gallium/drivers/ilo/genhw/gen_eu_message.xml.h index fe8b26908c0..96cf543d27e 100644 --- a/src/gallium/drivers/ilo/genhw/gen_eu_message.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_eu_message.xml.h @@ -41,7 +41,9 @@ enum gen_eu_urb_op { GEN7_MSG_URB_READ_OWORD = 0x3, GEN7_MSG_URB_ATOMIC_MOV = 0x4, GEN7_MSG_URB_ATOMIC_INC = 0x5, + GEN75_MSG_URB_ATOMIC_ADD = 0x6, GEN8_MSG_URB_SIMD8_WRITE = 0x7, + GEN8_MSG_URB_SIMD8_READ = 0x8, }; enum gen_eu_pi_simd { @@ -137,6 +139,7 @@ enum gen_eu_dp_op { GEN75_MSG_DP_RC_MEMORY_FENCE = 0x7, GEN75_MSG_DP_RC_MEDIA_BLOCK_WRITE = 0xa, GEN75_MSG_DP_RC_RT_WRITE = 0xc, + GEN8_MSG_DP_RC_RT_READ = 0xd, GEN75_MSG_DP_CC_OWORD_BLOCK_READ = 0x0, GEN75_MSG_DP_CC_UNALIGNED_OWORD_BLOCK_READ = 0x1, GEN75_MSG_DP_CC_OWORD_DUAL_BLOCK_READ = 0x2, diff --git a/src/gallium/drivers/ilo/genhw/gen_mi.xml.h b/src/gallium/drivers/ilo/genhw/gen_mi.xml.h index 5a0bb4f8d77..36f9618eb2d 100644 --- a/src/gallium/drivers/ilo/genhw/gen_mi.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_mi.xml.h @@ -84,6 +84,8 @@ enum gen_mi_alu_operand { #define GEN7_MI_OPCODE_MI_PREDICATE (0xc << 23) #define GEN7_MI_OPCODE_MI_URB_CLEAR (0x19 << 23) #define GEN75_MI_OPCODE_MI_MATH (0x1a << 23) +#define GEN8_MI_OPCODE_MI_SEMAPHORE_SIGNAL (0x1b << 23) +#define GEN8_MI_OPCODE_MI_SEMAPHORE_WAIT (0x1c << 23) #define GEN6_MI_OPCODE_MI_STORE_DATA_IMM (0x20 << 23) #define GEN6_MI_OPCODE_MI_LOAD_REGISTER_IMM (0x22 << 23) #define GEN6_MI_OPCODE_MI_STORE_REGISTER_MEM (0x24 << 23) @@ -91,8 +93,11 @@ enum gen_mi_alu_operand { 
#define GEN6_MI_OPCODE_MI_REPORT_PERF_COUNT (0x28 << 23) #define GEN7_MI_OPCODE_MI_LOAD_REGISTER_MEM (0x29 << 23) #define GEN75_MI_OPCODE_MI_LOAD_REGISTER_REG (0x2a << 23) +#define GEN75_MI_OPCODE_MI_RS_STORE_DATA_IMM (0x2b << 23) #define GEN75_MI_OPCODE_MI_LOAD_URB_MEM (0x2c << 23) #define GEN75_MI_OPCODE_MI_STORE_URB_MEM (0x2d << 23) +#define GEN8_MI_OPCODE_MI_COPY_MEM_MEM (0x2e << 23) +#define GEN8_MI_OPCODE_MI_ATOMIC (0x2f << 23) #define GEN6_MI_OPCODE_MI_BATCH_BUFFER_START (0x31 << 23) #define GEN6_MI_LENGTH__MASK 0x0000003f #define GEN6_MI_LENGTH__SHIFT 0 @@ -155,8 +160,41 @@ enum gen_mi_alu_operand { #define GEN75_MI_MATH_DW_SRC2__MASK 0x000007ff #define GEN75_MI_MATH_DW_SRC2__SHIFT 0 +#define GEN8_MI_SEMAPHORE_SIGNAL__SIZE 2 +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_POST_SYNC_OP (0x1 << 21) +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE__MASK 0x00038000 +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE__SHIFT 15 +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_RCS (0x0 << 15) +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_VCS0 (0x1 << 15) +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_BCS (0x2 << 15) +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_VECS (0x3 << 15) +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_VCS1 (0x4 << 15) + + +#define GEN8_MI_SEMAPHORE_WAIT__SIZE 4 +#define GEN8_MI_SEMAPHORE_WAIT_DW0_USE_GGTT (0x1 << 22) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_WAIT_MODE__MASK 0x00008000 +#define GEN8_MI_SEMAPHORE_WAIT_DW0_WAIT_MODE__SHIFT 15 +#define GEN8_MI_SEMAPHORE_WAIT_DW0_WAIT_MODE_SIGNAL (0x0 << 15) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_WAIT_MODE_POLL (0x1 << 15) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP__MASK 0x00007000 +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP__SHIFT 12 +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_GREATER_THAN_SDD (0x0 << 12) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_GREATER_THAN_OR_EQUAL_SDD (0x1 << 12) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_LESS_THAN_SDD (0x2 << 12) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_LESS_THAN_OR_EQUAL_SDD (0x3 << 12) +#define 
GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_EQUAL_SDD (0x4 << 12) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_NO_EQUAL_SDD (0x5 << 12) + + +#define GEN8_MI_SEMAPHORE_WAIT_DW2_ADDR_ADDR__MASK 0xfffffffc +#define GEN8_MI_SEMAPHORE_WAIT_DW2_ADDR_ADDR__SHIFT 2 +#define GEN8_MI_SEMAPHORE_WAIT_DW2_ADDR_ADDR__SHR 2 + + #define GEN6_MI_STORE_DATA_IMM__SIZE 6 #define GEN6_MI_STORE_DATA_IMM_DW0_USE_GGTT (0x1 << 22) +#define GEN8_MI_STORE_DATA_IMM_DW0_STORE_QWORD (0x1 << 21) #define GEN6_MI_STORE_DATA_IMM_DW2_ADDR__MASK 0xfffffffc @@ -188,7 +226,17 @@ enum gen_mi_alu_operand { #define GEN6_MI_STORE_REGISTER_MEM_DW2_ADDR__SHR 2 -#define GEN6_MI_FLUSH_DW__SIZE 4 +#define GEN6_MI_FLUSH_DW__SIZE 5 +#define GEN6_MI_FLUSH_DW_DW0_WRITE__MASK 0x0000c000 +#define GEN6_MI_FLUSH_DW_DW0_WRITE__SHIFT 14 +#define GEN6_MI_FLUSH_DW_DW0_WRITE_NONE (0x0 << 14) +#define GEN6_MI_FLUSH_DW_DW0_WRITE_IMM (0x1 << 14) +#define GEN6_MI_FLUSH_DW_DW0_WRITE_TIMESTAMP (0x3 << 14) + +#define GEN6_MI_FLUSH_DW_DW1_USE_GGTT (0x1 << 2) +#define GEN6_MI_FLUSH_DW_DW1_ADDR__MASK 0xfffffff8 +#define GEN6_MI_FLUSH_DW_DW1_ADDR__SHIFT 3 +#define GEN6_MI_FLUSH_DW_DW1_ADDR__SHR 3 @@ -225,6 +273,17 @@ enum gen_mi_alu_operand { #define GEN75_MI_LOAD_REGISTER_REG_DW2_DST_REG__SHIFT 2 #define GEN75_MI_LOAD_REGISTER_REG_DW2_DST_REG__SHR 2 +#define GEN75_MI_RS_STORE_DATA_IMM__SIZE 6 +#define GEN75_MI_RS_STORE_DATA_IMM_DW0_USE_GGTT (0x1 << 22) + + +#define GEN75_MI_RS_STORE_DATA_IMM_DW2_ADDR__MASK 0xfffffffc +#define GEN75_MI_RS_STORE_DATA_IMM_DW2_ADDR__SHIFT 2 +#define GEN75_MI_RS_STORE_DATA_IMM_DW2_ADDR__SHR 2 + + + + #define GEN75_MI_LOAD_URB_MEM__SIZE 4 #define GEN75_MI_LOAD_URB_MEM_DW1_ADDR__MASK 0x00007ffc @@ -247,12 +306,47 @@ enum gen_mi_alu_operand { #define GEN75_MI_STORE_URB_MEM_DW2_ADDR__SHR 6 +#define GEN8_MI_COPY_MEM_MEM__SIZE 5 +#define GEN8_MI_COPY_MEM_MEM_DW0_USE_GGTT_SRC (0x1 << 22) +#define GEN8_MI_COPY_MEM_MEM_DW0_USE_GGTT_DST (0x1 << 21) + +#define GEN8_MI_COPY_MEM_MEM_DW1_DST_ADDR__MASK 0xfffffffc +#define 
GEN8_MI_COPY_MEM_MEM_DW1_DST_ADDR__SHIFT 2 +#define GEN8_MI_COPY_MEM_MEM_DW1_DST_ADDR__SHR 2 + + +#define GEN8_MI_COPY_MEM_MEM_DW3_SRC_ADDR__MASK 0xfffffffc +#define GEN8_MI_COPY_MEM_MEM_DW3_SRC_ADDR__SHIFT 2 +#define GEN8_MI_COPY_MEM_MEM_DW3_SRC_ADDR__SHR 2 + + +#define GEN8_MI_ATOMIC__SIZE 11 +#define GEN8_MI_ATOMIC_DW0_USE_GGTT (0x1 << 22) +#define GEN8_MI_ATOMIC_DW0_POST_SYNC_OP (0x1 << 21) +#define GEN8_MI_ATOMIC_DW0_SIZE__MASK 0x00180000 +#define GEN8_MI_ATOMIC_DW0_SIZE__SHIFT 19 +#define GEN8_MI_ATOMIC_DW0_SIZE_DWORD (0x0 << 19) +#define GEN8_MI_ATOMIC_DW0_SIZE_QWORD (0x1 << 19) +#define GEN8_MI_ATOMIC_DW0_SIZE_OWORD (0x2 << 19) +#define GEN8_MI_ATOMIC_DW0_INLINE_DATA (0x1 << 18) +#define GEN8_MI_ATOMIC_DW0_CS_STALL (0x1 << 17) +#define GEN8_MI_ATOMIC_DW0_RETURN_DATA_CONTROL (0x1 << 16) +#define GEN8_MI_ATOMIC_DW0_OP__MASK 0x0000ff00 +#define GEN8_MI_ATOMIC_DW0_OP__SHIFT 8 + +#define GEN8_MI_ATOMIC_DW1_ADDR__MASK 0xfffffffc +#define GEN8_MI_ATOMIC_DW1_ADDR__SHIFT 2 +#define GEN8_MI_ATOMIC_DW1_ADDR__SHR 2 + + + #define GEN6_MI_BATCH_BUFFER_START__SIZE 3 #define GEN75_MI_BATCH_BUFFER_START_DW0_SECOND_LEVEL (0x1 << 22) #define GEN75_MI_BATCH_BUFFER_START_DW0_ADD_OFFSET_ENABLE (0x1 << 16) #define GEN75_MI_BATCH_BUFFER_START_DW0_PREDICATION_ENABLE (0x1 << 15) #define GEN75_MI_BATCH_BUFFER_START_DW0_NON_PRIVILEGED (0x1 << 13) #define GEN6_MI_BATCH_BUFFER_START_DW0_CLEAR_COMMAND_BUFFER (0x1 << 11) +#define GEN75_MI_BATCH_BUFFER_START_DW0_RS_ENABLE (0x1 << 10) #define GEN6_MI_BATCH_BUFFER_START_DW0_USE_PPGTT (0x1 << 8) #define GEN6_MI_BATCH_BUFFER_START_DW1_ADDR__MASK 0xfffffffc diff --git a/src/gallium/drivers/ilo/genhw/gen_regs.xml.h b/src/gallium/drivers/ilo/genhw/gen_regs.xml.h index c51e4f78bc0..54ec13eaafa 100644 --- a/src/gallium/drivers/ilo/genhw/gen_regs.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_regs.xml.h @@ -37,6 +37,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define GEN6_REG__SIZE 0x400000 #define GEN6_REG_NOPID 0x2094 + +#define GEN6_REG_SO_PRIM_STORAGE_NEEDED 0x2280 + +#define GEN6_REG_SO_NUM_PRIMS_WRITTEN 0x2288 + + +#define GEN7_REG_TS_GPGPU_THREADS_DISPATCHED 0x2290 + #define GEN7_REG_HS_INVOCATION_COUNT 0x2300 #define GEN7_REG_DS_INVOCATION_COUNT 0x2308 @@ -95,10 +103,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define GEN75_REG_CS_GPR__ESIZE 0x8 #define GEN75_REG_CS_GPR__LEN 0x10 +#define GEN7_REG_GPGPU_DISPATCHDIMX 0x2500 -#define GEN6_REG_SO_PRIM_STORAGE_NEEDED 0x2280 +#define GEN7_REG_GPGPU_DISPATCHDIMY 0x2504 -#define GEN6_REG_SO_NUM_PRIMS_WRITTEN 0x2288 +#define GEN7_REG_GPGPU_DISPATCHDIMZ 0x2508 #define GEN7_REG_SO_NUM_PRIMS_WRITTEN(i0) (0x5200 + 0x8*(i0)) @@ -118,8 +127,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define GEN7_REG_CACHE_MODE_0_HIZ_RAW_STALL_OPT_DISABLE (0x1 << 2) #define GEN7_REG_CACHE_MODE_1 0x7004 -#define GEN8_REG_CACHE_MODE_1_HIZ_NP_EARLY_Z_FAILS_DISABLE (0x1 << 13) -#define GEN8_REG_CACHE_MODE_1_HIZ_NP_PMA_FIX_ENABLE (0x1 << 11) +#define GEN8_REG_CACHE_MODE_1_NP_EARLY_Z_FAILS_DISABLE (0x1 << 13) +#define GEN8_REG_CACHE_MODE_1_NP_PMA_FIX_ENABLE (0x1 << 11) #define GEN8_REG_L3CNTLREG 0x7034 diff --git a/src/gallium/drivers/ilo/genhw/gen_render.xml.h b/src/gallium/drivers/ilo/genhw/gen_render.xml.h index 2e86ba96ae2..43d271d838a 100644 --- a/src/gallium/drivers/ilo/genhw/gen_render.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_render.xml.h @@ -102,6 +102,16 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define GEN7_RENDER_OPCODE_3DSTATE_URB_HS (0x31 << 16) #define GEN7_RENDER_OPCODE_3DSTATE_URB_DS (0x32 << 16) #define GEN7_RENDER_OPCODE_3DSTATE_URB_GS (0x33 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_VS (0x34 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_GS (0x35 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_HS (0x36 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_DS (0x37 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_PS (0x38 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_VS (0x43 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_GS (0x44 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_HS (0x45 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_DS (0x46 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_PS (0x47 << 16) #define GEN8_RENDER_OPCODE_3DSTATE_VF_INSTANCING (0x49 << 16) #define GEN8_RENDER_OPCODE_3DSTATE_VF_SGVS (0x4a << 16) #define GEN8_RENDER_OPCODE_3DSTATE_VF_TOPOLOGY (0x4b << 16) @@ -130,6 +140,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_PS (0x116 << 16) #define GEN7_RENDER_OPCODE_3DSTATE_SO_DECL_LIST (0x117 << 16) #define GEN7_RENDER_OPCODE_3DSTATE_SO_BUFFER (0x118 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_POOL_ALLOC (0x119 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_POOL_ALLOC (0x11a << 16) #define GEN8_RENDER_OPCODE_3DSTATE_SAMPLE_PATTERN (0x11c << 16) #define GEN6_RENDER_OPCODE_PIPE_CONTROL (0x200 << 16) #define GEN6_RENDER_OPCODE_3DPRIMITIVE (0x300 << 16) @@ -178,6 +190,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#define GEN8_SBA_MOCS__MASK 0x000007f0 #define GEN8_SBA_MOCS__SHIFT 4 #define GEN6_SBA_ADDR_MODIFIED (0x1 << 0) +#define GEN8_SBA_SIZE__MASK 0xfffff000 +#define GEN8_SBA_SIZE__SHIFT 12 +#define GEN8_SBA_SIZE__SHR 12 +#define GEN8_SBA_SIZE_MODIFIED (0x1 << 0) #define GEN6_BINDING_TABLE_ADDR__MASK 0x0000ffe0 #define GEN6_BINDING_TABLE_ADDR__SHIFT 5 #define GEN6_BINDING_TABLE_ADDR__SHR 5 diff --git a/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h index 52173fe5d07..c79a4f3a830 100644 --- a/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h @@ -168,7 +168,6 @@ enum gen_sample_count { GEN8_NUMSAMPLES_2 = 0x1, GEN6_NUMSAMPLES_4 = 0x2, GEN7_NUMSAMPLES_8 = 0x3, - GEN8_NUMSAMPLES_16 = 0x4, }; enum gen_inputattr_select { @@ -297,11 +296,58 @@ enum gen_msrast_mode { #define GEN7_URB_DW1_OFFSET__MASK 0x3e000000 #define GEN7_URB_DW1_OFFSET__SHIFT 25 +#define GEN75_URB_DW1_OFFSET__MASK 0x7e000000 +#define GEN75_URB_DW1_OFFSET__SHIFT 25 +#define GEN8_URB_DW1_OFFSET__MASK 0xfe000000 +#define GEN8_URB_DW1_OFFSET__SHIFT 25 #define GEN7_URB_DW1_ENTRY_SIZE__MASK 0x01ff0000 #define GEN7_URB_DW1_ENTRY_SIZE__SHIFT 16 #define GEN7_URB_DW1_ENTRY_COUNT__MASK 0x0000ffff #define GEN7_URB_DW1_ENTRY_COUNT__SHIFT 0 +#define GEN75_3DSTATE_GATHER_CONSTANT_ANY__SIZE 130 + + +#define GEN75_GATHER_CONST_DW1_BT_VALID__MASK 0xffff0000 +#define GEN75_GATHER_CONST_DW1_BT_VALID__SHIFT 16 +#define GEN75_GATHER_CONST_DW1_BT_BLOCK__MASK 0x0000f000 +#define GEN75_GATHER_CONST_DW1_BT_BLOCK__SHIFT 12 + +#define GEN75_GATHER_CONST_DW2_GATHER_BUFFER_OFFSET__MASK 0x007fffc0 +#define GEN75_GATHER_CONST_DW2_GATHER_BUFFER_OFFSET__SHIFT 6 +#define GEN75_GATHER_CONST_DW2_GATHER_BUFFER_OFFSET__SHR 6 +#define GEN8_GATHER_CONST_DW2_DX9_STALL (0x1 << 5) +#define GEN75_GATHER_CONST_DW2_DX9_ENABLE (0x1 << 4) + +#define GEN75_GATHER_CONST_DW_ENTRY_HIGH__MASK 0xffff0000 +#define GEN75_GATHER_CONST_DW_ENTRY_HIGH__SHIFT 
16 +#define GEN75_GATHER_CONST_DW_ENTRY_OFFSET__MASK 0x0000ff00 +#define GEN75_GATHER_CONST_DW_ENTRY_OFFSET__SHIFT 8 +#define GEN75_GATHER_CONST_DW_ENTRY_CHANNEL_MASK__MASK 0x000000f0 +#define GEN75_GATHER_CONST_DW_ENTRY_CHANNEL_MASK__SHIFT 4 +#define GEN75_GATHER_CONST_DW_ENTRY_BT_INDEX__MASK 0x0000001f +#define GEN75_GATHER_CONST_DW_ENTRY_BT_INDEX__SHIFT 0 + +#define GEN75_3DSTATE_BINDING_TABLE_EDIT_ANY__SIZE 258 + + +#define GEN75_BT_EDIT_DW1_BT_BLOCK_CLEAR__MASK 0xffff0000 +#define GEN75_BT_EDIT_DW1_BT_BLOCK_CLEAR__SHIFT 16 +#define GEN75_BT_EDIT_DW1_TARGET__MASK 0x00000003 +#define GEN75_BT_EDIT_DW1_TARGET__SHIFT 0 +#define GEN75_BT_EDIT_DW1_TARGET_CORE0 0x1 +#define GEN75_BT_EDIT_DW1_TARGET_CORE1 0x2 +#define GEN75_BT_EDIT_DW1_TARGET_ALL 0x3 + +#define GEN75_BT_EDIT_DW_ENTRY_BT_INDEX__MASK 0x00ff0000 +#define GEN75_BT_EDIT_DW_ENTRY_BT_INDEX__SHIFT 16 +#define GEN75_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__MASK 0x0000ffff +#define GEN75_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__SHIFT 0 +#define GEN75_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__SHR 5 +#define GEN8_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__MASK 0x0000ffff +#define GEN8_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__SHIFT 0 +#define GEN8_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__SHR 6 + #define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_ANY__SIZE 2 @@ -315,6 +361,48 @@ enum gen_msrast_mode { #define GEN75_PCB_ALLOC_DW1_SIZE__MASK 0x0000003f #define GEN75_PCB_ALLOC_DW1_SIZE__SHIFT 0 +#define GEN75_3DSTATE_BINDING_TABLE_POOL_ALLOC__SIZE 3 + + +#define GEN75_BT_POOL_ALLOC_DW1_ADDR__MASK 0xfffff000 +#define GEN75_BT_POOL_ALLOC_DW1_ADDR__SHIFT 12 +#define GEN75_BT_POOL_ALLOC_DW1_ADDR__SHR 12 +#define GEN75_BT_POOL_ALLOC_DW1_ENABLE (0x1 << 11) +#define GEN75_BT_POOL_ALLOC_DW1_MOCS__MASK 0x00000780 +#define GEN75_BT_POOL_ALLOC_DW1_MOCS__SHIFT 7 +#define GEN8_BT_POOL_ALLOC_DW1_MOCS__MASK 0x0000007f +#define GEN8_BT_POOL_ALLOC_DW1_MOCS__SHIFT 0 + +#define GEN75_BT_POOL_ALLOC_DW2_END_ADDR__MASK 0xfffff000 +#define 
GEN75_BT_POOL_ALLOC_DW2_END_ADDR__SHIFT 12 +#define GEN75_BT_POOL_ALLOC_DW2_END_ADDR__SHR 12 + + +#define GEN8_BT_POOL_ALLOC_DW3_SIZE__MASK 0xfffff000 +#define GEN8_BT_POOL_ALLOC_DW3_SIZE__SHIFT 12 +#define GEN8_BT_POOL_ALLOC_DW3_SIZE__SHR 12 + +#define GEN75_3DSTATE_GATHER_POOL_ALLOC__SIZE 3 + + +#define GEN75_GATHER_POOL_ALLOC_DW1_ADDR__MASK 0xfffff000 +#define GEN75_GATHER_POOL_ALLOC_DW1_ADDR__SHIFT 12 +#define GEN75_GATHER_POOL_ALLOC_DW1_ADDR__SHR 12 +#define GEN75_GATHER_POOL_ALLOC_DW1_ENABLE (0x1 << 11) +#define GEN75_GATHER_POOL_ALLOC_DW1_MOCS__MASK 0x0000000f +#define GEN75_GATHER_POOL_ALLOC_DW1_MOCS__SHIFT 0 +#define GEN8_GATHER_POOL_ALLOC_DW1_MOCS__MASK 0x0000007f +#define GEN8_GATHER_POOL_ALLOC_DW1_MOCS__SHIFT 0 + +#define GEN75_GATHER_POOL_ALLOC_DW2_END_ADDR__MASK 0xfffff000 +#define GEN75_GATHER_POOL_ALLOC_DW2_END_ADDR__SHIFT 12 +#define GEN75_GATHER_POOL_ALLOC_DW2_END_ADDR__SHR 12 + + +#define GEN8_GATHER_POOL_ALLOC_DW3_SIZE__MASK 0xfffff000 +#define GEN8_GATHER_POOL_ALLOC_DW3_SIZE__SHIFT 12 +#define GEN8_GATHER_POOL_ALLOC_DW3_SIZE__SHR 12 + #define GEN6_3DSTATE_VERTEX_BUFFERS__SIZE 133 @@ -402,15 +490,15 @@ enum gen_msrast_mode { #define GEN8_SGVS_DW1_IID_ENABLE (0x1 << 31) -#define GEN8_SGVS_DW1_IID_VE_COMP__MASK 0x60000000 -#define GEN8_SGVS_DW1_IID_VE_COMP__SHIFT 29 -#define GEN8_SGVS_DW1_IID_VE_INDEX__MASK 0x003f0000 -#define GEN8_SGVS_DW1_IID_VE_INDEX__SHIFT 16 +#define GEN8_SGVS_DW1_IID_COMP__MASK 0x60000000 +#define GEN8_SGVS_DW1_IID_COMP__SHIFT 29 +#define GEN8_SGVS_DW1_IID_OFFSET__MASK 0x003f0000 +#define GEN8_SGVS_DW1_IID_OFFSET__SHIFT 16 #define GEN8_SGVS_DW1_VID_ENABLE (0x1 << 15) -#define GEN8_SGVS_DW1_VID_VE_COMP__MASK 0x00006000 -#define GEN8_SGVS_DW1_VID_VE_COMP__SHIFT 13 -#define GEN8_SGVS_DW1_VID_VE_INDEX__MASK 0x0000003f -#define GEN8_SGVS_DW1_VID_VE_INDEX__SHIFT 0 +#define GEN8_SGVS_DW1_VID_COMP__MASK 0x00006000 +#define GEN8_SGVS_DW1_VID_COMP__SHIFT 13 +#define GEN8_SGVS_DW1_VID_OFFSET__MASK 0x0000003f +#define 
GEN8_SGVS_DW1_VID_OFFSET__SHIFT 0 #define GEN8_3DSTATE_VF_TOPOLOGY__SIZE 2 @@ -464,6 +552,10 @@ enum gen_msrast_mode { #define GEN7_3DSTATE_POINTERS_ANY__SIZE 2 +#define GEN7_PTR_DW1_ADDR__MASK 0xffffffe0 +#define GEN7_PTR_DW1_ADDR__SHIFT 5 +#define GEN7_PTR_DW1_ADDR__SHR 5 +#define GEN8_PTR_DW1_CHANGED (0x1 << 0) #define GEN6_3DSTATE_VS__SIZE 9 @@ -513,12 +605,14 @@ enum gen_msrast_mode { #define GEN8_VS_DW7_CACHE_DISABLE (0x1 << 1) #define GEN8_VS_DW7_VS_ENABLE (0x1 << 0) -#define GEN8_VS_DW8_URB_WRITE_OFFSET__MASK 0x03e00000 -#define GEN8_VS_DW8_URB_WRITE_OFFSET__SHIFT 21 -#define GEN8_VS_DW8_URB_WRITE_LEN__MASK 0x001f0000 -#define GEN8_VS_DW8_URB_WRITE_LEN__SHIFT 16 +#define GEN8_VS_DW8_VUE_OUT_READ_OFFSET__MASK 0x07e00000 +#define GEN8_VS_DW8_VUE_OUT_READ_OFFSET__SHIFT 21 +#define GEN8_VS_DW8_VUE_OUT_LEN__MASK 0x001f0000 +#define GEN8_VS_DW8_VUE_OUT_LEN__SHIFT 16 #define GEN8_VS_DW8_UCP_CLIP_ENABLES__MASK 0x0000ff00 #define GEN8_VS_DW8_UCP_CLIP_ENABLES__SHIFT 8 +#define GEN8_VS_DW8_UCP_CULL_ENABLES__MASK 0x000000ff +#define GEN8_VS_DW8_UCP_CULL_ENABLES__SHIFT 0 #define GEN7_3DSTATE_HS__SIZE 9 @@ -558,11 +652,11 @@ enum gen_msrast_mode { -#define GEN8_HS_DW1_DISPATCH_MAX_THREADS__MASK 0x000000ff -#define GEN8_HS_DW1_DISPATCH_MAX_THREADS__SHIFT 0 #define GEN8_HS_DW2_HS_ENABLE (0x1 << 31) #define GEN8_HS_DW2_STATISTICS (0x1 << 29) +#define GEN8_HS_DW2_MAX_THREADS__MASK 0x0001ff00 +#define GEN8_HS_DW2_MAX_THREADS__SHIFT 8 #define GEN8_HS_DW2_INSTANCE_COUNT__MASK 0x0000000f #define GEN8_HS_DW2_INSTANCE_COUNT__SHIFT 0 @@ -584,9 +678,6 @@ enum gen_msrast_mode { #define GEN8_HS_DW7_URB_READ_OFFSET__MASK 0x000003f0 #define GEN8_HS_DW7_URB_READ_OFFSET__SHIFT 4 -#define GEN8_HS_DW8_URB_SEMAPHORE_ADDR__MASK 0x00001fff -#define GEN8_HS_DW8_URB_SEMAPHORE_ADDR__SHIFT 0 -#define GEN8_HS_DW8_URB_SEMAPHORE_ADDR__SHR 6 #define GEN7_3DSTATE_TE__SIZE 4 @@ -660,16 +751,19 @@ enum gen_msrast_mode { #define GEN8_DS_DW7_MAX_THREADS__MASK 0x3fe00000 #define 
GEN8_DS_DW7_MAX_THREADS__SHIFT 21 #define GEN8_DS_DW7_STATISTICS (0x1 << 10) +#define GEN8_DS_DW7_SIMD8_ENABLE (0x1 << 3) #define GEN8_DS_DW7_COMPUTE_W (0x1 << 2) #define GEN8_DS_DW7_CACHE_DISABLE (0x1 << 1) #define GEN8_DS_DW7_DS_ENABLE (0x1 << 0) -#define GEN8_DS_DW8_URB_WRITE_OFFSET__MASK 0x03e00000 -#define GEN8_DS_DW8_URB_WRITE_OFFSET__SHIFT 21 -#define GEN8_DS_DW8_URB_WRITE_LEN__MASK 0x001f0000 -#define GEN8_DS_DW8_URB_WRITE_LEN__SHIFT 16 +#define GEN8_DS_DW8_VUE_OUT_READ_OFFSET__MASK 0x07e00000 +#define GEN8_DS_DW8_VUE_OUT_READ_OFFSET__SHIFT 21 +#define GEN8_DS_DW8_VUE_OUT_LEN__MASK 0x001f0000 +#define GEN8_DS_DW8_VUE_OUT_LEN__SHIFT 16 #define GEN8_DS_DW8_UCP_CLIP_ENABLES__MASK 0x0000ff00 #define GEN8_DS_DW8_UCP_CLIP_ENABLES__SHIFT 8 +#define GEN8_DS_DW8_UCP_CULL_ENABLES__MASK 0x000000ff +#define GEN8_DS_DW8_UCP_CULL_ENABLES__SHIFT 0 @@ -771,7 +865,7 @@ enum gen_msrast_mode { #define GEN8_GS_DW1_KERNEL_ADDR__SHR 6 -#define GEN8_GS_DW3_EXPECTED_VERTEX_COUNT__MASK 0x0000007f +#define GEN8_GS_DW3_EXPECTED_VERTEX_COUNT__MASK 0x0000003f #define GEN8_GS_DW3_EXPECTED_VERTEX_COUNT__SHIFT 0 @@ -815,18 +909,20 @@ enum gen_msrast_mode { #define GEN8_GS_DW8_GSCTRL__SHIFT 31 #define GEN8_GS_DW8_GSCTRL_CUT (0x0 << 31) #define GEN8_GS_DW8_GSCTRL_SID (0x1 << 31) -#define GEN8_GS_DW8_URB_SEMAPHORE_ADDR__MASK 0x00001fff -#define GEN8_GS_DW8_URB_SEMAPHORE_ADDR__SHIFT 0 -#define GEN8_GS_DW8_URB_SEMAPHORE_ADDR__SHR 6 -#define GEN9_GS_DW8_MAX_THREADS__MASK 0x00001fff +#define GEN8_GS_DW8_STATIC_OUTPUT (0x1 << 30) +#define GEN8_GS_DW8_STATIC_OUTPUT_VERTEX_COUNT__MASK 0x07ff0000 +#define GEN8_GS_DW8_STATIC_OUTPUT_VERTEX_COUNT__SHIFT 16 +#define GEN9_GS_DW8_MAX_THREADS__MASK 0x000001ff #define GEN9_GS_DW8_MAX_THREADS__SHIFT 0 -#define GEN8_GS_DW9_URB_WRITE_OFFSET__MASK 0x03e00000 -#define GEN8_GS_DW9_URB_WRITE_OFFSET__SHIFT 21 -#define GEN8_GS_DW9_URB_WRITE_LEN__MASK 0x001f0000 -#define GEN8_GS_DW9_URB_WRITE_LEN__SHIFT 16 +#define GEN8_GS_DW9_VUE_OUT_READ_OFFSET__MASK 0x07e00000 
+#define GEN8_GS_DW9_VUE_OUT_READ_OFFSET__SHIFT 21 +#define GEN8_GS_DW9_VUE_OUT_LEN__MASK 0x001f0000 +#define GEN8_GS_DW9_VUE_OUT_LEN__SHIFT 16 #define GEN8_GS_DW9_UCP_CLIP_ENABLES__MASK 0x0000ff00 #define GEN8_GS_DW9_UCP_CLIP_ENABLES__SHIFT 8 +#define GEN8_GS_DW9_UCP_CULL_ENABLES__MASK 0x000000ff +#define GEN8_GS_DW9_UCP_CULL_ENABLES__SHIFT 0 #define GEN7_3DSTATE_STREAMOUT__SIZE 5 @@ -838,6 +934,11 @@ enum gen_msrast_mode { #define GEN7_SO_DW1_REORDER_MODE__MASK 0x04000000 #define GEN7_SO_DW1_REORDER_MODE__SHIFT 26 #define GEN7_SO_DW1_STATISTICS (0x1 << 25) +#define GEN8_SO_DW1_FORCE_RENDERING__MASK 0x01800000 +#define GEN8_SO_DW1_FORCE_RENDERING__SHIFT 23 +#define GEN8_SO_DW1_FORCE_RENDERING_NORMAL (0x0 << 23) +#define GEN8_SO_DW1_FORCE_RENDERING_OFF (0x2 << 23) +#define GEN8_SO_DW1_FORCE_RENDERING_ON (0x3 << 23) #define GEN7_SO_DW1_BUFFER_ENABLES__MASK 0x00000f00 #define GEN7_SO_DW1_BUFFER_ENABLES__SHIFT 8 @@ -928,9 +1029,9 @@ enum gen_msrast_mode { -#define GEN8_SO_BUF_DW5_OFFSET_ADDR__MASK 0xfffffffc -#define GEN8_SO_BUF_DW5_OFFSET_ADDR__SHIFT 2 -#define GEN8_SO_BUF_DW5_OFFSET_ADDR__SHR 2 +#define GEN8_SO_BUF_DW5_OFFSET_ADDR_ADDR__MASK 0xfffffffc +#define GEN8_SO_BUF_DW5_OFFSET_ADDR_ADDR__SHIFT 2 +#define GEN8_SO_BUF_DW5_OFFSET_ADDR_ADDR__SHR 2 @@ -939,6 +1040,7 @@ enum gen_msrast_mode { #define GEN7_CLIP_DW1_FRONT_WINDING__MASK 0x00100000 #define GEN7_CLIP_DW1_FRONT_WINDING__SHIFT 20 +#define GEN8_CLIP_DW1_FORCE_UCP_CULL_ENABLES (0x1 << 20) #define GEN7_CLIP_DW1_SUBPIXEL__MASK 0x00080000 #define GEN7_CLIP_DW1_SUBPIXEL__SHIFT 19 #define GEN7_CLIP_DW1_SUBPIXEL_8BITS (0x0 << 19) @@ -946,6 +1048,8 @@ enum gen_msrast_mode { #define GEN7_CLIP_DW1_EARLY_CULL_ENABLE (0x1 << 18) #define GEN7_CLIP_DW1_CULL_MODE__MASK 0x00030000 #define GEN7_CLIP_DW1_CULL_MODE__SHIFT 16 +#define GEN8_CLIP_DW1_FORCE_UCP_CLIP_ENABLES (0x1 << 17) +#define GEN8_CLIP_DW1_FORCE_CLIP_MODE (0x1 << 16) #define GEN6_CLIP_DW1_STATISTICS (0x1 << 10) #define GEN6_CLIP_DW1_UCP_CULL_ENABLES__MASK 
0x000000ff #define GEN6_CLIP_DW1_UCP_CULL_ENABLES__SHIFT 0 @@ -1026,6 +1130,7 @@ enum gen_msrast_mode { #define GEN7_SF_DW3_TRIFAN_PROVOKE__MASK 0x06000000 #define GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT 25 #define GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE (0x1 << 14) +#define GEN8_SF_DW3_SMOOTH_POINT_ENABLE (0x1 << 13) #define GEN7_SF_DW3_SUBPIXEL__MASK 0x00001000 #define GEN7_SF_DW3_SUBPIXEL__SHIFT 12 #define GEN7_SF_DW3_SUBPIXEL_8BITS (0x0 << 12) @@ -1037,8 +1142,8 @@ enum gen_msrast_mode { #define GEN7_3DSTATE_SBE_DW1__SIZE 13 -#define GEN8_SBE_DW1_USE_URB_READ_LEN (0x1 << 29) -#define GEN8_SBE_DW1_USE_URB_READ_OFFSET (0x1 << 28) +#define GEN8_SBE_DW1_FORCE_URB_READ_LEN (0x1 << 29) +#define GEN8_SBE_DW1_FORCE_URB_READ_OFFSET (0x1 << 28) #define GEN7_SBE_DW1_ATTR_SWIZZLE__MASK 0x10000000 #define GEN7_SBE_DW1_ATTR_SWIZZLE__SHIFT 28 #define GEN7_SBE_DW1_ATTR_SWIZZLE_0_15 (0x0 << 28) @@ -1050,21 +1155,28 @@ enum gen_msrast_mode { #define GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD__SHIFT 20 #define GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_UPPERLEFT (0x0 << 20) #define GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_LOWERLEFT (0x1 << 20) +#define GEN8_SBE_DW1_PID_OVERRIDE_W (0x1 << 19) +#define GEN8_SBE_DW1_PID_OVERRIDE_Z (0x1 << 18) +#define GEN8_SBE_DW1_PID_OVERRIDE_Y (0x1 << 17) +#define GEN8_SBE_DW1_PID_OVERRIDE_X (0x1 << 16) #define GEN7_SBE_DW1_URB_READ_LEN__MASK 0x0000f800 #define GEN7_SBE_DW1_URB_READ_LEN__SHIFT 11 #define GEN7_SBE_DW1_URB_READ_OFFSET__MASK 0x000003f0 #define GEN7_SBE_DW1_URB_READ_OFFSET__SHIFT 4 #define GEN8_SBE_DW1_URB_READ_OFFSET__MASK 0x000007e0 #define GEN8_SBE_DW1_URB_READ_OFFSET__SHIFT 5 +#define GEN8_SBE_DW1_PID_OVERRIDE_ATTR__MASK 0x0000001f +#define GEN8_SBE_DW1_PID_OVERRIDE_ATTR__SHIFT 0 #define GEN8_3DSTATE_SBE_SWIZ_DW1_DW8__SIZE 8 #define GEN8_SBE_SWIZ_HIGH__MASK 0xffff0000 #define GEN8_SBE_SWIZ_HIGH__SHIFT 16 -#define GEN8_SBE_SWIZ_OVERRIDE_W (0x1 << 15) -#define GEN8_SBE_SWIZ_OVERRIDE_Z (0x1 << 14) -#define GEN8_SBE_SWIZ_OVERRIDE_Y (0x1 << 13) -#define 
GEN8_SBE_SWIZ_OVERRIDE_X (0x1 << 12) +#define GEN8_SBE_SWIZ_CONST_OVERRIDE_W (0x1 << 15) +#define GEN8_SBE_SWIZ_CONST_OVERRIDE_Z (0x1 << 14) +#define GEN8_SBE_SWIZ_CONST_OVERRIDE_Y (0x1 << 13) +#define GEN8_SBE_SWIZ_CONST_OVERRIDE_X (0x1 << 12) +#define GEN8_SBE_SWIZ_SWIZZLE_CONTROL (0x1 << 11) #define GEN8_SBE_SWIZ_CONST__MASK 0x00000600 #define GEN8_SBE_SWIZ_CONST__SHIFT 9 #define GEN8_SBE_SWIZ_CONST_0000 (0x0 << 9) @@ -1126,12 +1238,28 @@ enum gen_msrast_mode { #define GEN9_RASTER_DW1_Z_TEST_FAR_ENABLE (0x1 << 26) +#define GEN8_RASTER_DW1_API__MASK 0x00c00000 +#define GEN8_RASTER_DW1_API__SHIFT 22 +#define GEN8_RASTER_DW1_API_DX9_OGL (0x0 << 22) +#define GEN8_RASTER_DW1_API_DX10 (0x1 << 22) +#define GEN8_RASTER_DW1_API_DX10_1 (0x2 << 22) #define GEN8_RASTER_DW1_FRONT_WINDING__MASK 0x00200000 #define GEN8_RASTER_DW1_FRONT_WINDING__SHIFT 21 +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT__MASK 0x001c0000 +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT__SHIFT 18 +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_0 (0x0 << 18) +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_1 (0x1 << 18) +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_2 (0x2 << 18) +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_4 (0x3 << 18) +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_8 (0x4 << 18) +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_16 (0x5 << 18) #define GEN8_RASTER_DW1_CULL_MODE__MASK 0x00030000 #define GEN8_RASTER_DW1_CULL_MODE__SHIFT 16 +#define GEN8_RASTER_DW1_FORCE_MULTISAMPLE_ENABLE (0x1 << 14) #define GEN8_RASTER_DW1_SMOOTH_POINT_ENABLE (0x1 << 13) -#define GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE (0x1 << 12) +#define GEN8_RASTER_DW1_DX_MULTISAMPLE_ENABLE (0x1 << 12) +#define GEN8_RASTER_DW1_DX_MSRASTMODE__MASK 0x00000c00 +#define GEN8_RASTER_DW1_DX_MSRASTMODE__SHIFT 10 #define GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID (0x1 << 9) #define GEN8_RASTER_DW1_DEPTH_OFFSET_WIREFRAME (0x1 << 8) #define 
GEN8_RASTER_DW1_DEPTH_OFFSET_POINT (0x1 << 7) @@ -1223,10 +1351,10 @@ enum gen_msrast_mode { #define GEN7_WM_DW1_STATISTICS (0x1 << 31) -#define GEN7_WM_DW1_DEPTH_CLEAR (0x1 << 30) +#define GEN7_WM_DW1_LEGACY_DEPTH_CLEAR (0x1 << 30) #define GEN7_WM_DW1_PS_DISPATCH_ENABLE (0x1 << 29) -#define GEN7_WM_DW1_DEPTH_RESOLVE (0x1 << 28) -#define GEN7_WM_DW1_HIZ_RESOLVE (0x1 << 27) +#define GEN7_WM_DW1_LEGACY_DEPTH_RESOLVE (0x1 << 28) +#define GEN7_WM_DW1_LEGACY_HIZ_RESOLVE (0x1 << 27) #define GEN7_WM_DW1_LEGACY_LINE_RAST (0x1 << 26) #define GEN7_WM_DW1_PS_KILL_PIXEL (0x1 << 25) #define GEN7_WM_DW1_PSCDEPTH__MASK 0x01800000 @@ -1235,6 +1363,11 @@ enum gen_msrast_mode { #define GEN7_WM_DW1_EDSC__SHIFT 21 #define GEN7_WM_DW1_PS_USE_DEPTH (0x1 << 20) #define GEN7_WM_DW1_PS_USE_W (0x1 << 19) +#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE__MASK 0x00180000 +#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE__SHIFT 19 +#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE_NORMAL (0x0 << 19) +#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE_OFF (0x1 << 19) +#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE_ON (0x2 << 19) #define GEN7_WM_DW1_ZW_INTERP__MASK 0x00060000 #define GEN7_WM_DW1_ZW_INTERP__SHIFT 17 #define GEN7_WM_DW1_BARYCENTRIC_INTERP__MASK 0x0001f800 @@ -1261,6 +1394,11 @@ enum gen_msrast_mode { #define GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT (0x1 << 2) #define GEN7_WM_DW1_MSRASTMODE__MASK 0x00000003 #define GEN7_WM_DW1_MSRASTMODE__SHIFT 0 +#define GEN8_WM_DW1_FORCE_KILL_PIXEL__MASK 0x00000003 +#define GEN8_WM_DW1_FORCE_KILL_PIXEL__SHIFT 0 +#define GEN8_WM_DW1_FORCE_KILL_PIXEL_NORMAL 0x0 +#define GEN8_WM_DW1_FORCE_KILL_PIXEL_OFF 0x1 +#define GEN8_WM_DW1_FORCE_KILL_PIXEL_ON 0x2 #define GEN7_WM_DW2_MSDISPMODE__MASK 0x80000000 #define GEN7_WM_DW2_MSDISPMODE__SHIFT 31 @@ -1271,6 +1409,7 @@ enum gen_msrast_mode { #define GEN8_3DSTATE_WM_CHROMAKEY__SIZE 2 +#define GEN8_CHROMAKEY_DW1_KILL_ENABLE (0x1 << 31) #define GEN8_3DSTATE_WM_DEPTH_STENCIL__SIZE 4 @@ -1318,6 +1457,7 @@ enum gen_msrast_mode { #define 
GEN8_WM_HZ_DW1_STENCIL_CLEAR (0x1 << 31) #define GEN8_WM_HZ_DW1_DEPTH_CLEAR (0x1 << 30) +#define GEN8_WM_HZ_DW1_SCISSOR_ENABLE (0x1 << 29) #define GEN8_WM_HZ_DW1_DEPTH_RESOLVE (0x1 << 28) #define GEN8_WM_HZ_DW1_HIZ_RESOLVE (0x1 << 27) #define GEN8_WM_HZ_DW1_PIXEL_OFFSET_ENABLE (0x1 << 26) @@ -1443,17 +1583,17 @@ enum gen_msrast_mode { #define GEN8_PS_BLEND_DW1_ALPHA_TO_COVERAGE (0x1 << 31) #define GEN8_PS_BLEND_DW1_WRITABLE_RT (0x1 << 30) -#define GEN8_PS_BLEND_DW1_BLEND_ENABLE (0x1 << 29) -#define GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR__MASK 0x1f000000 -#define GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR__SHIFT 24 -#define GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR__MASK 0x00f80000 -#define GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR__SHIFT 19 -#define GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR__MASK 0x0007c000 -#define GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR__SHIFT 14 -#define GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR__MASK 0x00003e00 -#define GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR__SHIFT 9 +#define GEN8_PS_BLEND_DW1_RT0_BLEND_ENABLE (0x1 << 29) +#define GEN8_PS_BLEND_DW1_RT0_SRC_ALPHA_FACTOR__MASK 0x1f000000 +#define GEN8_PS_BLEND_DW1_RT0_SRC_ALPHA_FACTOR__SHIFT 24 +#define GEN8_PS_BLEND_DW1_RT0_DST_ALPHA_FACTOR__MASK 0x00f80000 +#define GEN8_PS_BLEND_DW1_RT0_DST_ALPHA_FACTOR__SHIFT 19 +#define GEN8_PS_BLEND_DW1_RT0_SRC_COLOR_FACTOR__MASK 0x0007c000 +#define GEN8_PS_BLEND_DW1_RT0_SRC_COLOR_FACTOR__SHIFT 14 +#define GEN8_PS_BLEND_DW1_RT0_DST_COLOR_FACTOR__MASK 0x00003e00 +#define GEN8_PS_BLEND_DW1_RT0_DST_COLOR_FACTOR__SHIFT 9 #define GEN8_PS_BLEND_DW1_ALPHA_TEST_ENABLE (0x1 << 8) -#define GEN8_PS_BLEND_DW1_INDEPENDENT_ALPHA_ENABLE (0x1 << 7) +#define GEN8_PS_BLEND_DW1_RT0_INDEPENDENT_ALPHA_ENABLE (0x1 << 7) #define GEN6_3DSTATE_CONSTANT_ANY__SIZE 11 @@ -1469,6 +1609,8 @@ enum gen_msrast_mode { #define GEN6_CONSTANT_DW_ADDR_ADDR__SHR 5 +#define GEN8_CONSTANT_DW0_MOCS__MASK 0x00007f00 +#define GEN8_CONSTANT_DW0_MOCS__SHIFT 8 #define GEN7_CONSTANT_DW1_BUFFER1_READ_LEN__MASK 0xffff0000 #define 
GEN7_CONSTANT_DW1_BUFFER1_READ_LEN__SHIFT 16 @@ -1502,6 +1644,8 @@ enum gen_msrast_mode { #define GEN6_3DSTATE_DRAWING_RECTANGLE__SIZE 4 +#define GEN8_DRAWING_RECTANGLE_DW0_CORE_MODE_SELECT__MASK 0x0000c000 +#define GEN8_DRAWING_RECTANGLE_DW0_CORE_MODE_SELECT__SHIFT 14 #define GEN6_DRAWING_RECTANGLE_DW1_MIN_Y__MASK 0xffff0000 #define GEN6_DRAWING_RECTANGLE_DW1_MIN_Y__SHIFT 16 @@ -1624,15 +1768,12 @@ enum gen_msrast_mode { #define GEN8_DEPTH_DW5_MOCS__MASK 0x0000007f #define GEN8_DEPTH_DW5_MOCS__SHIFT 0 -#define GEN8_DEPTH_DW6_OFFSET_Y__MASK 0xffff0000 -#define GEN8_DEPTH_DW6_OFFSET_Y__SHIFT 16 -#define GEN8_DEPTH_DW6_OFFSET_X__MASK 0x0000ffff -#define GEN8_DEPTH_DW6_OFFSET_X__SHIFT 0 #define GEN8_DEPTH_DW7_RT_VIEW_EXTENT__MASK 0xffe00000 #define GEN8_DEPTH_DW7_RT_VIEW_EXTENT__SHIFT 21 #define GEN8_DEPTH_DW7_QPITCH__MASK 0x00007fff #define GEN8_DEPTH_DW7_QPITCH__SHIFT 0 +#define GEN8_DEPTH_DW7_QPITCH__SHR 2 #define GEN6_3DSTATE_POLY_STIPPLE_OFFSET__SIZE 2 @@ -1649,6 +1790,11 @@ enum gen_msrast_mode { #define GEN6_3DSTATE_LINE_STIPPLE__SIZE 3 +#define GEN6_LINE_STIPPLE_DW1_CURRENT_MODIFY_ENABLE (0x1 << 31) +#define GEN6_LINE_STIPPLE_DW1_CURRENT_REPEAT_COUNTER__MASK 0x3fe00000 +#define GEN6_LINE_STIPPLE_DW1_CURRENT_REPEAT_COUNTER__SHIFT 21 +#define GEN6_LINE_STIPPLE_DW1_CURRENT_STIPPLE_INDEX__MASK 0x000f0000 +#define GEN6_LINE_STIPPLE_DW1_CURRENT_STIPPLE_INDEX__SHIFT 16 #define GEN6_LINE_STIPPLE_DW1_PATTERN__MASK 0x0000ffff #define GEN6_LINE_STIPPLE_DW1_PATTERN__SHIFT 0 @@ -1664,16 +1810,28 @@ enum gen_msrast_mode { #define GEN6_3DSTATE_AA_LINE_PARAMETERS__SIZE 3 +#define GEN8_AA_LINE_DW1_POINT_BIAS__MASK 0xff000000 +#define GEN8_AA_LINE_DW1_POINT_BIAS__SHIFT 24 +#define GEN8_AA_LINE_DW1_POINT_BIAS__RADIX 8 #define GEN6_AA_LINE_DW1_BIAS__MASK 0x00ff0000 #define GEN6_AA_LINE_DW1_BIAS__SHIFT 16 #define GEN6_AA_LINE_DW1_BIAS__RADIX 8 +#define GEN8_AA_LINE_DW1_POINT_SLOPE__MASK 0x0000ff00 +#define GEN8_AA_LINE_DW1_POINT_SLOPE__SHIFT 8 +#define 
GEN8_AA_LINE_DW1_POINT_SLOPE__RADIX 8 #define GEN6_AA_LINE_DW1_SLOPE__MASK 0x000000ff #define GEN6_AA_LINE_DW1_SLOPE__SHIFT 0 #define GEN6_AA_LINE_DW1_SLOPE__RADIX 8 +#define GEN8_AA_LINE_DW2_POINT_CAP_BIAS__MASK 0xff000000 +#define GEN8_AA_LINE_DW2_POINT_CAP_BIAS__SHIFT 24 +#define GEN8_AA_LINE_DW2_POINT_CAP_BIAS__RADIX 8 #define GEN6_AA_LINE_DW2_CAP_BIAS__MASK 0x00ff0000 #define GEN6_AA_LINE_DW2_CAP_BIAS__SHIFT 16 #define GEN6_AA_LINE_DW2_CAP_BIAS__RADIX 8 +#define GEN8_AA_LINE_DW2_POINT_CAP_SLOPE__MASK 0x0000ff00 +#define GEN8_AA_LINE_DW2_POINT_CAP_SLOPE__SHIFT 8 +#define GEN8_AA_LINE_DW2_POINT_CAP_SLOPE__RADIX 8 #define GEN6_AA_LINE_DW2_CAP_SLOPE__MASK 0x000000ff #define GEN6_AA_LINE_DW2_CAP_SLOPE__SHIFT 0 #define GEN6_AA_LINE_DW2_CAP_SLOPE__RADIX 8 @@ -1690,7 +1848,7 @@ enum gen_msrast_mode { #define GEN6_3DSTATE_MULTISAMPLE__SIZE 4 -#define GEN75_MULTISAMPLE_DW1_DX9_MULTISAMPLE_ENABLE (0x1 << 5) +#define GEN75_MULTISAMPLE_DW1_PIXEL_OFFSET_ENABLE (0x1 << 5) #define GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__MASK 0x00000010 #define GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__SHIFT 4 #define GEN6_MULTISAMPLE_DW1_NUM_SAMPLES__MASK 0x0000000e @@ -1724,6 +1882,7 @@ enum gen_msrast_mode { #define GEN8_STENCIL_DW4_QPITCH__MASK 0x00007fff #define GEN8_STENCIL_DW4_QPITCH__SHIFT 0 +#define GEN8_STENCIL_DW4_QPITCH__SHR 2 #define GEN6_3DSTATE_HIER_DEPTH_BUFFER__SIZE 5 @@ -1739,6 +1898,7 @@ enum gen_msrast_mode { #define GEN8_HIZ_DW4_QPITCH__MASK 0x00007fff #define GEN8_HIZ_DW4_QPITCH__SHIFT 0 +#define GEN8_HIZ_DW4_QPITCH__SHR 2 #define GEN6_3DSTATE_CLEAR_PARAMS__SIZE 3 diff --git a/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h index b65b704adc6..b2c2142af78 100644 --- a/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h @@ -430,8 +430,10 @@ enum gen_key_filter { #define GEN7_SAMPLER_DW0_BORDER_COLOR_MODE_DX9 (0x1 << 29) #define 
GEN6_SAMPLER_DW0_LOD_PRECLAMP_ENABLE (0x1 << 28) #define GEN6_SAMPLER_DW0_MIN_MAG_NOT_EQUAL (0x1 << 27) -#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_ENABLE__MASK 0x18000000 -#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_ENABLE__SHIFT 27 +#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_MODE__MASK 0x18000000 +#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_MODE__SHIFT 27 +#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_MODE_NONE (0x0 << 27) +#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_MODE_OGL (0x2 << 27) #define GEN6_SAMPLER_DW0_BASE_LOD__MASK 0x07c00000 #define GEN6_SAMPLER_DW0_BASE_LOD__SHIFT 22 #define GEN6_SAMPLER_DW0_BASE_LOD__RADIX 1 @@ -493,23 +495,11 @@ enum gen_key_filter { #define GEN6_SAMPLER_DW2_BORDER_COLOR_ADDR__SHIFT 5 #define GEN6_SAMPLER_DW2_BORDER_COLOR_ADDR__SHR 5 -#define GEN8_SAMPLER_DW2_SEP_FILTER_COEFF_TABLE_SIZE__MASK 0xc0000000 -#define GEN8_SAMPLER_DW2_SEP_FILTER_COEFF_TABLE_SIZE__SHIFT 30 -#define GEN8_SAMPLER_DW2_SEP_FILTER_WIDTH__MASK 0x30000000 -#define GEN8_SAMPLER_DW2_SEP_FILTER_WIDTH__SHIFT 28 -#define GEN8_SAMPLER_DW2_SEP_FILTER_HEIGHT__MASK 0x0c000000 -#define GEN8_SAMPLER_DW2_SEP_FILTER_HEIGHT__SHIFT 26 #define GEN8_SAMPLER_DW2_INDIRECT_STATE_ADDR__MASK 0x00ffffc0 #define GEN8_SAMPLER_DW2_INDIRECT_STATE_ADDR__SHIFT 6 #define GEN8_SAMPLER_DW2_INDIRECT_STATE_ADDR__SHR 6 -#define GEN8_SAMPLER_DW2_FLEXIBLE_FILTER_MODE (0x1 << 4) -#define GEN8_SAMPLER_DW2_FLEXIBLE_FILTER_COEFF_SIZE (0x1 << 3) -#define GEN8_SAMPLER_DW2_FLEXIBLE_FILTER_HALIGN (0x1 << 2) -#define GEN8_SAMPLER_DW2_FLEXIBLE_FILTER_VALIGN (0x1 << 1) #define GEN8_SAMPLER_DW2_LOD_CLAMP_MAG_MODE (0x1 << 0) -#define GEN8_SAMPLER_DW3_NON_SEP_FILTER_FOOTPRINT_MASK__MASK 0xff000000 -#define GEN8_SAMPLER_DW3_NON_SEP_FILTER_FOOTPRINT_MASK__SHIFT 24 #define GEN6_SAMPLER_DW3_CHROMAKEY_ENABLE (0x1 << 25) #define GEN6_SAMPLER_DW3_CHROMAKEY_INDEX__MASK 0x01800000 #define GEN6_SAMPLER_DW3_CHROMAKEY_INDEX__SHIFT 23 diff --git a/src/gallium/drivers/ilo/genhw/gen_render_media.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_media.xml.h index 
55d830bad32..2476002ec91 100644 --- a/src/gallium/drivers/ilo/genhw/gen_render_media.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_render_media.xml.h @@ -111,6 +111,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define GEN8_IDRT_DW5_CURBE_READ_LEN__MASK 0xffff0000 #define GEN8_IDRT_DW5_CURBE_READ_LEN__SHIFT 16 +#define GEN8_IDRT_DW5_CURBE_READ_OFFSET__MASK 0x0000ffff +#define GEN8_IDRT_DW5_CURBE_READ_OFFSET__SHIFT 0 #define GEN8_IDRT_DW6_ROUNDING_MODE__MASK 0x00c00000 #define GEN8_IDRT_DW6_ROUNDING_MODE__SHIFT 22 @@ -121,7 +123,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define GEN8_IDRT_DW6_BARRIER_ENABLE (0x1 << 21) #define GEN8_IDRT_DW6_SLM_SIZE__MASK 0x001f0000 #define GEN8_IDRT_DW6_SLM_SIZE__SHIFT 16 -#define GEN8_IDRT_DW6_THREAD_GROUP_SIZE__MASK 0x000000ff +#define GEN8_IDRT_DW6_THREAD_GROUP_SIZE__MASK 0x000003ff #define GEN8_IDRT_DW6_THREAD_GROUP_SIZE__SHIFT 0 #define GEN8_IDRT_DW7_CROSS_THREAD_CURBE_READ_LEN__MASK 0x000000ff @@ -280,6 +282,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define GEN8_GPGPU_DW1_IDRT_OFFSET__MASK 0x0000003f #define GEN8_GPGPU_DW1_IDRT_OFFSET__SHIFT 0 +#define GEN8_GPGPU_DW2_INDIRECT_LEN__MASK 0x0001ffff +#define GEN8_GPGPU_DW2_INDIRECT_LEN__SHIFT 0 #define GEN8_GPGPU_DW3_INDIRECT_ADDR__MASK 0xffffffe0 #define GEN8_GPGPU_DW3_INDIRECT_ADDR__SHIFT 5 diff --git a/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h index b5d09f64429..c180450ce27 100644 --- a/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h @@ -388,7 +388,7 @@ enum gen_surface_scs { #define GEN8_SURFACE_DW0_TILING__SHIFT 12 #define GEN8_SURFACE_DW0_VSTRIDE (0x1 << 11) #define GEN8_SURFACE_DW0_VSTRIDE_OFFSET (0x1 << 10) -#define GEN8_SURFACE_DW0_SAMPLER_L2_BYPASS_MODE (0x1 << 9) +#define GEN8_SURFACE_DW0_SAMPLER_L2_BYPASS_DISABLE (0x1 << 9) #define GEN7_SURFACE_DW0_RENDER_CACHE_RW (0x1 << 8) #define GEN7_SURFACE_DW0_MEDIA_BOUNDARY_PIXEL_MODE__MASK 0x000000c0 #define GEN7_SURFACE_DW0_MEDIA_BOUNDARY_PIXEL_MODE__SHIFT 6 @@ -402,6 +402,7 @@ enum gen_surface_scs { #define GEN8_SURFACE_DW1_BASE_LOD__SHIFT 19 #define GEN8_SURFACE_DW1_QPITCH__MASK 0x00007fff #define GEN8_SURFACE_DW1_QPITCH__SHIFT 0 +#define GEN8_SURFACE_DW1_QPITCH__SHR 2 #define GEN7_SURFACE_DW2_HEIGHT__MASK 0x3fff0000 #define GEN7_SURFACE_DW2_HEIGHT__SHIFT 16 @@ -434,7 +435,6 @@ enum gen_surface_scs { #define GEN8_SURFACE_DW4_MULTISAMPLECOUNT_2 (0x1 << 3) #define GEN7_SURFACE_DW4_MULTISAMPLECOUNT_4 (0x2 << 3) #define GEN7_SURFACE_DW4_MULTISAMPLECOUNT_8 (0x3 << 3) -#define GEN8_SURFACE_DW4_MULTISAMPLECOUNT_16 (0x4 << 3) #define GEN7_SURFACE_DW4_MSPOS_INDEX__MASK 0x00000007 #define GEN7_SURFACE_DW4_MSPOS_INDEX__SHIFT 0 #define GEN7_SURFACE_DW4_MIN_ARRAY_ELEMENT_STRBUF__MASK 0x07ffffff @@ -451,8 +451,11 @@ enum gen_surface_scs { #define GEN8_SURFACE_DW5_Y_OFFSET__MASK 0x00e00000 #define GEN8_SURFACE_DW5_Y_OFFSET__SHIFT 21 #define GEN8_SURFACE_DW5_Y_OFFSET__SHR 1 -#define 
GEN8_SURFACE_DW5_CUBE_EWA (0x1 << 20) -#define GEN8_SURFACE_DW5_COHERENCY_TYPE (0x1 << 14) +#define GEN8_SURFACE_DW5_CUBE_EWA_DISABLE (0x1 << 20) +#define GEN8_SURFACE_DW5_COHERENCY_TYPE__MASK 0x00004000 +#define GEN8_SURFACE_DW5_COHERENCY_TYPE__SHIFT 14 +#define GEN8_SURFACE_DW5_COHERENCY_TYPE_GPU (0x0 << 14) +#define GEN8_SURFACE_DW5_COHERENCY_TYPE_IA (0x1 << 14) #define GEN7_SURFACE_DW5_MIN_LOD__MASK 0x000000f0 #define GEN7_SURFACE_DW5_MIN_LOD__SHIFT 4 #define GEN7_SURFACE_DW5_MIP_COUNT_LOD__MASK 0x0000000f @@ -463,22 +466,23 @@ enum gen_surface_scs { #define GEN7_SURFACE_DW6_UV_X_OFFSET__SHIFT 16 #define GEN7_SURFACE_DW6_UV_Y_OFFSET__MASK 0x00003fff #define GEN7_SURFACE_DW6_UV_Y_OFFSET__SHIFT 0 +#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__MASK 0xffffffc0 +#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__SHIFT 6 +#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__SHR 6 #define GEN7_SURFACE_DW6_MCS_ADDR__MASK 0xfffff000 #define GEN7_SURFACE_DW6_MCS_ADDR__SHIFT 12 #define GEN7_SURFACE_DW6_MCS_ADDR__SHR 12 #define GEN8_SURFACE_DW6_AUX_QPITCH__MASK 0x7fff0000 #define GEN8_SURFACE_DW6_AUX_QPITCH__SHIFT 16 +#define GEN8_SURFACE_DW6_AUX_QPITCH__SHR 2 #define GEN7_SURFACE_DW6_AUX_PITCH__MASK 0x00000ff8 #define GEN7_SURFACE_DW6_AUX_PITCH__SHIFT 3 -#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__MASK 0xffffffc0 -#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__SHIFT 6 -#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__SHR 6 -#define GEN7_SURFACE_DW6_AUX_MODE__MASK 0x00000007 -#define GEN7_SURFACE_DW6_AUX_MODE__SHIFT 0 -#define GEN7_SURFACE_DW6_AUX_MODE_NONE 0x0 -#define GEN7_SURFACE_DW6_AUX_MODE_MCS 0x1 -#define GEN7_SURFACE_DW6_AUX_MODE_APPEND 0x2 -#define GEN8_SURFACE_DW6_AUX_MODE_HIZ 0x3 +#define GEN7_SURFACE_DW6_AUX__MASK 0x00000007 +#define GEN7_SURFACE_DW6_AUX__SHIFT 0 +#define GEN7_SURFACE_DW6_AUX_NONE 0x0 +#define GEN7_SURFACE_DW6_AUX_MCS 0x1 +#define GEN7_SURFACE_DW6_AUX_APPEND 0x2 +#define GEN8_SURFACE_DW6_AUX_HIZ 0x3 #define GEN7_SURFACE_DW7_CC_R__MASK 0x80000000 #define 
GEN7_SURFACE_DW7_CC_R__SHIFT 31 @@ -504,6 +508,12 @@ enum gen_surface_scs { +#define GEN8_SURFACE_DW11_V_X_OFFSET__MASK 0x3fff0000 +#define GEN8_SURFACE_DW11_V_X_OFFSET__SHIFT 16 +#define GEN8_SURFACE_DW11_V_Y_OFFSET__MASK 0x00003fff +#define GEN8_SURFACE_DW11_V_Y_OFFSET__SHIFT 0 +#define GEN8_SURFACE_DW11_AUX_ADDR_HI__MASK 0xffffffff +#define GEN8_SURFACE_DW11_AUX_ADDR_HI__SHIFT 0 diff --git a/src/gallium/drivers/ilo/ilo_common.h b/src/gallium/drivers/ilo/ilo_common.h index 3dbe79fb872..d3016590551 100644 --- a/src/gallium/drivers/ilo/ilo_common.h +++ b/src/gallium/drivers/ilo/ilo_common.h @@ -34,6 +34,7 @@ #include "util/list.h" #include "util/u_format.h" #include "util/u_inlines.h" +#include "util/u_memory.h" #include "util/u_pointer.h" #include "core/ilo_core.h" diff --git a/src/gallium/drivers/ilo/ilo_shader.c b/src/gallium/drivers/ilo/ilo_shader.c index 5f2b01017e2..73b625e9de4 100644 --- a/src/gallium/drivers/ilo/ilo_shader.c +++ b/src/gallium/drivers/ilo/ilo_shader.c @@ -987,15 +987,6 @@ ilo_shader_destroy(struct ilo_shader_state *shader) } /** - * Return the type (PIPE_SHADER_x) of the shader. - */ -int -ilo_shader_get_type(const struct ilo_shader_state *shader) -{ - return shader->info.type; -} - -/** * Select a kernel for the given context. This will compile a new kernel if * none of the existing kernels work with the context. 
* @@ -1257,9 +1248,6 @@ ilo_shader_get_kernel_param(const struct ilo_shader_state *shader, case ILO_KERNEL_SAMPLER_COUNT: val = shader->info.num_samplers; break; - case ILO_KERNEL_URB_DATA_START_REG: - val = kernel->in.start_grf; - break; case ILO_KERNEL_SKIP_CBUF0_UPLOAD: val = kernel->skip_cbuf0_upload; break; @@ -1311,9 +1299,6 @@ ilo_shader_get_kernel_param(const struct ilo_shader_state *shader, case ILO_KERNEL_VS_GEN6_SO: val = kernel->stream_output; break; - case ILO_KERNEL_VS_GEN6_SO_START_REG: - val = kernel->gs_start_grf; - break; case ILO_KERNEL_VS_GEN6_SO_POINT_OFFSET: val = kernel->gs_offsets[0]; break; @@ -1340,16 +1325,6 @@ ilo_shader_get_kernel_param(const struct ilo_shader_state *shader, val = kernel->bt.gen6_so_count; break; - case ILO_KERNEL_FS_INPUT_Z: - case ILO_KERNEL_FS_INPUT_W: - val = kernel->in.has_pos; - break; - case ILO_KERNEL_FS_OUTPUT_Z: - val = kernel->out.has_pos; - break; - case ILO_KERNEL_FS_USE_KILL: - val = kernel->has_kill; - break; case ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS: val = kernel->in.barycentric_interpolation_mode; break; diff --git a/src/gallium/drivers/ilo/ilo_shader.h b/src/gallium/drivers/ilo/ilo_shader.h index d9f02a4746a..01de54146b1 100644 --- a/src/gallium/drivers/ilo/ilo_shader.h +++ b/src/gallium/drivers/ilo/ilo_shader.h @@ -36,7 +36,6 @@ enum ilo_kernel_param { ILO_KERNEL_INPUT_COUNT, ILO_KERNEL_OUTPUT_COUNT, ILO_KERNEL_SAMPLER_COUNT, - ILO_KERNEL_URB_DATA_START_REG, ILO_KERNEL_SKIP_CBUF0_UPLOAD, ILO_KERNEL_PCB_CBUF0_SIZE, @@ -53,7 +52,6 @@ enum ilo_kernel_param { ILO_KERNEL_VS_INPUT_EDGEFLAG, ILO_KERNEL_VS_PCB_UCP_SIZE, ILO_KERNEL_VS_GEN6_SO, - ILO_KERNEL_VS_GEN6_SO_START_REG, ILO_KERNEL_VS_GEN6_SO_POINT_OFFSET, ILO_KERNEL_VS_GEN6_SO_LINE_OFFSET, ILO_KERNEL_VS_GEN6_SO_TRI_OFFSET, @@ -64,10 +62,6 @@ enum ilo_kernel_param { ILO_KERNEL_GS_GEN6_SURFACE_SO_BASE, ILO_KERNEL_GS_GEN6_SURFACE_SO_COUNT, - ILO_KERNEL_FS_INPUT_Z, - ILO_KERNEL_FS_INPUT_W, - ILO_KERNEL_FS_OUTPUT_Z, - ILO_KERNEL_FS_USE_KILL, 
ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS, ILO_KERNEL_FS_DISPATCH_16_OFFSET, ILO_KERNEL_FS_SURFACE_RT_BASE, @@ -149,9 +143,6 @@ ilo_shader_create_cs(const struct ilo_dev *dev, void ilo_shader_destroy(struct ilo_shader_state *shader); -int -ilo_shader_get_type(const struct ilo_shader_state *shader); - bool ilo_shader_select_kernel(struct ilo_shader_state *shader, const struct ilo_state_vector *vec, diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources index 9346ea3204d..c18e9f5b435 100644 --- a/src/gallium/drivers/nouveau/Makefile.sources +++ b/src/gallium/drivers/nouveau/Makefile.sources @@ -151,6 +151,15 @@ NVC0_C_SOURCES := \ nvc0/nvc0_program.c \ nvc0/nvc0_program.h \ nvc0/nvc0_query.c \ + nvc0/nvc0_query.h \ + nvc0/nvc0_query_hw.c \ + nvc0/nvc0_query_hw.h \ + nvc0/nvc0_query_hw_metric.c \ + nvc0/nvc0_query_hw_metric.h \ + nvc0/nvc0_query_hw_sm.c \ + nvc0/nvc0_query_hw_sm.h \ + nvc0/nvc0_query_sw.c \ + nvc0/nvc0_query_sw.h \ nvc0/nvc0_resource.c \ nvc0/nvc0_resource.h \ nvc0/nvc0_screen.c \ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index 400b9f09e51..7859c8e79bd 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -25,10 +25,24 @@ #include <stack> #include <limits> +#if __cplusplus >= 201103L +#include <unordered_map> +#else #include <tr1/unordered_map> +#endif namespace nv50_ir { +#if __cplusplus >= 201103L +using std::hash; +using std::unordered_map; +#elif !defined(ANDROID) +using std::tr1::hash; +using std::tr1::unordered_map; +#else +#error Android release before Lollipop is not supported! 
+#endif + #define MAX_REGISTER_FILE_SIZE 256 class RegisterSet @@ -349,12 +363,12 @@ RegAlloc::PhiMovesPass::needNewElseBlock(BasicBlock *b, BasicBlock *p) struct PhiMapHash { size_t operator()(const std::pair<Instruction *, BasicBlock *>& val) const { - return std::tr1::hash<Instruction*>()(val.first) * 31 + - std::tr1::hash<BasicBlock*>()(val.second); + return hash<Instruction*>()(val.first) * 31 + + hash<BasicBlock*>()(val.second); } }; -typedef std::tr1::unordered_map< +typedef unordered_map< std::pair<Instruction *, BasicBlock *>, Value *, PhiMapHash> PhiMap; // Critical edges need to be split up so that work can be inserted along diff --git a/src/gallium/drivers/nouveau/nouveau_fence.c b/src/gallium/drivers/nouveau/nouveau_fence.c index ee4e08dd520..21cf2b9ae5e 100644 --- a/src/gallium/drivers/nouveau/nouveau_fence.c +++ b/src/gallium/drivers/nouveau/nouveau_fence.c @@ -190,8 +190,14 @@ nouveau_fence_wait(struct nouveau_fence *fence) /* wtf, someone is waiting on a fence in flush_notify handler? */ assert(fence->state != NOUVEAU_FENCE_STATE_EMITTING); - if (fence->state < NOUVEAU_FENCE_STATE_EMITTED) - nouveau_fence_emit(fence); + if (fence->state < NOUVEAU_FENCE_STATE_EMITTED) { + PUSH_SPACE(screen->pushbuf, 8); + /* The space allocation might trigger a flush, which could emit the + * current fence. So check again. 
+ */ + if (fence->state < NOUVEAU_FENCE_STATE_EMITTED) + nouveau_fence_emit(fence); + } if (fence->state < NOUVEAU_FENCE_STATE_FLUSHED) if (nouveau_pushbuf_kick(screen->pushbuf, screen->pushbuf->channel)) @@ -224,8 +230,12 @@ nouveau_fence_wait(struct nouveau_fence *fence) void nouveau_fence_next(struct nouveau_screen *screen) { - if (screen->fence.current->state < NOUVEAU_FENCE_STATE_EMITTING) - nouveau_fence_emit(screen->fence.current); + if (screen->fence.current->state < NOUVEAU_FENCE_STATE_EMITTING) { + if (screen->fence.current->ref > 1) + nouveau_fence_emit(screen->fence.current); + else + return; + } nouveau_fence_ref(NULL, &screen->fence.current); diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c index 5757eb1fb16..dbbb8baad79 100644 --- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c +++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c @@ -1,3 +1,4 @@ +#include <strings.h> #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_state.h" diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c index fdde11f4cd5..941555ffbf8 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c @@ -65,14 +65,9 @@ nv50_constbufs_validate(struct nv50_context *nv50) PUSH_DATA (push, (b << 12) | (i << 8) | p | 1); } while (words) { - unsigned nr; - - if (!PUSH_SPACE(push, 16)) - break; - nr = PUSH_AVAIL(push); - assert(nr >= 16); - nr = MIN2(MIN2(nr - 3, words), NV04_PFIFO_MAX_PACKET_LEN); + unsigned nr = MIN2(words, NV04_PFIFO_MAX_PACKET_LEN); + PUSH_SPACE(push, nr + 3); BEGIN_NV04(push, NV50_3D(CB_ADDR), 1); PUSH_DATA (push, (start << 8) | b); BEGIN_NI04(push, NV50_3D(CB_DATA(0)), nr); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_transfer.c b/src/gallium/drivers/nouveau/nv50/nv50_transfer.c index be514077d32..9a3fd1e705f 100644 --- 
a/src/gallium/drivers/nouveau/nv50/nv50_transfer.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_transfer.c @@ -187,14 +187,7 @@ nv50_sifc_linear_u8(struct nouveau_context *nv, PUSH_DATA (push, 0); while (count) { - unsigned nr; - - if (!PUSH_SPACE(push, 16)) - break; - nr = PUSH_AVAIL(push); - assert(nr >= 16); - nr = MIN2(count, nr - 1); - nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN); + unsigned nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN); BEGIN_NI04(push, NV50_2D(SIFC_DATA), nr); PUSH_DATAp(push, src, nr); @@ -395,12 +388,9 @@ nv50_cb_push(struct nouveau_context *nv, nouveau_pushbuf_validate(push); while (words) { - unsigned nr; - - nr = PUSH_AVAIL(push); - nr = MIN2(nr - 7, words); - nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN - 1); + unsigned nr = MIN2(words, NV04_PFIFO_MAX_PACKET_LEN); + PUSH_SPACE(push, nr + 7); BEGIN_NV04(push, NV50_3D(CB_DEF_ADDRESS_HIGH), 3); PUSH_DATAh(push, bo->offset + base); PUSH_DATA (push, bo->offset + base); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c index 47bd123621b..e33af042620 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c @@ -37,12 +37,9 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen, switch (dev->chipset & ~0xf) { case 0xc0: - if (dev->chipset == 0xc8) - obj_class = NVC8_COMPUTE_CLASS; - else - obj_class = NVC0_COMPUTE_CLASS; - break; case 0xd0: + /* In theory, GF110+ should also support NVC8_COMPUTE_CLASS but, + * in practice, a ILLEGAL_CLASS dmesg fail appears when using it. 
*/ obj_class = NVC0_COMPUTE_CLASS; break; default: @@ -108,14 +105,6 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen, PUSH_DATAh(push, screen->text->offset); PUSH_DATA (push, screen->text->offset); - /* bind parameters buffer */ - BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3); - PUSH_DATA (push, screen->parm->size); - PUSH_DATAh(push, screen->parm->offset); - PUSH_DATA (push, screen->parm->offset); - BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1); - PUSH_DATA (push, (0 << 8) | 1); - /* TODO: textures & samplers */ return 0; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index 30bee3a0f8c..4af83c53224 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -15,6 +15,7 @@ #include "nvc0/nvc0_screen.h" #include "nvc0/nvc0_program.h" #include "nvc0/nvc0_resource.h" +#include "nvc0/nvc0_query.h" #include "nv50/nv50_transfer.h" @@ -231,17 +232,6 @@ uint32_t nvc0_program_symbol_offset(const struct nvc0_program *, uint32_t label); void nvc0_program_init_tcp_empty(struct nvc0_context *); -/* nvc0_query.c */ -void nvc0_init_query_functions(struct nvc0_context *); -void nvc0_query_pushbuf_submit(struct nouveau_pushbuf *, - struct pipe_query *, unsigned result_offset); -void nvc0_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *); -void nvc0_so_target_save_offset(struct pipe_context *, - struct pipe_stream_output_target *, unsigned i, - bool *serialize); - -#define NVC0_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) - /* nvc0_shader_state.c */ void nvc0_vertprog_validate(struct nvc0_context *); void nvc0_tctlprog_validate(struct nvc0_context *); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index b13df6a9485..e4752e2dbc5 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -25,519 +25,51 @@ #define 
NVC0_PUSH_EXPLICIT_SPACE_CHECKING #include "nvc0/nvc0_context.h" -#include "nv_object.xml.h" -#include "nvc0/nve4_compute.xml.h" -#include "nvc0/nvc0_compute.xml.h" - -#define NVC0_QUERY_STATE_READY 0 -#define NVC0_QUERY_STATE_ACTIVE 1 -#define NVC0_QUERY_STATE_ENDED 2 -#define NVC0_QUERY_STATE_FLUSHED 3 - -struct nvc0_query { - uint32_t *data; - uint16_t type; - uint16_t index; - int8_t ctr[4]; - uint32_t sequence; - struct nouveau_bo *bo; - uint32_t base; - uint32_t offset; /* base + i * rotate */ - uint8_t state; - bool is64bit; - uint8_t rotate; - int nesting; /* only used for occlusion queries */ - union { - struct nouveau_mm_allocation *mm; - uint64_t value; - } u; - struct nouveau_fence *fence; -}; - -#define NVC0_QUERY_ALLOC_SPACE 256 - -static boolean nvc0_hw_sm_query_begin(struct nvc0_context *, - struct nvc0_query *); -static void nvc0_hw_sm_query_end(struct nvc0_context *, struct nvc0_query *); -static boolean nvc0_hw_sm_query_result(struct nvc0_context *, - struct nvc0_query *, void *, boolean); - -static inline struct nvc0_query * -nvc0_query(struct pipe_query *pipe) -{ - return (struct nvc0_query *)pipe; -} - -static bool -nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size) -{ - struct nvc0_screen *screen = nvc0->screen; - int ret; - - if (q->bo) { - nouveau_bo_ref(NULL, &q->bo); - if (q->u.mm) { - if (q->state == NVC0_QUERY_STATE_READY) - nouveau_mm_free(q->u.mm); - else - nouveau_fence_work(screen->base.fence.current, - nouveau_mm_free_work, q->u.mm); - } - } - if (size) { - q->u.mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base); - if (!q->bo) - return false; - q->offset = q->base; - - ret = nouveau_bo_map(q->bo, 0, screen->base.client); - if (ret) { - nvc0_query_allocate(nvc0, q, 0); - return false; - } - q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base); - } - return true; -} - -static void -nvc0_query_destroy(struct pipe_context *pipe, struct pipe_query *pq) -{ - 
nvc0_query_allocate(nvc0_context(pipe), nvc0_query(pq), 0); - nouveau_fence_ref(NULL, &nvc0_query(pq)->fence); - FREE(nvc0_query(pq)); -} +#include "nvc0/nvc0_query.h" +#include "nvc0/nvc0_query_sw.h" +#include "nvc0/nvc0_query_hw.h" +#include "nvc0/nvc0_query_hw_sm.h" static struct pipe_query * -nvc0_query_create(struct pipe_context *pipe, unsigned type, unsigned index) +nvc0_create_query(struct pipe_context *pipe, unsigned type, unsigned index) { struct nvc0_context *nvc0 = nvc0_context(pipe); struct nvc0_query *q; - unsigned space = NVC0_QUERY_ALLOC_SPACE; - q = CALLOC_STRUCT(nvc0_query); + q = nvc0_sw_create_query(nvc0, type, index); if (!q) - return NULL; - - switch (type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - q->rotate = 32; - space = NVC0_QUERY_ALLOC_SPACE; - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - q->is64bit = true; - space = 512; - break; - case PIPE_QUERY_SO_STATISTICS: - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - q->is64bit = true; - space = 64; - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_PRIMITIVES_EMITTED: - q->is64bit = true; - q->index = index; - space = 32; - break; - case PIPE_QUERY_TIME_ELAPSED: - case PIPE_QUERY_TIMESTAMP: - case PIPE_QUERY_TIMESTAMP_DISJOINT: - case PIPE_QUERY_GPU_FINISHED: - space = 32; - break; - case NVC0_QUERY_TFB_BUFFER_OFFSET: - space = 16; - break; - default: -#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - if (type >= NVC0_QUERY_DRV_STAT(0) && type <= NVC0_QUERY_DRV_STAT_LAST) { - space = 0; - q->is64bit = true; - q->index = type - NVC0_QUERY_DRV_STAT(0); - break; - } else -#endif - if (nvc0->screen->base.device->drm_version >= 0x01000101) { - if (type >= NVE4_HW_SM_QUERY(0) && type <= NVE4_HW_SM_QUERY_LAST) { - /* for each MP: - * [00] = WS0.C0 - * [04] = WS0.C1 - * [08] = WS0.C2 - * [0c] = WS0.C3 - * [10] = WS1.C0 - * [14] = WS1.C1 - * [18] = WS1.C2 - * [1c] = WS1.C3 - * [20] = WS2.C0 - * [24] = WS2.C1 - * [28] = WS2.C2 - * [2c] = WS2.C3 - * [30] = WS3.C0 
- * [34] = WS3.C1 - * [38] = WS3.C2 - * [3c] = WS3.C3 - * [40] = MP.C4 - * [44] = MP.C5 - * [48] = MP.C6 - * [4c] = MP.C7 - * [50] = WS0.sequence - * [54] = WS1.sequence - * [58] = WS2.sequence - * [5c] = WS3.sequence - */ - space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t); - break; - } else - if (type >= NVC0_HW_SM_QUERY(0) && type <= NVC0_HW_SM_QUERY_LAST) { - /* for each MP: - * [00] = MP.C0 - * [04] = MP.C1 - * [08] = MP.C2 - * [0c] = MP.C3 - * [10] = MP.C4 - * [14] = MP.C5 - * [18] = MP.C6 - * [1c] = MP.C7 - * [20] = MP.sequence - */ - space = (8 + 1) * nvc0->screen->mp_count * sizeof(uint32_t); - break; - } - } - debug_printf("invalid query type: %u\n", type); - FREE(q); - return NULL; - } - if (!nvc0_query_allocate(nvc0, q, space)) { - FREE(q); - return NULL; - } - - q->type = type; - - if (q->rotate) { - /* we advance before query_begin ! */ - q->offset -= q->rotate; - q->data -= q->rotate / sizeof(*q->data); - } else - if (!q->is64bit) - q->data[0] = 0; /* initialize sequence */ + q = nvc0_hw_create_query(nvc0, type, index); return (struct pipe_query *)q; } static void -nvc0_query_get(struct nouveau_pushbuf *push, struct nvc0_query *q, - unsigned offset, uint32_t get) +nvc0_destroy_query(struct pipe_context *pipe, struct pipe_query *pq) { - offset += q->offset; - - PUSH_SPACE(push, 5); - PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR); - BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 4); - PUSH_DATAh(push, q->bo->offset + offset); - PUSH_DATA (push, q->bo->offset + offset); - PUSH_DATA (push, q->sequence); - PUSH_DATA (push, get); -} - -static void -nvc0_query_rotate(struct nvc0_context *nvc0, struct nvc0_query *q) -{ - q->offset += q->rotate; - q->data += q->rotate / sizeof(*q->data); - if (q->offset - q->base == NVC0_QUERY_ALLOC_SPACE) - nvc0_query_allocate(nvc0, q, NVC0_QUERY_ALLOC_SPACE); + struct nvc0_query *q = nvc0_query(pq); + q->funcs->destroy_query(nvc0_context(pipe), q); } static boolean -nvc0_query_begin(struct 
pipe_context *pipe, struct pipe_query *pq) +nvc0_begin_query(struct pipe_context *pipe, struct pipe_query *pq) { - struct nvc0_context *nvc0 = nvc0_context(pipe); - struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_query *q = nvc0_query(pq); - bool ret = true; - - /* For occlusion queries we have to change the storage, because a previous - * query might set the initial render conition to false even *after* we re- - * initialized it to true. - */ - if (q->rotate) { - nvc0_query_rotate(nvc0, q); - - /* XXX: can we do this with the GPU, and sync with respect to a previous - * query ? - */ - q->data[0] = q->sequence; /* initialize sequence */ - q->data[1] = 1; /* initial render condition = true */ - q->data[4] = q->sequence + 1; /* for comparison COND_MODE */ - q->data[5] = 0; - } - q->sequence++; - - switch (q->type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - q->nesting = nvc0->screen->num_occlusion_queries_active++; - if (q->nesting) { - nvc0_query_get(push, q, 0x10, 0x0100f002); - } else { - PUSH_SPACE(push, 3); - BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1); - PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT); - IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1); - } - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - nvc0_query_get(push, q, 0x10, 0x09005002 | (q->index << 5)); - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - nvc0_query_get(push, q, 0x10, 0x05805002 | (q->index << 5)); - break; - case PIPE_QUERY_SO_STATISTICS: - nvc0_query_get(push, q, 0x20, 0x05805002 | (q->index << 5)); - nvc0_query_get(push, q, 0x30, 0x06805002 | (q->index << 5)); - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - nvc0_query_get(push, q, 0x10, 0x03005002 | (q->index << 5)); - break; - case PIPE_QUERY_TIME_ELAPSED: - nvc0_query_get(push, q, 0x10, 0x00005002); - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - nvc0_query_get(push, q, 0xc0 + 0x00, 0x00801002); /* VFETCH, VERTICES */ - nvc0_query_get(push, q, 0xc0 + 0x10, 0x01801002); /* 
VFETCH, PRIMS */ - nvc0_query_get(push, q, 0xc0 + 0x20, 0x02802002); /* VP, LAUNCHES */ - nvc0_query_get(push, q, 0xc0 + 0x30, 0x03806002); /* GP, LAUNCHES */ - nvc0_query_get(push, q, 0xc0 + 0x40, 0x04806002); /* GP, PRIMS_OUT */ - nvc0_query_get(push, q, 0xc0 + 0x50, 0x07804002); /* RAST, PRIMS_IN */ - nvc0_query_get(push, q, 0xc0 + 0x60, 0x08804002); /* RAST, PRIMS_OUT */ - nvc0_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */ - nvc0_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */ - nvc0_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */ - break; - default: -#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - if (q->type >= NVC0_QUERY_DRV_STAT(0) && - q->type <= NVC0_QUERY_DRV_STAT_LAST) { - if (q->index >= 5) - q->u.value = nvc0->screen->base.stats.v[q->index]; - else - q->u.value = 0; - } else -#endif - if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) || - (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) { - ret = nvc0_hw_sm_query_begin(nvc0, q); - } - break; - } - q->state = NVC0_QUERY_STATE_ACTIVE; - return ret; + return q->funcs->begin_query(nvc0_context(pipe), q); } static void -nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq) +nvc0_end_query(struct pipe_context *pipe, struct pipe_query *pq) { - struct nvc0_context *nvc0 = nvc0_context(pipe); - struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_query *q = nvc0_query(pq); - - if (q->state != NVC0_QUERY_STATE_ACTIVE) { - /* some queries don't require 'begin' to be called (e.g. 
GPU_FINISHED) */ - if (q->rotate) - nvc0_query_rotate(nvc0, q); - q->sequence++; - } - q->state = NVC0_QUERY_STATE_ENDED; - - switch (q->type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - nvc0_query_get(push, q, 0, 0x0100f002); - if (--nvc0->screen->num_occlusion_queries_active == 0) { - PUSH_SPACE(push, 1); - IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0); - } - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - nvc0_query_get(push, q, 0, 0x09005002 | (q->index << 5)); - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - nvc0_query_get(push, q, 0, 0x05805002 | (q->index << 5)); - break; - case PIPE_QUERY_SO_STATISTICS: - nvc0_query_get(push, q, 0x00, 0x05805002 | (q->index << 5)); - nvc0_query_get(push, q, 0x10, 0x06805002 | (q->index << 5)); - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - /* TODO: How do we sum over all streams for render condition ? */ - /* PRIMS_DROPPED doesn't write sequence, use a ZERO query to sync on */ - nvc0_query_get(push, q, 0x00, 0x03005002 | (q->index << 5)); - nvc0_query_get(push, q, 0x20, 0x00005002); - break; - case PIPE_QUERY_TIMESTAMP: - case PIPE_QUERY_TIME_ELAPSED: - nvc0_query_get(push, q, 0, 0x00005002); - break; - case PIPE_QUERY_GPU_FINISHED: - nvc0_query_get(push, q, 0, 0x1000f010); - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - nvc0_query_get(push, q, 0x00, 0x00801002); /* VFETCH, VERTICES */ - nvc0_query_get(push, q, 0x10, 0x01801002); /* VFETCH, PRIMS */ - nvc0_query_get(push, q, 0x20, 0x02802002); /* VP, LAUNCHES */ - nvc0_query_get(push, q, 0x30, 0x03806002); /* GP, LAUNCHES */ - nvc0_query_get(push, q, 0x40, 0x04806002); /* GP, PRIMS_OUT */ - nvc0_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */ - nvc0_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */ - nvc0_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */ - nvc0_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */ - nvc0_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */ - break; - case 
NVC0_QUERY_TFB_BUFFER_OFFSET: - /* indexed by TFB buffer instead of by vertex stream */ - nvc0_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5)); - break; - case PIPE_QUERY_TIMESTAMP_DISJOINT: - /* This query is not issued on GPU because disjoint is forced to false */ - q->state = NVC0_QUERY_STATE_READY; - break; - default: -#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - if (q->type >= NVC0_QUERY_DRV_STAT(0) && - q->type <= NVC0_QUERY_DRV_STAT_LAST) { - q->u.value = nvc0->screen->base.stats.v[q->index] - q->u.value; - return; - } else -#endif - if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) || - (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) { - nvc0_hw_sm_query_end(nvc0, q); - } - break; - } - if (q->is64bit) - nouveau_fence_ref(nvc0->screen->base.fence.current, &q->fence); -} - -static inline void -nvc0_query_update(struct nouveau_client *cli, struct nvc0_query *q) -{ - if (q->is64bit) { - if (nouveau_fence_signalled(q->fence)) - q->state = NVC0_QUERY_STATE_READY; - } else { - if (q->data[0] == q->sequence) - q->state = NVC0_QUERY_STATE_READY; - } + q->funcs->end_query(nvc0_context(pipe), q); } static boolean -nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, - boolean wait, union pipe_query_result *result) +nvc0_get_query_result(struct pipe_context *pipe, struct pipe_query *pq, + boolean wait, union pipe_query_result *result) { - struct nvc0_context *nvc0 = nvc0_context(pipe); struct nvc0_query *q = nvc0_query(pq); - uint64_t *res64 = (uint64_t*)result; - uint32_t *res32 = (uint32_t*)result; - uint8_t *res8 = (uint8_t*)result; - uint64_t *data64 = (uint64_t *)q->data; - unsigned i; - -#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - if (q->type >= NVC0_QUERY_DRV_STAT(0) && - q->type <= NVC0_QUERY_DRV_STAT_LAST) { - res64[0] = q->u.value; - return true; - } else -#endif - if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) || - (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= 
NVC0_HW_SM_QUERY_LAST)) { - return nvc0_hw_sm_query_result(nvc0, q, result, wait); - } - - if (q->state != NVC0_QUERY_STATE_READY) - nvc0_query_update(nvc0->screen->base.client, q); - - if (q->state != NVC0_QUERY_STATE_READY) { - if (!wait) { - if (q->state != NVC0_QUERY_STATE_FLUSHED) { - q->state = NVC0_QUERY_STATE_FLUSHED; - /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */ - PUSH_KICK(nvc0->base.pushbuf); - } - return false; - } - if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->screen->base.client)) - return false; - NOUVEAU_DRV_STAT(&nvc0->screen->base, query_sync_count, 1); - } - q->state = NVC0_QUERY_STATE_READY; - - switch (q->type) { - case PIPE_QUERY_GPU_FINISHED: - res8[0] = true; - break; - case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */ - res64[0] = q->data[1] - q->data[5]; - break; - case PIPE_QUERY_OCCLUSION_PREDICATE: - res8[0] = q->data[1] != q->data[5]; - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */ - case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */ - res64[0] = data64[0] - data64[2]; - break; - case PIPE_QUERY_SO_STATISTICS: - res64[0] = data64[0] - data64[4]; - res64[1] = data64[2] - data64[6]; - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - res8[0] = data64[0] != data64[2]; - break; - case PIPE_QUERY_TIMESTAMP: - res64[0] = data64[1]; - break; - case PIPE_QUERY_TIMESTAMP_DISJOINT: - res64[0] = 1000000000; - res8[8] = false; - break; - case PIPE_QUERY_TIME_ELAPSED: - res64[0] = data64[1] - data64[3]; - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - for (i = 0; i < 10; ++i) - res64[i] = data64[i * 2] - data64[24 + i * 2]; - break; - case NVC0_QUERY_TFB_BUFFER_OFFSET: - res32[0] = q->data[1]; - break; - default: - assert(0); /* can't happen, we don't create queries with invalid type */ - return false; - } - - return true; -} - -void -nvc0_query_fifo_wait(struct nouveau_pushbuf *push, struct pipe_query *pq) -{ - struct nvc0_query *q = nvc0_query(pq); - unsigned 
offset = q->offset; - - if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) offset += 0x20; - - PUSH_SPACE(push, 5); - PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD); - BEGIN_NVC0(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4); - PUSH_DATAh(push, q->bo->offset + offset); - PUSH_DATA (push, q->bo->offset + offset); - PUSH_DATA (push, q->sequence); - PUSH_DATA (push, (1 << 12) | - NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL); + return q->funcs->get_query_result(nvc0_context(pipe), q, wait, result); } static void @@ -547,7 +79,8 @@ nvc0_render_condition(struct pipe_context *pipe, { struct nvc0_context *nvc0 = nvc0_context(pipe); struct nouveau_pushbuf *push = nvc0->base.pushbuf; - struct nvc0_query *q; + struct nvc0_query *q = nvc0_query(pq); + struct nvc0_hw_query *hq = nvc0_hw_query(q); uint32_t cond; bool wait = mode != PIPE_RENDER_COND_NO_WAIT && @@ -557,7 +90,6 @@ nvc0_render_condition(struct pipe_context *pipe, cond = NVC0_3D_COND_MODE_ALWAYS; } else { - q = nvc0_query(pq); /* NOTE: comparison of 2 queries only works if both have completed */ switch (q->type) { case PIPE_QUERY_SO_OVERFLOW_PREDICATE: @@ -568,7 +100,7 @@ nvc0_render_condition(struct pipe_context *pipe, case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: if (likely(!condition)) { - if (unlikely(q->nesting)) + if (unlikely(hq->nesting)) cond = wait ? 
NVC0_3D_COND_MODE_NOT_EQUAL : NVC0_3D_COND_MODE_ALWAYS; else @@ -596,805 +128,17 @@ nvc0_render_condition(struct pipe_context *pipe, } if (wait) - nvc0_query_fifo_wait(push, pq); + nvc0_hw_query_fifo_wait(push, q); PUSH_SPACE(push, 7); - PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD); + PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD); BEGIN_NVC0(push, NVC0_3D(COND_ADDRESS_HIGH), 3); - PUSH_DATAh(push, q->bo->offset + q->offset); - PUSH_DATA (push, q->bo->offset + q->offset); + PUSH_DATAh(push, hq->bo->offset + hq->offset); + PUSH_DATA (push, hq->bo->offset + hq->offset); PUSH_DATA (push, cond); BEGIN_NVC0(push, NVC0_2D(COND_ADDRESS_HIGH), 2); - PUSH_DATAh(push, q->bo->offset + q->offset); - PUSH_DATA (push, q->bo->offset + q->offset); -} - -void -nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push, - struct pipe_query *pq, unsigned result_offset) -{ - struct nvc0_query *q = nvc0_query(pq); - -#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8)) - - PUSH_REFN(push, q->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART); - nouveau_pushbuf_space(push, 0, 0, 1); - nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 | - NVC0_IB_ENTRY_1_NO_PREFETCH); -} - -void -nvc0_so_target_save_offset(struct pipe_context *pipe, - struct pipe_stream_output_target *ptarg, - unsigned index, bool *serialize) -{ - struct nvc0_so_target *targ = nvc0_so_target(ptarg); - - if (*serialize) { - *serialize = false; - PUSH_SPACE(nvc0_context(pipe)->base.pushbuf, 1); - IMMED_NVC0(nvc0_context(pipe)->base.pushbuf, NVC0_3D(SERIALIZE), 0); - - NOUVEAU_DRV_STAT(nouveau_screen(pipe->screen), gpu_serialize_count, 1); - } - - nvc0_query(targ->pq)->index = index; - - nvc0_query_end(pipe, targ->pq); -} - - -/* === DRIVER STATISTICS === */ - -#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - -static const char *nvc0_drv_stat_names[] = -{ - "drv-tex_obj_current_count", - "drv-tex_obj_current_bytes", - "drv-buf_obj_current_count", - "drv-buf_obj_current_bytes_vid", - 
"drv-buf_obj_current_bytes_sys", - "drv-tex_transfers_rd", - "drv-tex_transfers_wr", - "drv-tex_copy_count", - "drv-tex_blit_count", - "drv-tex_cache_flush_count", - "drv-buf_transfers_rd", - "drv-buf_transfers_wr", - "drv-buf_read_bytes_staging_vid", - "drv-buf_write_bytes_direct", - "drv-buf_write_bytes_staging_vid", - "drv-buf_write_bytes_staging_sys", - "drv-buf_copy_bytes", - "drv-buf_non_kernel_fence_sync_count", - "drv-any_non_kernel_fence_sync_count", - "drv-query_sync_count", - "drv-gpu_serialize_count", - "drv-draw_calls_array", - "drv-draw_calls_indexed", - "drv-draw_calls_fallback_count", - "drv-user_buffer_upload_bytes", - "drv-constbuf_upload_count", - "drv-constbuf_upload_bytes", - "drv-pushbuf_count", - "drv-resource_validate_count" -}; - -#endif /* NOUVEAU_ENABLE_DRIVER_STATISTICS */ - - -/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ - -/* Code to read out MP counters: They are accessible via mmio, too, but let's - * just avoid mapping registers in userspace. We'd have to know which MPs are - * enabled/present, too, and that information is not presently exposed. - * We could add a kernel interface for it, but reading the counters like this - * has the advantage of being async (if get_result isn't called immediately). 
- */ -static const uint64_t nve4_read_hw_sm_counters_code[] = -{ - /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20 - * mov b32 $r8 $tidx - * mov b32 $r12 $physid - * mov b32 $r0 $pm0 - * mov b32 $r1 $pm1 - * mov b32 $r2 $pm2 - * mov b32 $r3 $pm3 - * mov b32 $r4 $pm4 - * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b - * mov b32 $r5 $pm5 - * mov b32 $r6 $pm6 - * mov b32 $r7 $pm7 - * set $p0 0x1 eq u32 $r8 0x0 - * mov b32 $r10 c0[0x0] - * ext u32 $r8 $r12 0x414 - * mov b32 $r11 c0[0x4] - * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04 - * ext u32 $r9 $r12 0x208 - * (not $p0) exit - * set $p1 0x1 eq u32 $r9 0x0 - * mul $r8 u32 $r8 u32 96 - * mul $r12 u32 $r9 u32 16 - * mul $r13 u32 $r9 u32 4 - * add b32 $r9 $r8 $r13 - * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c - * add b32 $r8 $r8 $r12 - * mov b32 $r12 $r10 - * add b32 $r10 $c $r10 $r8 - * mov b32 $r13 $r11 - * add b32 $r11 $r11 0x0 $c - * add b32 $r12 $c $r12 $r9 - * st b128 wt g[$r10d] $r0q - * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00 - * mov b32 $r0 c0[0x8] - * add b32 $r13 $r13 0x0 $c - * $p1 st b128 wt g[$r12d+0x40] $r4q - * st b32 wt g[$r12d+0x50] $r0 - * exit */ - 0x2202020202020207ULL, - 0x2c00000084021c04ULL, - 0x2c0000000c031c04ULL, - 0x2c00000010001c04ULL, - 0x2c00000014005c04ULL, - 0x2c00000018009c04ULL, - 0x2c0000001c00dc04ULL, - 0x2c00000020011c04ULL, - 0x22b0420042320207ULL, - 0x2c00000024015c04ULL, - 0x2c00000028019c04ULL, - 0x2c0000002c01dc04ULL, - 0x190e0000fc81dc03ULL, - 0x2800400000029de4ULL, - 0x7000c01050c21c03ULL, - 0x280040001002dde4ULL, - 0x204282020042e047ULL, - 0x7000c00820c25c03ULL, - 0x80000000000021e7ULL, - 0x190e0000fc93dc03ULL, - 0x1000000180821c02ULL, - 0x1000000040931c02ULL, - 0x1000000010935c02ULL, - 0x4800000034825c03ULL, - 0x22c042c042c04287ULL, - 0x4800000030821c03ULL, - 0x2800000028031de4ULL, - 0x4801000020a29c03ULL, - 0x280000002c035de4ULL, - 0x0800000000b2dc42ULL, - 0x4801000024c31c03ULL, - 0x9400000000a01fc5ULL, - 0x200002e04202c047ULL, - 0x2800400020001de4ULL, - 0x0800000000d35c42ULL, - 
0x9400000100c107c5ULL, - 0x9400000140c01f85ULL, - 0x8000000000001de7ULL -}; - -/* NOTE: intentionally using the same names as NV */ -static const char *nve4_pm_query_names[] = -{ - /* MP counters */ - "active_cycles", - "active_warps", - "atom_count", - "branch", - "divergent_branch", - "gld_request", - "global_ld_mem_divergence_replays", - "global_store_transaction", - "global_st_mem_divergence_replays", - "gred_count", - "gst_request", - "inst_executed", - "inst_issued", - "inst_issued1", - "inst_issued2", - "l1_global_load_hit", - "l1_global_load_miss", - "l1_local_load_hit", - "l1_local_load_miss", - "l1_local_store_hit", - "l1_local_store_miss", - "l1_shared_load_transactions", - "l1_shared_store_transactions", - "local_load", - "local_load_transactions", - "local_store", - "local_store_transactions", - "prof_trigger_00", - "prof_trigger_01", - "prof_trigger_02", - "prof_trigger_03", - "prof_trigger_04", - "prof_trigger_05", - "prof_trigger_06", - "prof_trigger_07", - "shared_load", - "shared_load_replay", - "shared_store", - "shared_store_replay", - "sm_cta_launched", - "threads_launched", - "uncached_global_load_transaction", - "warps_launched", - /* metrics, i.e. functions of the MP counters */ - "metric-ipc", /* inst_executed, clock */ - "metric-ipac", /* inst_executed, active_cycles */ - "metric-ipec", /* inst_executed, (bool)inst_executed */ - "metric-achieved_occupancy", /* active_warps, active_cycles */ - "metric-sm_efficiency", /* active_cycles, clock */ - "metric-inst_replay_overhead" /* inst_issued, inst_executed */ -}; - -/* For simplicity, we will allocate as many group slots as we allocate counter - * slots. This means that a single counter which wants to source from 2 groups - * will have to be declared as using 2 counter slots. This shouldn't really be - * a problem because such queries don't make much sense ... (unless someone is - * really creative). 
- */ -struct nvc0_mp_counter_cfg -{ - uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */ - uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */ - uint32_t num_src : 3; /* number of sources (1 - 6, only for NVC0:NVE4) */ - uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */ - uint32_t sig_sel : 8; /* signal group */ - uint64_t src_sel; /* signal selection for up to 6 sources (48 bit) */ -}; - -#define NVC0_COUNTER_OPn_SUM 0 -#define NVC0_COUNTER_OPn_OR 1 -#define NVC0_COUNTER_OPn_AND 2 -#define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */ -#define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0]) */ -#define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */ -#define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0]) */ - -struct nvc0_hw_sm_query_cfg -{ - struct nvc0_mp_counter_cfg ctr[4]; - uint8_t num_counters; - uint8_t op; - uint8_t norm[2]; /* normalization num,denom */ -}; - -#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } -#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } -#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ - { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \ - { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \ - {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } -#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ - { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \ - { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \ - {}, 
{}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } -#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ - { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \ - { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \ - {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } - -/* NOTES: - * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps - * inst_executed etc.: we only count a single warp scheduler - * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers; - * this is inaccurate ! - */ -static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = -{ - _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1), - _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1), - _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1), - _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1), - _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1), - _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1), - _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1), - _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1), - _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1), - _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1), - _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1), - _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1), - _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1), - _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1), - _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1), - _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1), - _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1), - _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1), - _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1), - _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1), - _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1), - _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 
0x00000008, 1, 1), - _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1), - _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1), - _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1), - _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1), - _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1), - _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1), - _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1), - _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1), - _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1), - _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1), - _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1), - _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1), - _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1), - _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1), - _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1), - _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1), - _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1), - _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1), - _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1), - _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1), - _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1), - _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1), - _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1), - _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1), - _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1), - _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64), - _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1), -}; - -#undef _Q1A -#undef _Q1B -#undef _M2A -#undef _M2B - -/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */ -static const uint64_t 
nvc0_read_hw_sm_counters_code[] = -{ - /* mov b32 $r8 $tidx - * mov b32 $r9 $physid - * mov b32 $r0 $pm0 - * mov b32 $r1 $pm1 - * mov b32 $r2 $pm2 - * mov b32 $r3 $pm3 - * mov b32 $r4 $pm4 - * mov b32 $r5 $pm5 - * mov b32 $r6 $pm6 - * mov b32 $r7 $pm7 - * set $p0 0x1 eq u32 $r8 0x0 - * mov b32 $r10 c0[0x0] - * mov b32 $r11 c0[0x4] - * ext u32 $r8 $r9 0x414 - * (not $p0) exit - * mul $r8 u32 $r8 u32 36 - * add b32 $r10 $c $r10 $r8 - * add b32 $r11 $r11 0x0 $c - * mov b32 $r8 c0[0x8] - * st b128 wt g[$r10d+0x00] $r0q - * st b128 wt g[$r10d+0x10] $r4q - * st b32 wt g[$r10d+0x20] $r8 - * exit */ - 0x2c00000084021c04ULL, - 0x2c0000000c025c04ULL, - 0x2c00000010001c04ULL, - 0x2c00000014005c04ULL, - 0x2c00000018009c04ULL, - 0x2c0000001c00dc04ULL, - 0x2c00000020011c04ULL, - 0x2c00000024015c04ULL, - 0x2c00000028019c04ULL, - 0x2c0000002c01dc04ULL, - 0x190e0000fc81dc03ULL, - 0x2800400000029de4ULL, - 0x280040001002dde4ULL, - 0x7000c01050921c03ULL, - 0x80000000000021e7ULL, - 0x1000000090821c02ULL, - 0x4801000020a29c03ULL, - 0x0800000000b2dc42ULL, - 0x2800400020021de4ULL, - 0x9400000000a01fc5ULL, - 0x9400000040a11fc5ULL, - 0x9400000080a21f85ULL, - 0x8000000000001de7ULL -}; - -static const char *nvc0_pm_query_names[] = -{ - /* MP counters */ - "active_cycles", - "active_warps", - "atom_count", - "branch", - "divergent_branch", - "gld_request", - "gred_count", - "gst_request", - "inst_executed", - "inst_issued1_0", - "inst_issued1_1", - "inst_issued2_0", - "inst_issued2_1", - "local_load", - "local_store", - "prof_trigger_00", - "prof_trigger_01", - "prof_trigger_02", - "prof_trigger_03", - "prof_trigger_04", - "prof_trigger_05", - "prof_trigger_06", - "prof_trigger_07", - "shared_load", - "shared_store", - "threads_launched", - "thread_inst_executed_0", - "thread_inst_executed_1", - "thread_inst_executed_2", - "thread_inst_executed_3", - "warps_launched", -}; - -#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_HW_SM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 
0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } } - -static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] = -{ - _Q(ACTIVE_CYCLES, 0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(ACTIVE_WARPS, 0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65), - _Q(ATOM_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(BRANCH, 0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00), - _Q(DIVERGENT_BRANCH, 0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00), - _Q(GLD_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(GRED_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(GST_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_EXECUTED, 0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00), - _Q(INST_ISSUED1_0, 0xaaaa, LOGOP, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_ISSUED1_1, 0xaaaa, LOGOP, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_ISSUED2_0, 0xaaaa, LOGOP, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_ISSUED2_1, 0xaaaa, LOGOP, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(LOCAL_LD, 0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(LOCAL_ST, 0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_0, 0xaaaa, LOGOP, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_1, 0xaaaa, LOGOP, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_2, 0xaaaa, LOGOP, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_3, 0xaaaa, LOGOP, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_4, 0xaaaa, LOGOP, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_5, 0xaaaa, LOGOP, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_6, 0xaaaa, LOGOP, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_7, 0xaaaa, LOGOP, 
0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(SHARED_LD, 0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(SHARED_ST, 0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(THREADS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65), - _Q(TH_INST_EXECUTED_0, 0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(TH_INST_EXECUTED_1, 0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(TH_INST_EXECUTED_2, 0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(TH_INST_EXECUTED_3, 0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(WARPS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), -}; - -#undef _Q - -static const struct nvc0_hw_sm_query_cfg * -nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_query *q) -{ - struct nvc0_screen *screen = nvc0->screen; - - if (screen->base.class_3d >= NVE4_3D_CLASS) - return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; - return &nvc0_hw_sm_queries[q->type - NVC0_HW_SM_QUERY(0)]; -} - -boolean -nvc0_hw_sm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q) -{ - struct nvc0_screen *screen = nvc0->screen; - struct nouveau_pushbuf *push = nvc0->base.pushbuf; - const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; - const struct nvc0_hw_sm_query_cfg *cfg; - unsigned i, c; - unsigned num_ab[2] = { 0, 0 }; - - cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); - - /* check if we have enough free counter slots */ - for (i = 0; i < cfg->num_counters; ++i) - num_ab[cfg->ctr[i].sig_dom]++; - - if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 || - screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) { - NOUVEAU_ERR("Not enough free MP counter slots !\n"); - return false; - } - - assert(cfg->num_counters <= 4); - PUSH_SPACE(push, 4 * 8 * (is_nve4 ? 
1 : 6) + 6); - - if (!screen->pm.mp_counters_enabled) { - screen->pm.mp_counters_enabled = true; - BEGIN_NVC0(push, SUBC_SW(0x06ac), 1); - PUSH_DATA (push, 0x1fcb); - } - - /* set sequence field to 0 (used to check if result is available) */ - for (i = 0; i < screen->mp_count; ++i) - q->data[i * 10 + 10] = 0; - - for (i = 0; i < cfg->num_counters; ++i) { - const unsigned d = cfg->ctr[i].sig_dom; - - if (!screen->pm.num_hw_sm_active[d]) { - uint32_t m = (1 << 22) | (1 << (7 + (8 * !d))); - if (screen->pm.num_hw_sm_active[!d]) - m |= 1 << (7 + (8 * d)); - BEGIN_NVC0(push, SUBC_SW(0x0600), 1); - PUSH_DATA (push, m); - } - screen->pm.num_hw_sm_active[d]++; - - for (c = d * 4; c < (d * 4 + 4); ++c) { - if (!screen->pm.mp_counter[c]) { - q->ctr[i] = c; - screen->pm.mp_counter[c] = (struct pipe_query *)q; - break; - } - } - assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */ - - /* configure and reset the counter(s) */ - if (is_nve4) { - if (d == 0) - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1); - else - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1); - PUSH_DATA (push, cfg->ctr[i].sig_sel); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1); - PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3)); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1); - PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1); - PUSH_DATA (push, 0); - } else { - unsigned s; - - for (s = 0; s < cfg->ctr[i].num_src; s++) { - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1); - PUSH_DATA (push, cfg->ctr[i].sig_sel); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1); - PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1); - PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1); - PUSH_DATA (push, 0); - } - } - } - return true; -} - -static void -nvc0_hw_sm_query_end(struct 
nvc0_context *nvc0, struct nvc0_query *q) -{ - struct nvc0_screen *screen = nvc0->screen; - struct pipe_context *pipe = &nvc0->base.pipe; - struct nouveau_pushbuf *push = nvc0->base.pushbuf; - const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; - uint32_t mask; - uint32_t input[3]; - const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 }; - const uint grid[3] = { screen->mp_count, 1, 1 }; - unsigned c; - const struct nvc0_hw_sm_query_cfg *cfg; - - cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); - - if (unlikely(!screen->pm.prog)) { - struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program); - prog->type = PIPE_SHADER_COMPUTE; - prog->translated = true; - prog->num_gprs = 14; - prog->parm_size = 12; - if (is_nve4) { - prog->code = (uint32_t *)nve4_read_hw_sm_counters_code; - prog->code_size = sizeof(nve4_read_hw_sm_counters_code); - } else { - prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code; - prog->code_size = sizeof(nvc0_read_hw_sm_counters_code); - } - screen->pm.prog = prog; - } - - /* disable all counting */ - PUSH_SPACE(push, 8); - for (c = 0; c < 8; ++c) - if (screen->pm.mp_counter[c]) { - if (is_nve4) { - IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0); - } else { - IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0); - } - } - /* release counters for this query */ - for (c = 0; c < 8; ++c) { - if (nvc0_query(screen->pm.mp_counter[c]) == q) { - screen->pm.num_hw_sm_active[c / 4]--; - screen->pm.mp_counter[c] = NULL; - } - } - - BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR, - q->bo); - - PUSH_SPACE(push, 1); - IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0); - - pipe->bind_compute_state(pipe, screen->pm.prog); - input[0] = (q->bo->offset + q->base); - input[1] = (q->bo->offset + q->base) >> 32; - input[2] = q->sequence; - pipe->launch_grid(pipe, block, grid, 0, input); - - nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY); - - /* re-activate other counters */ - PUSH_SPACE(push, 16); - mask = 0; - for (c = 0; c < 8; 
++c) { - unsigned i; - q = nvc0_query(screen->pm.mp_counter[c]); - if (!q) - continue; - cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); - for (i = 0; i < cfg->num_counters; ++i) { - if (mask & (1 << q->ctr[i])) - break; - mask |= 1 << q->ctr[i]; - if (is_nve4) { - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(q->ctr[i])), 1); - } else { - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(q->ctr[i])), 1); - } - PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); - } - } -} - -static inline bool -nvc0_hw_sm_query_read_data(uint32_t count[32][4], - struct nvc0_context *nvc0, bool wait, - struct nvc0_query *q, - const struct nvc0_hw_sm_query_cfg *cfg, - unsigned mp_count) -{ - unsigned p, c; - - for (p = 0; p < mp_count; ++p) { - const unsigned b = (0x24 / 4) * p; - - for (c = 0; c < cfg->num_counters; ++c) { - if (q->data[b + 8] != q->sequence) { - if (!wait) - return false; - if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client)) - return false; - } - count[p][c] = q->data[b + q->ctr[c]]; - } - } - return true; -} - -static inline bool -nve4_hw_sm_query_read_data(uint32_t count[32][4], - struct nvc0_context *nvc0, bool wait, - struct nvc0_query *q, - const struct nvc0_hw_sm_query_cfg *cfg, - unsigned mp_count) -{ - unsigned p, c, d; - - for (p = 0; p < mp_count; ++p) { - const unsigned b = (0x60 / 4) * p; - - for (c = 0; c < cfg->num_counters; ++c) { - count[p][c] = 0; - for (d = 0; d < ((q->ctr[c] & ~3) ? 1 : 4); ++d) { - if (q->data[b + 20 + d] != q->sequence) { - if (!wait) - return false; - if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client)) - return false; - } - if (q->ctr[c] & ~0x3) - count[p][c] = q->data[b + 16 + (q->ctr[c] & 3)]; - else - count[p][c] += q->data[b + d * 4 + q->ctr[c]]; - } - } - } - return true; -} - -/* Metric calculations: - * sum(x) ... sum of x over all MPs - * avg(x) ... 
average of x over all MPs - * - * IPC : sum(inst_executed) / clock - * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued) - * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles) - * MP_EFFICIENCY : avg(active_cycles / clock) - * - * NOTE: Interpretation of IPC requires knowledge of MP count. - */ -static boolean -nvc0_hw_sm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, - void *result, boolean wait) -{ - uint32_t count[32][4]; - uint64_t value = 0; - unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32); - unsigned p, c; - const struct nvc0_hw_sm_query_cfg *cfg; - bool ret; - - cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); - - if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) - ret = nve4_hw_sm_query_read_data(count, nvc0, wait, q, cfg, mp_count); - else - ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, q, cfg, mp_count); - if (!ret) - return false; - - if (cfg->op == NVC0_COUNTER_OPn_SUM) { - for (c = 0; c < cfg->num_counters; ++c) - for (p = 0; p < mp_count; ++p) - value += count[p][c]; - value = (value * cfg->norm[0]) / cfg->norm[1]; - } else - if (cfg->op == NVC0_COUNTER_OPn_OR) { - uint32_t v = 0; - for (c = 0; c < cfg->num_counters; ++c) - for (p = 0; p < mp_count; ++p) - v |= count[p][c]; - value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1]; - } else - if (cfg->op == NVC0_COUNTER_OPn_AND) { - uint32_t v = ~0; - for (c = 0; c < cfg->num_counters; ++c) - for (p = 0; p < mp_count; ++p) - v &= count[p][c]; - value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1]; - } else - if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) { - uint64_t v[2] = { 0, 0 }; - for (p = 0; p < mp_count; ++p) { - v[0] += count[p][0]; - v[1] += count[p][1]; - } - if (v[0]) - value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]); - } else - if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) { - for (p = 0; p < mp_count; ++p) - value += count[p][0]; - if (count[0][1]) - value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]); - else - 
value = 0; - } else - if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) { - unsigned mp_used = 0; - for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0]) - if (count[p][1]) - value += (count[p][0] * cfg->norm[0]) / count[p][1]; - if (mp_used) - value /= (uint64_t)mp_used * cfg->norm[1]; - } else - if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) { - unsigned mp_used = 0; - for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0]) - value += count[p][0]; - if (count[0][1] && mp_used) { - value *= cfg->norm[0]; - value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1]; - } else { - value = 0; - } - } - - *(uint64_t *)result = value; - return true; + PUSH_DATAh(push, hq->bo->offset + hq->offset); + PUSH_DATA (push, hq->bo->offset + hq->offset); } int @@ -1403,24 +147,13 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen, struct pipe_driver_query_info *info) { struct nvc0_screen *screen = nvc0_screen(pscreen); - int count = 0; + int num_sw_queries = 0, num_hw_queries = 0; - count += NVC0_QUERY_DRV_STAT_COUNT; - - if (screen->base.device->drm_version >= 0x01000101) { - if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - count += NVE4_HW_SM_QUERY_COUNT; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { - /* NVC0_COMPUTE is not always enabled */ - count += NVC0_HW_SM_QUERY_COUNT; - } - } - } + num_sw_queries = nvc0_sw_get_driver_query_info(screen, 0, NULL); + num_hw_queries = nvc0_hw_get_driver_query_info(screen, 0, NULL); if (!info) - return count; + return num_sw_queries + num_hw_queries; /* Init default values. 
*/ info->name = "this_is_not_the_query_you_are_looking_for"; @@ -1430,36 +163,11 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen, info->group_id = -1; #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - if (id < NVC0_QUERY_DRV_STAT_COUNT) { - info->name = nvc0_drv_stat_names[id]; - info->query_type = NVC0_QUERY_DRV_STAT(id); - info->max_value.u64 = 0; - if (strstr(info->name, "bytes")) - info->type = PIPE_DRIVER_QUERY_TYPE_BYTES; - info->group_id = NVC0_QUERY_DRV_STAT_GROUP; - return 1; - } else + if (id < num_sw_queries) + return nvc0_sw_get_driver_query_info(screen, id, info); #endif - if (id < count) { - if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - info->name = nve4_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT]; - info->query_type = NVE4_HW_SM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT); - info->max_value.u64 = - (id < NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100; - info->group_id = NVC0_QUERY_MP_COUNTER_GROUP; - return 1; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { - info->name = nvc0_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT]; - info->query_type = NVC0_HW_SM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT); - info->group_id = NVC0_QUERY_MP_COUNTER_GROUP; - return 1; - } - } - } - /* user asked for info about non-existing query */ - return 0; + + return nvc0_hw_get_driver_query_info(screen, id - num_sw_queries, info); } int @@ -1480,7 +188,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, count++; } else if (screen->base.class_3d < NVE4_3D_CLASS) { - count++; /* NVC0_COMPUTE is not always enabled */ + count++; } } } @@ -1488,37 +196,35 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, if (!info) return count; - if (id == NVC0_QUERY_MP_COUNTER_GROUP) { + if (id == NVC0_HW_SM_QUERY_GROUP) { if (screen->compute) { info->name = "MP counters"; info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU; + /* Because we can't expose the number of hardware counters needed for + * each different 
query, we don't want to allow more than one active + * query simultaneously to avoid failure when the maximum number of + * counters is reached. Note that these groups of GPU counters are + * currently only used by AMD_performance_monitor. + */ + info->max_active_queries = 1; + if (screen->base.class_3d == NVE4_3D_CLASS) { info->num_queries = NVE4_HW_SM_QUERY_COUNT; - - /* On NVE4+, each multiprocessor have 8 hardware counters separated - * in two distinct domains, but we allow only one active query - * simultaneously because some of them use more than one hardware - * counter and this will result in an undefined behaviour. */ - info->max_active_queries = 1; /* TODO: handle multiple hw counters */ - return 1; + return 1; } else if (screen->base.class_3d < NVE4_3D_CLASS) { info->num_queries = NVC0_HW_SM_QUERY_COUNT; - - /* On NVC0:NVE4, each multiprocessor have 8 hardware counters - * in a single domain. */ - info->max_active_queries = 8; return 1; } } } #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - else if (id == NVC0_QUERY_DRV_STAT_GROUP) { + else if (id == NVC0_SW_QUERY_DRV_STAT_GROUP) { info->name = "Driver statistics"; info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_CPU; - info->max_active_queries = NVC0_QUERY_DRV_STAT_COUNT; - info->num_queries = NVC0_QUERY_DRV_STAT_COUNT; + info->max_active_queries = NVC0_SW_QUERY_DRV_STAT_COUNT; + info->num_queries = NVC0_SW_QUERY_DRV_STAT_COUNT; return 1; } #endif @@ -1536,10 +242,10 @@ nvc0_init_query_functions(struct nvc0_context *nvc0) { struct pipe_context *pipe = &nvc0->base.pipe; - pipe->create_query = nvc0_query_create; - pipe->destroy_query = nvc0_query_destroy; - pipe->begin_query = nvc0_query_begin; - pipe->end_query = nvc0_query_end; - pipe->get_query_result = nvc0_query_result; + pipe->create_query = nvc0_create_query; + pipe->destroy_query = nvc0_destroy_query; + pipe->begin_query = nvc0_begin_query; + pipe->end_query = nvc0_end_query; + pipe->get_query_result = nvc0_get_query_result; pipe->render_condition = 
nvc0_render_condition; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h new file mode 100644 index 00000000000..6883ab6ab9d --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h @@ -0,0 +1,39 @@ +#ifndef __NVC0_QUERY_H__ +#define __NVC0_QUERY_H__ + +#include "pipe/p_context.h" + +#include "nouveau_context.h" + +struct nvc0_context; +struct nvc0_query; + +struct nvc0_query_funcs { + void (*destroy_query)(struct nvc0_context *, struct nvc0_query *); + boolean (*begin_query)(struct nvc0_context *, struct nvc0_query *); + void (*end_query)(struct nvc0_context *, struct nvc0_query *); + boolean (*get_query_result)(struct nvc0_context *, struct nvc0_query *, + boolean, union pipe_query_result *); +}; + +struct nvc0_query { + const struct nvc0_query_funcs *funcs; + uint16_t type; + uint16_t index; +}; + +static inline struct nvc0_query * +nvc0_query(struct pipe_query *pipe) +{ + return (struct nvc0_query *)pipe; +} + +/* + * Driver queries groups: + */ +#define NVC0_HW_SM_QUERY_GROUP 0 +#define NVC0_SW_QUERY_DRV_STAT_GROUP 1 + +void nvc0_init_query_functions(struct nvc0_context *); + +#endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c new file mode 100644 index 00000000000..90ee82f21e5 --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c @@ -0,0 +1,491 @@ +/* + * Copyright 2011 Christoph Bumiller + * Copyright 2015 Samuel Pitoiset + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice 
and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#define NVC0_PUSH_EXPLICIT_SPACE_CHECKING + +#include "nvc0/nvc0_context.h" +#include "nvc0/nvc0_query_hw.h" +#include "nvc0/nvc0_query_hw_metric.h" +#include "nvc0/nvc0_query_hw_sm.h" + +#define NVC0_HW_QUERY_STATE_READY 0 +#define NVC0_HW_QUERY_STATE_ACTIVE 1 +#define NVC0_HW_QUERY_STATE_ENDED 2 +#define NVC0_HW_QUERY_STATE_FLUSHED 3 + +#define NVC0_HW_QUERY_ALLOC_SPACE 256 + +bool +nvc0_hw_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, + int size) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + struct nvc0_screen *screen = nvc0->screen; + int ret; + + if (hq->bo) { + nouveau_bo_ref(NULL, &hq->bo); + if (hq->mm) { + if (hq->state == NVC0_HW_QUERY_STATE_READY) + nouveau_mm_free(hq->mm); + else + nouveau_fence_work(screen->base.fence.current, + nouveau_mm_free_work, hq->mm); + } + } + if (size) { + hq->mm = nouveau_mm_allocate(screen->base.mm_GART, size, &hq->bo, + &hq->base_offset); + if (!hq->bo) + return false; + hq->offset = hq->base_offset; + + ret = nouveau_bo_map(hq->bo, 0, screen->base.client); + if (ret) { + nvc0_hw_query_allocate(nvc0, q, 0); + return false; + } + hq->data = (uint32_t *)((uint8_t *)hq->bo->map + hq->base_offset); + } + return true; +} + +static void +nvc0_hw_query_get(struct nouveau_pushbuf *push, struct nvc0_query *q, + unsigned offset, uint32_t get) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + + offset += 
hq->offset; + + PUSH_SPACE(push, 5); + PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR); + BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 4); + PUSH_DATAh(push, hq->bo->offset + offset); + PUSH_DATA (push, hq->bo->offset + offset); + PUSH_DATA (push, hq->sequence); + PUSH_DATA (push, get); +} + /* Advance the query to its next rotate-sized slot; once the whole NVC0_HW_QUERY_ALLOC_SPACE bytes have been consumed a fresh buffer is allocated. */ +static void +nvc0_hw_query_rotate(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + + hq->offset += hq->rotate; + hq->data += hq->rotate / sizeof(*hq->data); + if (hq->offset - hq->base_offset == NVC0_HW_QUERY_ALLOC_SPACE) + nvc0_hw_query_allocate(nvc0, q, NVC0_HW_QUERY_ALLOC_SPACE); +} + /* Mark the query READY when its result is available: 64-bit queries check their fence, the others compare data[0] against the expected sequence number. */ +static inline void +nvc0_hw_query_update(struct nouveau_client *cli, struct nvc0_query *q) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + + if (hq->is64bit) { + if (nouveau_fence_signalled(hq->fence)) + hq->state = NVC0_HW_QUERY_STATE_READY; + } else { + if (hq->data[0] == hq->sequence) + hq->state = NVC0_HW_QUERY_STATE_READY; + } +} + /* Release the query's buffer (allocate with size 0) and its fence reference, then free the query itself. */ +static void +nvc0_hw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + nvc0_hw_query_allocate(nvc0, q, 0); + nouveau_fence_ref(NULL, &hq->fence); + FREE(hq); +} + +static boolean +nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_hw_query *hq = nvc0_hw_query(q); + bool ret = true; + + if (hq->funcs && hq->funcs->begin_query) + return hq->funcs->begin_query(nvc0, hq); + + /* For occlusion queries we have to change the storage, because a previous + * query might set the initial render condition to false even *after* we re- + * initialized it to true. + */ + if (hq->rotate) { + nvc0_hw_query_rotate(nvc0, q); + + /* XXX: can we do this with the GPU, and sync with respect to a previous + * query ? 
+ */ + hq->data[0] = hq->sequence; /* initialize sequence */ + hq->data[1] = 1; /* initial render condition = true */ + hq->data[4] = hq->sequence + 1; /* for comparison COND_MODE */ + hq->data[5] = 0; + } + hq->sequence++; + + switch (q->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + hq->nesting = nvc0->screen->num_occlusion_queries_active++; + if (hq->nesting) { + nvc0_hw_query_get(push, q, 0x10, 0x0100f002); + } else { + PUSH_SPACE(push, 3); + BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1); + PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT); + IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1); + } + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + nvc0_hw_query_get(push, q, 0x10, 0x09005002 | (q->index << 5)); + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + nvc0_hw_query_get(push, q, 0x10, 0x05805002 | (q->index << 5)); + break; + case PIPE_QUERY_SO_STATISTICS: + nvc0_hw_query_get(push, q, 0x20, 0x05805002 | (q->index << 5)); + nvc0_hw_query_get(push, q, 0x30, 0x06805002 | (q->index << 5)); + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + nvc0_hw_query_get(push, q, 0x10, 0x03005002 | (q->index << 5)); + break; + case PIPE_QUERY_TIME_ELAPSED: + nvc0_hw_query_get(push, q, 0x10, 0x00005002); + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + nvc0_hw_query_get(push, q, 0xc0 + 0x00, 0x00801002); /* VFETCH, VERTICES */ + nvc0_hw_query_get(push, q, 0xc0 + 0x10, 0x01801002); /* VFETCH, PRIMS */ + nvc0_hw_query_get(push, q, 0xc0 + 0x20, 0x02802002); /* VP, LAUNCHES */ + nvc0_hw_query_get(push, q, 0xc0 + 0x30, 0x03806002); /* GP, LAUNCHES */ + nvc0_hw_query_get(push, q, 0xc0 + 0x40, 0x04806002); /* GP, PRIMS_OUT */ + nvc0_hw_query_get(push, q, 0xc0 + 0x50, 0x07804002); /* RAST, PRIMS_IN */ + nvc0_hw_query_get(push, q, 0xc0 + 0x60, 0x08804002); /* RAST, PRIMS_OUT */ + nvc0_hw_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */ + nvc0_hw_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */ + nvc0_hw_query_get(push, 
q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */ + break; + default: + break; + } + hq->state = NVC0_HW_QUERY_STATE_ACTIVE; + return ret; +} + +static void +nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_hw_query *hq = nvc0_hw_query(q); + + if (hq->funcs && hq->funcs->end_query) { + hq->funcs->end_query(nvc0, hq); + return; + } + + if (hq->state != NVC0_HW_QUERY_STATE_ACTIVE) { + /* some queries don't require 'begin' to be called (e.g. GPU_FINISHED) */ + if (hq->rotate) + nvc0_hw_query_rotate(nvc0, q); + hq->sequence++; + } + hq->state = NVC0_HW_QUERY_STATE_ENDED; + + switch (q->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + nvc0_hw_query_get(push, q, 0, 0x0100f002); + if (--nvc0->screen->num_occlusion_queries_active == 0) { + PUSH_SPACE(push, 1); + IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0); + } + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + nvc0_hw_query_get(push, q, 0, 0x09005002 | (q->index << 5)); + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + nvc0_hw_query_get(push, q, 0, 0x05805002 | (q->index << 5)); + break; + case PIPE_QUERY_SO_STATISTICS: + nvc0_hw_query_get(push, q, 0x00, 0x05805002 | (q->index << 5)); + nvc0_hw_query_get(push, q, 0x10, 0x06805002 | (q->index << 5)); + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + /* TODO: How do we sum over all streams for render condition ? 
*/ + /* PRIMS_DROPPED doesn't write sequence, use a ZERO query to sync on */ + nvc0_hw_query_get(push, q, 0x00, 0x03005002 | (q->index << 5)); + nvc0_hw_query_get(push, q, 0x20, 0x00005002); + break; + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_TIME_ELAPSED: + nvc0_hw_query_get(push, q, 0, 0x00005002); + break; + case PIPE_QUERY_GPU_FINISHED: + nvc0_hw_query_get(push, q, 0, 0x1000f010); + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + nvc0_hw_query_get(push, q, 0x00, 0x00801002); /* VFETCH, VERTICES */ + nvc0_hw_query_get(push, q, 0x10, 0x01801002); /* VFETCH, PRIMS */ + nvc0_hw_query_get(push, q, 0x20, 0x02802002); /* VP, LAUNCHES */ + nvc0_hw_query_get(push, q, 0x30, 0x03806002); /* GP, LAUNCHES */ + nvc0_hw_query_get(push, q, 0x40, 0x04806002); /* GP, PRIMS_OUT */ + nvc0_hw_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */ + nvc0_hw_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */ + nvc0_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */ + nvc0_hw_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */ + nvc0_hw_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */ + break; + case PIPE_QUERY_TIMESTAMP_DISJOINT: + /* This query is not issued on GPU because disjoint is forced to false */ + hq->state = NVC0_HW_QUERY_STATE_READY; + break; + case NVC0_HW_QUERY_TFB_BUFFER_OFFSET: + /* indexed by TFB buffer instead of by vertex stream */ + nvc0_hw_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5)); + break; + default: + break; + } + if (hq->is64bit) + nouveau_fence_ref(nvc0->screen->base.fence.current, &hq->fence); +} + +static boolean +nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, + boolean wait, union pipe_query_result *result) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + uint64_t *res64 = (uint64_t*)result; + uint32_t *res32 = (uint32_t*)result; + uint8_t *res8 = (uint8_t*)result; + uint64_t *data64 = (uint64_t *)hq->data; + unsigned i; + + if (hq->funcs && hq->funcs->get_query_result) + 
return hq->funcs->get_query_result(nvc0, hq, wait, result); + + if (hq->state != NVC0_HW_QUERY_STATE_READY) + nvc0_hw_query_update(nvc0->screen->base.client, q); + + if (hq->state != NVC0_HW_QUERY_STATE_READY) { + if (!wait) { + if (hq->state != NVC0_HW_QUERY_STATE_FLUSHED) { + hq->state = NVC0_HW_QUERY_STATE_FLUSHED; + /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */ + PUSH_KICK(nvc0->base.pushbuf); + } + return false; + } + if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->screen->base.client)) + return false; + NOUVEAU_DRV_STAT(&nvc0->screen->base, query_sync_count, 1); + } + hq->state = NVC0_HW_QUERY_STATE_READY; + + switch (q->type) { + case PIPE_QUERY_GPU_FINISHED: + res8[0] = true; + break; + case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */ + res64[0] = hq->data[1] - hq->data[5]; + break; + case PIPE_QUERY_OCCLUSION_PREDICATE: + res8[0] = hq->data[1] != hq->data[5]; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */ + case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */ + res64[0] = data64[0] - data64[2]; + break; + case PIPE_QUERY_SO_STATISTICS: + res64[0] = data64[0] - data64[4]; + res64[1] = data64[2] - data64[6]; + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + res8[0] = data64[0] != data64[2]; + break; + case PIPE_QUERY_TIMESTAMP: + res64[0] = data64[1]; + break; + case PIPE_QUERY_TIMESTAMP_DISJOINT: + res64[0] = 1000000000; + res8[8] = false; + break; + case PIPE_QUERY_TIME_ELAPSED: + res64[0] = data64[1] - data64[3]; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + for (i = 0; i < 10; ++i) + res64[i] = data64[i * 2] - data64[24 + i * 2]; + break; + case NVC0_HW_QUERY_TFB_BUFFER_OFFSET: + res32[0] = hq->data[1]; + break; + default: + assert(0); /* can't happen, we don't create queries with invalid type */ + return false; + } + + return true; +} + +static const struct nvc0_query_funcs hw_query_funcs = { + .destroy_query = nvc0_hw_destroy_query, + .begin_query = 
nvc0_hw_begin_query, + .end_query = nvc0_hw_end_query, + .get_query_result = nvc0_hw_get_query_result, +}; + +struct nvc0_query * +nvc0_hw_create_query(struct nvc0_context *nvc0, unsigned type, unsigned index) +{ + struct nvc0_hw_query *hq; + struct nvc0_query *q; + unsigned space = NVC0_HW_QUERY_ALLOC_SPACE; + + hq = nvc0_hw_sm_create_query(nvc0, type); + if (hq) { + hq->base.funcs = &hw_query_funcs; + return (struct nvc0_query *)hq; + } + + hq = nvc0_hw_metric_create_query(nvc0, type); + if (hq) { + hq->base.funcs = &hw_query_funcs; + return (struct nvc0_query *)hq; + } + + hq = CALLOC_STRUCT(nvc0_hw_query); + if (!hq) + return NULL; + + q = &hq->base; + q->funcs = &hw_query_funcs; + q->type = type; + + switch (q->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + hq->rotate = 32; + space = NVC0_HW_QUERY_ALLOC_SPACE; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + hq->is64bit = true; + space = 512; + break; + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + hq->is64bit = true; + space = 64; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_PRIMITIVES_EMITTED: + hq->is64bit = true; + q->index = index; + space = 32; + break; + case PIPE_QUERY_TIME_ELAPSED: + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_TIMESTAMP_DISJOINT: + case PIPE_QUERY_GPU_FINISHED: + space = 32; + break; + case NVC0_HW_QUERY_TFB_BUFFER_OFFSET: + space = 16; + break; + default: + debug_printf("invalid query type: %u\n", type); + FREE(q); + return NULL; + } + + if (!nvc0_hw_query_allocate(nvc0, q, space)) { + FREE(hq); + return NULL; + } + + if (hq->rotate) { + /* we advance before query_begin ! 
*/ + hq->offset -= hq->rotate; + hq->data -= hq->rotate / sizeof(*hq->data); + } else + if (!hq->is64bit) + hq->data[0] = 0; /* initialize sequence */ + + return q; +} + +int +nvc0_hw_get_driver_query_info(struct nvc0_screen *screen, unsigned id, + struct pipe_driver_query_info *info) +{ + int num_hw_sm_queries = 0, num_hw_metric_queries = 0; + + num_hw_sm_queries = nvc0_hw_sm_get_driver_query_info(screen, 0, NULL); + num_hw_metric_queries = + nvc0_hw_metric_get_driver_query_info(screen, 0, NULL); + + if (!info) + return num_hw_sm_queries + num_hw_metric_queries; + + if (id < num_hw_sm_queries) + return nvc0_hw_sm_get_driver_query_info(screen, id, info); + + return nvc0_hw_metric_get_driver_query_info(screen, + id - num_hw_sm_queries, info); +} + +void +nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push, + struct nvc0_query *q, unsigned result_offset) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + +#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8)) + + PUSH_REFN(push, hq->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART); + nouveau_pushbuf_space(push, 0, 0, 1); + nouveau_pushbuf_data(push, hq->bo, hq->offset + result_offset, 4 | + NVC0_IB_ENTRY_1_NO_PREFETCH); +} + +void +nvc0_hw_query_fifo_wait(struct nouveau_pushbuf *push, struct nvc0_query *q) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + unsigned offset = hq->offset; + + if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) offset += 0x20; + + PUSH_SPACE(push, 5); + PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD); + BEGIN_NVC0(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4); + PUSH_DATAh(push, hq->bo->offset + offset); + PUSH_DATA (push, hq->bo->offset + offset); + PUSH_DATA (push, hq->sequence); + PUSH_DATA (push, (1 << 12) | + NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL); +} diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h new file mode 100644 index 00000000000..3701eb7100f --- /dev/null +++ 
b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h @@ -0,0 +1,56 @@ +#ifndef __NVC0_QUERY_HW_H__ +#define __NVC0_QUERY_HW_H__ + +#include "nouveau_fence.h" +#include "nouveau_mm.h" + +#include "nvc0_query.h" + +#define NVC0_HW_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) + +struct nvc0_hw_query; + +struct nvc0_hw_query_funcs { + void (*destroy_query)(struct nvc0_context *, struct nvc0_hw_query *); + boolean (*begin_query)(struct nvc0_context *, struct nvc0_hw_query *); + void (*end_query)(struct nvc0_context *, struct nvc0_hw_query *); + boolean (*get_query_result)(struct nvc0_context *, struct nvc0_hw_query *, + boolean, union pipe_query_result *); +}; + +struct nvc0_hw_query { + struct nvc0_query base; + const struct nvc0_hw_query_funcs *funcs; + uint32_t *data; + uint32_t sequence; + struct nouveau_bo *bo; + uint32_t base_offset; + uint32_t offset; /* base_offset + i * rotate */ + uint8_t state; + boolean is64bit; + uint8_t rotate; + int nesting; /* only used for occlusion queries */ + struct nouveau_mm_allocation *mm; + struct nouveau_fence *fence; +}; + +static inline struct nvc0_hw_query * +nvc0_hw_query(struct nvc0_query *q) +{ + return (struct nvc0_hw_query *)q; +} + +struct nvc0_query * +nvc0_hw_create_query(struct nvc0_context *, unsigned, unsigned); +int +nvc0_hw_get_driver_query_info(struct nvc0_screen *, unsigned, + struct pipe_driver_query_info *); +bool +nvc0_hw_query_allocate(struct nvc0_context *, struct nvc0_query *, int); +void +nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *, struct nvc0_query *, + unsigned); +void +nvc0_hw_query_fifo_wait(struct nouveau_pushbuf *, struct nvc0_query *); + +#endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c new file mode 100644 index 00000000000..25aa09be42a --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c @@ -0,0 +1,440 @@ +/* + * Copyright 2015 Samuel Pitoiset + * + * Permission is hereby 
granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "nvc0/nvc0_context.h" +#include "nvc0/nvc0_query_hw_metric.h" +#include "nvc0/nvc0_query_hw_sm.h" + +/* === PERFORMANCE MONITORING METRICS for NVC0:NVE4 === */ +static const char *nvc0_hw_metric_names[] = +{ + "metric-achieved_occupancy", + "metric-branch_efficiency", + "metric-inst_issued", + "metric-inst_per_wrap", + "metric-inst_replay_overhead", + "metric-issued_ipc", + "metric-issue_slots", + "metric-issue_slot_utilization", + "metric-ipc", +}; + +struct nvc0_hw_metric_query_cfg { + uint32_t queries[8]; + uint32_t num_queries; +}; + +#define _SM(n) NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_ ##n) +#define _M(n, c) [NVC0_HW_METRIC_QUERY_##n] = c + +/* ==== Compute capability 2.0 (GF100/GF110) ==== */ +static const struct nvc0_hw_metric_query_cfg +sm20_achieved_occupancy = +{ + .queries[0] = _SM(ACTIVE_WARPS), + .queries[1] = _SM(ACTIVE_CYCLES), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg +sm20_branch_efficiency = +{ + .queries[0] = _SM(BRANCH), + .queries[1] = _SM(DIVERGENT_BRANCH), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg +sm20_inst_per_wrap = +{ + .queries[0] = _SM(INST_EXECUTED), + .queries[1] = _SM(WARPS_LAUNCHED), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg +sm20_inst_replay_overhead = +{ + .queries[0] = _SM(INST_ISSUED), + .queries[1] = _SM(INST_EXECUTED), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg +sm20_issued_ipc = +{ + .queries[0] = _SM(INST_ISSUED), + .queries[1] = _SM(ACTIVE_CYCLES), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg +sm20_ipc = +{ + .queries[0] = _SM(INST_EXECUTED), + .queries[1] = _SM(ACTIVE_CYCLES), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg *sm20_hw_metric_queries[] = +{ + _M(ACHIEVED_OCCUPANCY, &sm20_achieved_occupancy), + _M(BRANCH_EFFICIENCY, &sm20_branch_efficiency), + _M(INST_ISSUED, NULL), + _M(INST_PER_WRAP, &sm20_inst_per_wrap), + 
_M(INST_REPLAY_OVERHEAD, &sm20_inst_replay_overhead), + _M(ISSUED_IPC, &sm20_issued_ipc), + _M(ISSUE_SLOTS, NULL), + _M(ISSUE_SLOT_UTILIZATION, &sm20_issued_ipc), + _M(IPC, &sm20_ipc), +}; + +/* ==== Compute capability 2.1 (GF108+ except GF110) ==== */ +static const struct nvc0_hw_metric_query_cfg +sm21_inst_issued = +{ + .queries[0] = _SM(INST_ISSUED1_0), + .queries[1] = _SM(INST_ISSUED1_1), + .queries[2] = _SM(INST_ISSUED2_0), + .queries[3] = _SM(INST_ISSUED2_1), + .num_queries = 4, +}; + +static const struct nvc0_hw_metric_query_cfg +sm21_inst_replay_overhead = +{ + .queries[0] = _SM(INST_ISSUED1_0), + .queries[1] = _SM(INST_ISSUED1_1), + .queries[2] = _SM(INST_ISSUED2_0), + .queries[3] = _SM(INST_ISSUED2_1), + .queries[4] = _SM(INST_EXECUTED), + .num_queries = 5, +}; + +static const struct nvc0_hw_metric_query_cfg +sm21_issued_ipc = +{ + .queries[0] = _SM(INST_ISSUED1_0), + .queries[1] = _SM(INST_ISSUED1_1), + .queries[2] = _SM(INST_ISSUED2_0), + .queries[3] = _SM(INST_ISSUED2_1), + .queries[4] = _SM(ACTIVE_CYCLES), + .num_queries = 5, +}; + +static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] = +{ + _M(ACHIEVED_OCCUPANCY, &sm20_achieved_occupancy), + _M(BRANCH_EFFICIENCY, &sm20_branch_efficiency), + _M(INST_ISSUED, &sm21_inst_issued), + _M(INST_PER_WRAP, &sm20_inst_per_wrap), + _M(INST_REPLAY_OVERHEAD, &sm21_inst_replay_overhead), + _M(ISSUED_IPC, &sm21_issued_ipc), + _M(ISSUE_SLOTS, &sm21_inst_issued), + _M(ISSUE_SLOT_UTILIZATION, &sm21_issued_ipc), + _M(IPC, &sm20_ipc), +}; + +#undef _SM +#undef _M + +static inline const struct nvc0_hw_metric_query_cfg ** +nvc0_hw_metric_get_queries(struct nvc0_screen *screen) +{ + struct nouveau_device *dev = screen->base.device; + + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return sm20_hw_metric_queries; + return sm21_hw_metric_queries; +} + +static const struct nvc0_hw_metric_query_cfg * +nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0, + struct nvc0_hw_query *hq) +{ + const struct 
nvc0_hw_metric_query_cfg **queries; + struct nvc0_screen *screen = nvc0->screen; + struct nvc0_query *q = &hq->base; + + queries = nvc0_hw_metric_get_queries(screen); + return queries[q->type - NVC0_HW_METRIC_QUERY(0)]; +} + +static void +nvc0_hw_metric_destroy_query(struct nvc0_context *nvc0, + struct nvc0_hw_query *hq) +{ + struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq); + unsigned i; + + for (i = 0; i < hmq->num_queries; i++) + hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]); + FREE(hmq); +} + +static boolean +nvc0_hw_metric_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +{ + struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq); + boolean ret = false; + unsigned i; + + for (i = 0; i < hmq->num_queries; i++) { + ret = hmq->queries[i]->funcs->begin_query(nvc0, hmq->queries[i]); + if (!ret) + return ret; + } + return ret; +} + +static void +nvc0_hw_metric_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +{ + struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq); + unsigned i; + + for (i = 0; i < hmq->num_queries; i++) + hmq->queries[i]->funcs->end_query(nvc0, hmq->queries[i]); +} + +static uint64_t +sm20_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) +{ + switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) { + case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY: + /* (active_warps / active_cycles) / max. 
number of warps on a MP */ + if (res64[1]) + return (res64[0] / (double)res64[1]) / 48; + break; + case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY: + /* (branch / (branch + divergent_branch)) * 100 */ + if (res64[0] + res64[1]) + return (res64[0] / (double)(res64[0] + res64[1])) * 100; + break; + case NVC0_HW_METRIC_QUERY_INST_PER_WRAP: + /* inst_executed / warps_launched */ + if (res64[1]) + return res64[0] / (double)res64[1]; + break; + case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD: + /* (inst_issued - inst_executed) / inst_executed */ + if (res64[1]) + return (res64[0] - res64[1]) / (double)res64[1]; + break; + case NVC0_HW_METRIC_QUERY_ISSUED_IPC: + /* inst_issued / active_cycles */ + if (res64[1]) + return res64[0] / (double)res64[1]; + break; + case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION: + /* ((inst_issued / 2) / active_cycles) * 100 */ + if (res64[1]) + return ((res64[0] / 2) / (double)res64[1]) * 100; + break; + case NVC0_HW_METRIC_QUERY_IPC: + /* inst_executed / active_cycles */ + if (res64[1]) + return res64[0] / (double)res64[1]; + break; + default: + debug_printf("invalid metric type: %d\n", + hq->base.type - NVC0_HW_METRIC_QUERY(0)); + break; + } + return 0; +} + +static uint64_t +sm21_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) +{ + switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) { + case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY: + return sm20_hw_metric_calc_result(hq, res64); + case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY: + return sm20_hw_metric_calc_result(hq, res64); + case NVC0_HW_METRIC_QUERY_INST_ISSUED: + /* issued1_0 + issued1_1 + (issued2_0 + issued2_1) * 2 */ + return res64[0] + res64[1] + (res64[2] + res64[3]) * 2; + break; + case NVC0_HW_METRIC_QUERY_INST_PER_WRAP: + return sm20_hw_metric_calc_result(hq, res64); + case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD: + /* (metric-inst_issued - inst_executed) / inst_executed */ + if (res64[4]) + return (((res64[0] + res64[1] + (res64[2] + res64[3]) * 2) - + res64[4]) / 
(double)res64[4]); + break; + case NVC0_HW_METRIC_QUERY_ISSUED_IPC: + /* metric-inst_issued / active_cycles */ + if (res64[4]) + return (res64[0] + res64[1] + (res64[2] + res64[3]) * 2) / + (double)res64[4]; + break; + case NVC0_HW_METRIC_QUERY_ISSUE_SLOTS: + /* issued1_0 + issued1_1 + issued2_0 + issued2_1 */ + return res64[0] + res64[1] + res64[2] + res64[3]; + break; + case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION: + /* ((metric-issue_slots / 2) / active_cycles) * 100 */ + if (res64[4]) + return (((res64[0] + res64[1] + res64[2] + res64[3]) / 2) / + (double)res64[4]) * 100; + break; + case NVC0_HW_METRIC_QUERY_IPC: + return sm20_hw_metric_calc_result(hq, res64); + default: + debug_printf("invalid metric type: %d\n", + hq->base.type - NVC0_HW_METRIC_QUERY(0)); + break; + } + return 0; +} + +static boolean +nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0, + struct nvc0_hw_query *hq, boolean wait, + union pipe_query_result *result) +{ + struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq); + struct nvc0_screen *screen = nvc0->screen; + struct nouveau_device *dev = screen->base.device; + union pipe_query_result results[8] = {}; + uint64_t res64[8] = {}; + uint64_t value = 0; + boolean ret = false; + unsigned i; + + for (i = 0; i < hmq->num_queries; i++) { + ret = hmq->queries[i]->funcs->get_query_result(nvc0, hmq->queries[i], + wait, &results[i]); + if (!ret) + return ret; + res64[i] = *(uint64_t *)&results[i]; + } + + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + value = sm20_hw_metric_calc_result(hq, res64); + else + value = sm21_hw_metric_calc_result(hq, res64); + + *(uint64_t *)result = value; + return ret; +} + +static const struct nvc0_hw_query_funcs hw_metric_query_funcs = { + .destroy_query = nvc0_hw_metric_destroy_query, + .begin_query = nvc0_hw_metric_begin_query, + .end_query = nvc0_hw_metric_end_query, + .get_query_result = nvc0_hw_metric_get_query_result, +}; + +struct nvc0_hw_query * +nvc0_hw_metric_create_query(struct 
nvc0_context *nvc0, unsigned type) +{ + /* Build a metric query out of the HW SM sub-queries listed in the + * chipset's configuration. Returns NULL when type is not a metric query, + * when the metric has no configuration, or when an allocation fails. + */ + const struct nvc0_hw_metric_query_cfg *cfg; + struct nvc0_hw_metric_query *hmq; + struct nvc0_hw_query *hq; + unsigned i; + + if (type < NVC0_HW_METRIC_QUERY(0) || type > NVC0_HW_METRIC_QUERY_LAST) + return NULL; + + hmq = CALLOC_STRUCT(nvc0_hw_metric_query); + if (!hmq) + return NULL; + + hq = &hmq->base; + hq->funcs = &hw_metric_query_funcs; + hq->base.type = type; + + cfg = nvc0_hw_metric_query_get_cfg(nvc0, hq); + if (!cfg) { + /* The per-chipset table has NULL entries for metrics that are not + * available (e.g. inst_issued and issue_slots in the SM20 table); + * refuse to create such a query instead of dereferencing NULL. + */ + FREE(hmq); + return NULL; + } + + /* One HW SM sub-query per counter the metric is computed from. */ + for (i = 0; i < cfg->num_queries; i++) { + hmq->queries[i] = nvc0_hw_sm_create_query(nvc0, cfg->queries[i]); + if (!hmq->queries[i]) { + /* Destroys the sub-queries created so far and frees hmq. */ + nvc0_hw_metric_destroy_query(nvc0, hq); + return NULL; + } + hmq->num_queries++; + } + + return hq; +} + +/* Translate the id-th exposed metric into its index in the queries table + * by skipping over NULL (unavailable) entries. Callers are expected to pass + * an id smaller than the number of non-NULL entries. + */ +static int +nvc0_hw_metric_get_next_query_id(const struct nvc0_hw_metric_query_cfg **queries, + unsigned id) +{ + unsigned i, next = 0; + + for (i = 0; i < NVC0_HW_METRIC_QUERY_COUNT; i++) { + if (!queries[i]) { + next++; + } else + if (i >= id && queries[id + next]) { + break; + } + } + return id + next; +} + +/* With info == NULL, return the number of metric queries available on this + * screen; otherwise fill info for metric number id and return 1, or return 0 + * if id is out of range. Metrics are only counted when drm_version is at + * least 0x01000101, compute is enabled and the 3D class is below NVE4. + */ +int +nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id, + struct pipe_driver_query_info *info) +{ + uint16_t class_3d = screen->base.class_3d; + int count = 0; + + if (screen->base.device->drm_version >= 0x01000101) { + if (screen->compute) { + if (class_3d < NVE4_3D_CLASS) { + const struct nvc0_hw_metric_query_cfg **queries = + nvc0_hw_metric_get_queries(screen); + unsigned i; + + for (i = 0; i < NVC0_HW_METRIC_QUERY_COUNT; i++) { + if (queries[i]) + count++; + } + } + } + } + + if (!info) + return count; + + if (id < count) { + if (screen->compute) { + if (class_3d < NVE4_3D_CLASS) { + const struct nvc0_hw_metric_query_cfg **queries = + nvc0_hw_metric_get_queries(screen); + + id = nvc0_hw_metric_get_next_query_id(queries, id); + info->name = nvc0_hw_metric_names[id]; + info->query_type = NVC0_HW_METRIC_QUERY(id); + info->group_id = -1; + return 1; + } + } + } + return 0; +} diff --git 
a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h new file mode 100644 index 00000000000..95675fd19b7 --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h @@ -0,0 +1,42 @@
#ifndef __NVC0_QUERY_HW_METRIC_H__
#define __NVC0_QUERY_HW_METRIC_H__

#include "nvc0_query_hw.h"

/* A metric query: a hardware query whose value is derived from the results
 * of several other hardware queries.
 */
struct nvc0_hw_metric_query {
   /* must stay the first member: nvc0_hw_metric_query() below casts a
    * pointer to the base object back to the containing metric query */
   struct nvc0_hw_query base;
   /* the queries this metric is computed from (at most 8) */
   struct nvc0_hw_query *queries[8];
   /* number of valid entries in queries[] */
   unsigned num_queries;
};

/* Downcast a base hardware query to the metric query containing it. */
static inline struct nvc0_hw_metric_query *
nvc0_hw_metric_query(struct nvc0_hw_query *hq)
{
   return (struct nvc0_hw_metric_query *)hq;
}

/*
 * Driver metrics queries:
 */
/* query type of the i-th metric: offset 3072 into the driver-specific range */
#define NVC0_HW_METRIC_QUERY(i)   (PIPE_QUERY_DRIVER_SPECIFIC + 3072 + (i))
/* query type of the last metric of the enum below */
#define NVC0_HW_METRIC_QUERY_LAST  NVC0_HW_METRIC_QUERY(NVC0_HW_METRIC_QUERY_COUNT - 1)
/* Metric indices, to be passed to NVC0_HW_METRIC_QUERY().
 * NOTE(review): "INST_PER_WRAP" looks like a misspelling of "WARP"; renaming
 * it would have to be done together with every user of the enumerator.
 */
enum nvc0_hw_metric_queries
{
   NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY = 0,
   NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY,
   NVC0_HW_METRIC_QUERY_INST_ISSUED,
   NVC0_HW_METRIC_QUERY_INST_PER_WRAP,
   NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD,
   NVC0_HW_METRIC_QUERY_ISSUED_IPC,
   NVC0_HW_METRIC_QUERY_ISSUE_SLOTS,
   NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION,
   NVC0_HW_METRIC_QUERY_IPC,
   NVC0_HW_METRIC_QUERY_COUNT /* number of metrics, not a metric itself */
};

/* Create a metric query; the unsigned argument is the query type
 * (NVC0_HW_METRIC_QUERY(i)). */
struct nvc0_hw_query *
nvc0_hw_metric_create_query(struct nvc0_context *, unsigned);
/* Driver-query enumeration; the unsigned argument is the position of the
 * metric among the available ones. */
int
nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *, unsigned,
                                     struct pipe_driver_query_info *);
#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c new file mode 100644 index 00000000000..44b222e5134 --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -0,0 +1,1387 @@ +/* + * Copyright 2011 Christoph Bumiller + * Copyright 2015 Samuel Pitoiset + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without
restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#define NVC0_PUSH_EXPLICIT_SPACE_CHECKING + +#include "nvc0/nvc0_context.h" +#include "nvc0/nvc0_query_hw_sm.h" + +#include "nv_object.xml.h" +#include "nvc0/nve4_compute.xml.h" +#include "nvc0/nvc0_compute.xml.h" + +/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ + +/* NOTE: intentionally using the same names as NV */ +static const char *nve4_hw_sm_query_names[] = +{ + /* MP counters */ + "active_cycles", + "active_warps", + "atom_count", + "branch", + "divergent_branch", + "gld_request", + "global_ld_mem_divergence_replays", + "global_store_transaction", + "global_st_mem_divergence_replays", + "gred_count", + "gst_request", + "inst_executed", + "inst_issued", + "inst_issued1", + "inst_issued2", + "l1_global_load_hit", + "l1_global_load_miss", + "l1_local_load_hit", + "l1_local_load_miss", + "l1_local_store_hit", + "l1_local_store_miss", + "l1_shared_load_transactions", + "l1_shared_store_transactions", + "local_load", + "local_load_transactions", + "local_store", + "local_store_transactions", + "prof_trigger_00", + "prof_trigger_01", + "prof_trigger_02", + 
"prof_trigger_03", + "prof_trigger_04", + "prof_trigger_05", + "prof_trigger_06", + "prof_trigger_07", + "shared_load", + "shared_load_replay", + "shared_store", + "shared_store_replay", + "sm_cta_launched", + "threads_launched", + "uncached_global_load_transaction", + "warps_launched", + /* metrics, i.e. functions of the MP counters */ + "metric-ipc", /* inst_executed, clock */ + "metric-ipac", /* inst_executed, active_cycles */ + "metric-ipec", /* inst_executed, (bool)inst_executed */ + "metric-achieved_occupancy", /* active_warps, active_cycles */ + "metric-sm_efficiency", /* active_cycles, clock */ + "metric-inst_replay_overhead" /* inst_issued, inst_executed */ +}; + +/* Code to read out MP counters: They are accessible via mmio, too, but let's + * just avoid mapping registers in userspace. We'd have to know which MPs are + * enabled/present, too, and that information is not presently exposed. + * We could add a kernel interface for it, but reading the counters like this + * has the advantage of being async (if get_result isn't called immediately). 
+ */ +static const uint64_t nve4_read_hw_sm_counters_code[] = +{ + /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20 + * mov b32 $r8 $tidx + * mov b32 $r12 $physid + * mov b32 $r0 $pm0 + * mov b32 $r1 $pm1 + * mov b32 $r2 $pm2 + * mov b32 $r3 $pm3 + * mov b32 $r4 $pm4 + * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b + * mov b32 $r5 $pm5 + * mov b32 $r6 $pm6 + * mov b32 $r7 $pm7 + * set $p0 0x1 eq u32 $r8 0x0 + * mov b32 $r10 c0[0x0] + * ext u32 $r8 $r12 0x414 + * mov b32 $r11 c0[0x4] + * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04 + * ext u32 $r9 $r12 0x208 + * (not $p0) exit + * set $p1 0x1 eq u32 $r9 0x0 + * mul $r8 u32 $r8 u32 96 + * mul $r12 u32 $r9 u32 16 + * mul $r13 u32 $r9 u32 4 + * add b32 $r9 $r8 $r13 + * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c + * add b32 $r8 $r8 $r12 + * mov b32 $r12 $r10 + * add b32 $r10 $c $r10 $r8 + * mov b32 $r13 $r11 + * add b32 $r11 $r11 0x0 $c + * add b32 $r12 $c $r12 $r9 + * st b128 wt g[$r10d] $r0q + * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00 + * mov b32 $r0 c0[0x8] + * add b32 $r13 $r13 0x0 $c + * $p1 st b128 wt g[$r12d+0x40] $r4q + * st b32 wt g[$r12d+0x50] $r0 + * exit */ + 0x2202020202020207ULL, + 0x2c00000084021c04ULL, + 0x2c0000000c031c04ULL, + 0x2c00000010001c04ULL, + 0x2c00000014005c04ULL, + 0x2c00000018009c04ULL, + 0x2c0000001c00dc04ULL, + 0x2c00000020011c04ULL, + 0x22b0420042320207ULL, + 0x2c00000024015c04ULL, + 0x2c00000028019c04ULL, + 0x2c0000002c01dc04ULL, + 0x190e0000fc81dc03ULL, + 0x2800400000029de4ULL, + 0x7000c01050c21c03ULL, + 0x280040001002dde4ULL, + 0x204282020042e047ULL, + 0x7000c00820c25c03ULL, + 0x80000000000021e7ULL, + 0x190e0000fc93dc03ULL, + 0x1000000180821c02ULL, + 0x1000000040931c02ULL, + 0x1000000010935c02ULL, + 0x4800000034825c03ULL, + 0x22c042c042c04287ULL, + 0x4800000030821c03ULL, + 0x2800000028031de4ULL, + 0x4801000020a29c03ULL, + 0x280000002c035de4ULL, + 0x0800000000b2dc42ULL, + 0x4801000024c31c03ULL, + 0x9400000000a01fc5ULL, + 0x200002e04202c047ULL, + 0x2800400020001de4ULL, + 0x0800000000d35c42ULL, + 
0x9400000100c107c5ULL, + 0x9400000140c01f85ULL, + 0x8000000000001de7ULL +}; + +/* For simplicity, we will allocate as many group slots as we allocate counter + * slots. This means that a single counter which wants to source from 2 groups + * will have to be declared as using 2 counter slots. This shouldn't really be + * a problem because such queries don't make much sense ... (unless someone is + * really creative). + */ +struct nvc0_hw_sm_counter_cfg +{ + uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */ + uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */ + uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */ + uint32_t sig_sel : 8; /* signal group */ + uint32_t src_mask; /* mask for signal selection (only for NVC0:NVE4) */ + uint32_t src_sel; /* signal selection for up to 4 sources */ +}; + +#define NVC0_COUNTER_OPn_SUM 0 +#define NVC0_COUNTER_OPn_OR 1 +#define NVC0_COUNTER_OPn_AND 2 +#define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */ +#define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0]) */ +#define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */ +#define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0]) */ + +struct nvc0_hw_sm_query_cfg +{ + struct nvc0_hw_sm_counter_cfg ctr[8]; + uint8_t num_counters; + uint8_t op; + uint8_t norm[2]; /* normalization num,denom */ +}; + +#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } +#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } +#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ + { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \ + { f1, 
NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, 0, s1 }, \ + {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } +#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ + { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, 0, s0 }, \ + { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \ + {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } +#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ + { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \ + { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \ + {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } + +/* NOTES: + * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps + * inst_executed etc.: we only count a single warp scheduler + * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers; + * this is inaccurate ! 
+ */ +static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = +{ + _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1), + _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1), + _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1), + _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1), + _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1), + _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1), + _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1), + _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1), + _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1), + _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1), + _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1), + _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1), + _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1), + _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1), + _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1), + _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1), + _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1), + _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1), + _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1), + _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1), + _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1), + _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1), + _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1), + _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1), + _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1), + _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1), + _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1), + _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1), + _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1), + _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1), + _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1), + _Q1A(PROF_TRIGGER_4, 0x0001, B6, 
USER, 0x00000010, 1, 1), + _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1), + _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1), + _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1), + _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1), + _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1), + _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1), + _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1), + _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1), + _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1), + _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1), + _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1), + _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1), + _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1), + _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1), + _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1), + _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64), + _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1), +}; + +#undef _Q1A +#undef _Q1B +#undef _M2A +#undef _M2B + +/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */ +/* NOTES: + * - MP counters on GF100/GF110 (compute capability 2.0) are buggy + * because there is a context-switch problem that we need to fix. + * Results might be wrong sometimes, be careful! 
+ */ +static const char *nvc0_hw_sm_query_names[] = +{ + /* MP counters */ + "active_cycles", + "active_warps", + "atom_count", + "branch", + "divergent_branch", + "gld_request", + "gred_count", + "gst_request", + "inst_executed", + "inst_issued", + "inst_issued1_0", + "inst_issued1_1", + "inst_issued2_0", + "inst_issued2_1", + "local_load", + "local_store", + "prof_trigger_00", + "prof_trigger_01", + "prof_trigger_02", + "prof_trigger_03", + "prof_trigger_04", + "prof_trigger_05", + "prof_trigger_06", + "prof_trigger_07", + "shared_load", + "shared_store", + "threads_launched", + "thread_inst_executed_0", + "thread_inst_executed_1", + "thread_inst_executed_2", + "thread_inst_executed_3", + "warps_launched", +}; + +static const uint64_t nvc0_read_hw_sm_counters_code[] = +{ + /* mov b32 $r8 $tidx + * mov b32 $r9 $physid + * mov b32 $r0 $pm0 + * mov b32 $r1 $pm1 + * mov b32 $r2 $pm2 + * mov b32 $r3 $pm3 + * mov b32 $r4 $pm4 + * mov b32 $r5 $pm5 + * mov b32 $r6 $pm6 + * mov b32 $r7 $pm7 + * set $p0 0x1 eq u32 $r8 0x0 + * mov b32 $r10 c0[0x0] + * mov b32 $r11 c0[0x4] + * ext u32 $r8 $r9 0x414 + * (not $p0) exit + * mul $r8 u32 $r8 u32 48 + * add b32 $r10 $c $r10 $r8 + * add b32 $r11 $r11 0x0 $c + * mov b32 $r8 c0[0x8] + * st b128 wt g[$r10d+0x00] $r0q + * st b128 wt g[$r10d+0x10] $r4q + * st b32 wt g[$r10d+0x20] $r8 + * exit */ + 0x2c00000084021c04ULL, + 0x2c0000000c025c04ULL, + 0x2c00000010001c04ULL, + 0x2c00000014005c04ULL, + 0x2c00000018009c04ULL, + 0x2c0000001c00dc04ULL, + 0x2c00000020011c04ULL, + 0x2c00000024015c04ULL, + 0x2c00000028019c04ULL, + 0x2c0000002c01dc04ULL, + 0x190e0000fc81dc03ULL, + 0x2800400000029de4ULL, + 0x280040001002dde4ULL, + 0x7000c01050921c03ULL, + 0x80000000000021e7ULL, + 0x10000000c0821c02ULL, + 0x4801000020a29c03ULL, + 0x0800000000b2dc42ULL, + 0x2800400020021de4ULL, + 0x9400000000a01fc5ULL, + 0x9400000040a11fc5ULL, + 0x9400000080a21f85ULL, + 0x8000000000001de7ULL +}; + +#define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, 
s } +#define _Q(n, c) [NVC0_HW_SM_QUERY_##n] = c + +/* ==== Compute capability 2.0 (GF100/GF110) ==== */ +static const struct nvc0_hw_sm_query_cfg +sm20_active_cycles = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_active_warps = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010), + .ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020), + .ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030), + .ctr[3] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040), + .ctr[4] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050), + .ctr[5] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_atom_count = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_branch = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010), + .num_counters = 2, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_divergent_branch = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020), + .ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030), + .num_counters = 2, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_gld_request = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_gred_count = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg 
+sm20_gst_request = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_inst_executed = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000), + .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010), + .num_counters = 2, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_inst_issued = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060), + .ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070), + .num_counters = 2, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_local_ld = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_local_st = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_0 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_1 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_2 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_3 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_4 = +{ + .ctr[0] = 
_C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_5 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_6 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_7 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_shared_ld = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_shared_st = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_threads_launched = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010), + .ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020), + .ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030), + .ctr[3] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040), + .ctr[4] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050), + .ctr[5] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_th_inst_executed_0 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010), + .ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020), + .ctr[3] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000030), + .ctr[4] = _C(0xaaaa, 
LOGOP, 0x2f, 0x000000ff, 0x00000040), + .ctr[5] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000050), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_th_inst_executed_1 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010), + .ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020), + .ctr[3] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000030), + .ctr[4] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000040), + .ctr[5] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000050), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_warps_launched = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] = +{ + _Q(ACTIVE_CYCLES, &sm20_active_cycles), + _Q(ACTIVE_WARPS, &sm20_active_warps), + _Q(ATOM_COUNT, &sm20_atom_count), + _Q(BRANCH, &sm20_branch), + _Q(DIVERGENT_BRANCH, &sm20_divergent_branch), + _Q(GLD_REQUEST, &sm20_gld_request), + _Q(GRED_COUNT, &sm20_gred_count), + _Q(GST_REQUEST, &sm20_gst_request), + _Q(INST_EXECUTED, &sm20_inst_executed), + _Q(INST_ISSUED, &sm20_inst_issued), + _Q(INST_ISSUED1_0, NULL), + _Q(INST_ISSUED1_1, NULL), + _Q(INST_ISSUED2_0, NULL), + _Q(INST_ISSUED2_1, NULL), + _Q(LOCAL_LD, &sm20_local_ld), + _Q(LOCAL_ST, &sm20_local_st), + _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0), + _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1), + _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2), + _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3), + _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4), + _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5), + _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6), + _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7), + _Q(SHARED_LD, &sm20_shared_ld), + _Q(SHARED_ST, &sm20_shared_st), + _Q(THREADS_LAUNCHED, &sm20_threads_launched), + 
_Q(TH_INST_EXECUTED_0, &sm20_th_inst_executed_0), + _Q(TH_INST_EXECUTED_1, &sm20_th_inst_executed_1), + _Q(TH_INST_EXECUTED_2, NULL), + _Q(TH_INST_EXECUTED_3, NULL), + _Q(WARPS_LAUNCHED, &sm20_warps_launched), +}; + +/* ==== Compute capability 2.1 (GF108+ except GF110) ==== */ +static const struct nvc0_hw_sm_query_cfg +sm21_inst_executed = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010), + .ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020), + .num_counters = 3, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_inst_issued1_0 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_inst_issued1_1 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_inst_issued2_0 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_inst_issued2_1 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_th_inst_executed_0 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010), + .ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020), + .ctr[3] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030), + .ctr[4] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040), + .ctr[5] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg 
+sm21_th_inst_executed_1 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010), + .ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020), + .ctr[3] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030), + .ctr[4] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040), + .ctr[5] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_th_inst_executed_2 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010), + .ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020), + .ctr[3] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030), + .ctr[4] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040), + .ctr[5] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_th_inst_executed_3 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010), + .ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020), + .ctr[3] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030), + .ctr[4] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040), + .ctr[5] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] = +{ + _Q(ACTIVE_CYCLES, &sm20_active_cycles), + _Q(ACTIVE_WARPS, &sm20_active_warps), + _Q(ATOM_COUNT, &sm20_atom_count), + _Q(BRANCH, &sm20_branch), + _Q(DIVERGENT_BRANCH, &sm20_divergent_branch), + _Q(GLD_REQUEST, &sm20_gld_request), + _Q(GRED_COUNT, &sm20_gred_count), + _Q(GST_REQUEST, &sm20_gst_request), + _Q(INST_EXECUTED, &sm21_inst_executed), + _Q(INST_ISSUED, NULL), + _Q(INST_ISSUED1_0, &sm21_inst_issued1_0), + 
_Q(INST_ISSUED1_1, &sm21_inst_issued1_1), + _Q(INST_ISSUED2_0, &sm21_inst_issued2_0), + _Q(INST_ISSUED2_1, &sm21_inst_issued2_1), + _Q(LOCAL_LD, &sm20_local_ld), + _Q(LOCAL_ST, &sm20_local_st), + _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0), + _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1), + _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2), + _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3), + _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4), + _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5), + _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6), + _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7), + _Q(SHARED_LD, &sm20_shared_ld), + _Q(SHARED_ST, &sm20_shared_st), + _Q(THREADS_LAUNCHED, &sm20_threads_launched), + _Q(TH_INST_EXECUTED_0, &sm21_th_inst_executed_0), + _Q(TH_INST_EXECUTED_1, &sm21_th_inst_executed_1), + _Q(TH_INST_EXECUTED_2, &sm21_th_inst_executed_2), + _Q(TH_INST_EXECUTED_3, &sm21_th_inst_executed_3), + _Q(WARPS_LAUNCHED, &sm20_warps_launched), +}; + +#undef _Q +#undef _C + +static inline const struct nvc0_hw_sm_query_cfg ** +nvc0_hw_sm_get_queries(struct nvc0_screen *screen) +{ + struct nouveau_device *dev = screen->base.device; + + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return sm20_hw_sm_queries; + return sm21_hw_sm_queries; +} + +static const struct nvc0_hw_sm_query_cfg * +nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +{ + struct nvc0_screen *screen = nvc0->screen; + struct nvc0_query *q = &hq->base; + + if (screen->base.class_3d >= NVE4_3D_CLASS) + return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; + + if (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST) { + const struct nvc0_hw_sm_query_cfg **queries = + nvc0_hw_sm_get_queries(screen); + return queries[q->type - NVC0_HW_SM_QUERY(0)]; + } + debug_printf("invalid query type: %d\n", q->type); + return NULL; +} + +static void +nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +{ + struct nvc0_query *q = &hq->base; + q->funcs->destroy_query(nvc0, q); +} 
+ +static boolean +nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +{ + struct nvc0_screen *screen = nvc0->screen; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); + const struct nvc0_hw_sm_query_cfg *cfg; + unsigned i, c; + unsigned num_ab[2] = { 0, 0 }; + + cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq); + + /* check if we have enough free counter slots */ + for (i = 0; i < cfg->num_counters; ++i) + num_ab[cfg->ctr[i].sig_dom]++; + + if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 || + screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) { + NOUVEAU_ERR("Not enough free MP counter slots !\n"); + return false; + } + + assert(cfg->num_counters <= 4); + PUSH_SPACE(push, 4 * 8 * + 6); + + if (!screen->pm.mp_counters_enabled) { + screen->pm.mp_counters_enabled = true; + BEGIN_NVC0(push, SUBC_SW(0x06ac), 1); + PUSH_DATA (push, 0x1fcb); + } + + /* set sequence field to 0 (used to check if result is available) */ + for (i = 0; i < screen->mp_count; ++i) + hq->data[i * 10 + 10] = 0; + hq->sequence++; + + for (i = 0; i < cfg->num_counters; ++i) { + const unsigned d = cfg->ctr[i].sig_dom; + + if (!screen->pm.num_hw_sm_active[d]) { + uint32_t m = (1 << 22) | (1 << (7 + (8 * !d))); + if (screen->pm.num_hw_sm_active[!d]) + m |= 1 << (7 + (8 * d)); + BEGIN_NVC0(push, SUBC_SW(0x0600), 1); + PUSH_DATA (push, m); + } + screen->pm.num_hw_sm_active[d]++; + + for (c = d * 4; c < (d * 4 + 4); ++c) { + if (!screen->pm.mp_counter[c]) { + hsq->ctr[i] = c; + screen->pm.mp_counter[c] = hsq; + break; + } + } + assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */ + + /* configure and reset the counter(s) */ + if (d == 0) + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1); + else + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1); + PUSH_DATA (push, cfg->ctr[i].sig_sel); + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1); + PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 
3)); + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1); + PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1); + PUSH_DATA (push, 0); + } + return true; +} + +static boolean +nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +{ + struct nvc0_screen *screen = nvc0->screen; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); + const struct nvc0_hw_sm_query_cfg *cfg; + unsigned i, c; + + if (screen->base.class_3d >= NVE4_3D_CLASS) + return nve4_hw_sm_begin_query(nvc0, hq); + + cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq); + + /* check if we have enough free counter slots */ + if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) { + NOUVEAU_ERR("Not enough free MP counter slots !\n"); + return false; + } + + assert(cfg->num_counters <= 8); + PUSH_SPACE(push, 8 * 8 + 2); + + /* set sequence field to 0 (used to check if result is available) */ + for (i = 0; i < screen->mp_count; ++i) { + const unsigned b = (0x30 / 4) * i; + hq->data[b + 8] = 0; + } + hq->sequence++; + + for (i = 0; i < cfg->num_counters; ++i) { + uint32_t mask_sel = 0x00000000; + + if (!screen->pm.num_hw_sm_active[0]) { + BEGIN_NVC0(push, SUBC_SW(0x0600), 1); + PUSH_DATA (push, 0x80000000); + } + screen->pm.num_hw_sm_active[0]++; + + for (c = 0; c < 8; ++c) { + if (!screen->pm.mp_counter[c]) { + hsq->ctr[i] = c; + screen->pm.mp_counter[c] = hsq; + break; + } + } + + /* Oddly-enough, the signal id depends on the slot selected on Fermi but + * not on Kepler. Fortunately, the signal ids are just offseted by the + * slot id! 
*/ + mask_sel |= c; + mask_sel |= (c << 8); + mask_sel |= (c << 16); + mask_sel |= (c << 24); + mask_sel &= cfg->ctr[i].src_mask; + + /* configure and reset the counter(s) */ + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(c)), 1); + PUSH_DATA (push, cfg->ctr[i].sig_sel); + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(c)), 1); + PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel); + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 1); + PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(c)), 1); + PUSH_DATA (push, 0); + } + return true; +} + +static void +nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +{ + struct nvc0_screen *screen = nvc0->screen; + struct pipe_context *pipe = &nvc0->base.pipe; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; + struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); + uint32_t mask; + uint32_t input[3]; + const uint block[3] = { 32, is_nve4 ? 
4 : 1, 1 }; + const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 }; + unsigned c; + + if (unlikely(!screen->pm.prog)) { + struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program); + prog->type = PIPE_SHADER_COMPUTE; + prog->translated = true; + prog->num_gprs = 14; + prog->parm_size = 12; + if (is_nve4) { + prog->code = (uint32_t *)nve4_read_hw_sm_counters_code; + prog->code_size = sizeof(nve4_read_hw_sm_counters_code); + } else { + prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code; + prog->code_size = sizeof(nvc0_read_hw_sm_counters_code); + } + screen->pm.prog = prog; + } + + /* disable all counting */ + PUSH_SPACE(push, 8); + for (c = 0; c < 8; ++c) + if (screen->pm.mp_counter[c]) { + if (is_nve4) { + IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0); + } else { + IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0); + } + } + /* release counters for this query */ + for (c = 0; c < 8; ++c) { + if (screen->pm.mp_counter[c] == hsq) { + uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */ + screen->pm.num_hw_sm_active[d]--; + screen->pm.mp_counter[c] = NULL; + } + } + + BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR, + hq->bo); + + PUSH_SPACE(push, 1); + IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0); + + pipe->bind_compute_state(pipe, screen->pm.prog); + input[0] = (hq->bo->offset + hq->base_offset); + input[1] = (hq->bo->offset + hq->base_offset) >> 32; + input[2] = hq->sequence; + pipe->launch_grid(pipe, block, grid, 0, input); + + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY); + + /* re-activate other counters */ + PUSH_SPACE(push, 16); + mask = 0; + for (c = 0; c < 8; ++c) { + const struct nvc0_hw_sm_query_cfg *cfg; + unsigned i; + + hsq = screen->pm.mp_counter[c]; + if (!hsq) + continue; + + cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base); + for (i = 0; i < cfg->num_counters; ++i) { + if (mask & (1 << hsq->ctr[i])) + break; + mask |= 1 << hsq->ctr[i]; + if (is_nve4) { + BEGIN_NVC0(push, 
NVE4_COMPUTE(MP_PM_FUNC(hsq->ctr[i])), 1); + } else { + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hsq->ctr[i])), 1); + } + PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); + } + } +} + +static inline bool +nvc0_hw_sm_query_read_data(uint32_t count[32][8], + struct nvc0_context *nvc0, bool wait, + struct nvc0_hw_query *hq, + const struct nvc0_hw_sm_query_cfg *cfg, + unsigned mp_count) +{ + struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); + unsigned p, c; + + for (p = 0; p < mp_count; ++p) { + const unsigned b = (0x30 / 4) * p; + + for (c = 0; c < cfg->num_counters; ++c) { + if (hq->data[b + 8] != hq->sequence) { + if (!wait) + return false; + if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client)) + return false; + } + count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c); + } + } + return true; +} + +static inline bool +nve4_hw_sm_query_read_data(uint32_t count[32][8], + struct nvc0_context *nvc0, bool wait, + struct nvc0_hw_query *hq, + const struct nvc0_hw_sm_query_cfg *cfg, + unsigned mp_count) +{ + struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); + unsigned p, c, d; + + for (p = 0; p < mp_count; ++p) { + const unsigned b = (0x60 / 4) * p; + + for (c = 0; c < cfg->num_counters; ++c) { + count[p][c] = 0; + for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) { + if (hq->data[b + 20 + d] != hq->sequence) { + if (!wait) + return false; + if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client)) + return false; + } + if (hsq->ctr[c] & ~0x3) + count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)]; + else + count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]]; + } + } + } + return true; +} + +/* Metric calculations: + * sum(x) ... sum of x over all MPs + * avg(x) ... 
average of x over all MPs + * + * IPC : sum(inst_executed) / clock + * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued) + * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles) + * MP_EFFICIENCY : avg(active_cycles / clock) + * + * NOTE: Interpretation of IPC requires knowledge of MP count. + */ +static boolean +nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq, + boolean wait, union pipe_query_result *result) +{ + uint32_t count[32][8]; + uint64_t value = 0; + unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32); + unsigned p, c; + const struct nvc0_hw_sm_query_cfg *cfg; + bool ret; + + cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq); + + if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) + ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count); + else + ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count); + if (!ret) + return false; + + if (cfg->op == NVC0_COUNTER_OPn_SUM) { + for (c = 0; c < cfg->num_counters; ++c) + for (p = 0; p < mp_count; ++p) + value += count[p][c]; + value = (value * cfg->norm[0]) / cfg->norm[1]; + } else + if (cfg->op == NVC0_COUNTER_OPn_OR) { + uint32_t v = 0; + for (c = 0; c < cfg->num_counters; ++c) + for (p = 0; p < mp_count; ++p) + v |= count[p][c]; + value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1]; + } else + if (cfg->op == NVC0_COUNTER_OPn_AND) { + uint32_t v = ~0; + for (c = 0; c < cfg->num_counters; ++c) + for (p = 0; p < mp_count; ++p) + v &= count[p][c]; + value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1]; + } else + if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) { + uint64_t v[2] = { 0, 0 }; + for (p = 0; p < mp_count; ++p) { + v[0] += count[p][0]; + v[1] += count[p][1]; + } + if (v[0]) + value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]); + } else + if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) { + for (p = 0; p < mp_count; ++p) + value += count[p][0]; + if (count[0][1]) + value = (value * cfg->norm[0]) / (count[0][1] 
* cfg->norm[1]); + else + value = 0; + } else + if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) { + unsigned mp_used = 0; + for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0]) + if (count[p][1]) + value += (count[p][0] * cfg->norm[0]) / count[p][1]; + if (mp_used) + value /= (uint64_t)mp_used * cfg->norm[1]; + } else + if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) { + unsigned mp_used = 0; + for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0]) + value += count[p][0]; + if (count[0][1] && mp_used) { + value *= cfg->norm[0]; + value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1]; + } else { + value = 0; + } + } + + *(uint64_t *)result = value; + return true; +} + +static const struct nvc0_hw_query_funcs hw_sm_query_funcs = { + .destroy_query = nvc0_hw_sm_destroy_query, + .begin_query = nvc0_hw_sm_begin_query, + .end_query = nvc0_hw_sm_end_query, + .get_query_result = nvc0_hw_sm_get_query_result, +}; + +struct nvc0_hw_query * +nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) +{ + struct nvc0_screen *screen = nvc0->screen; + struct nvc0_hw_sm_query *hsq; + struct nvc0_hw_query *hq; + unsigned space; + + if (nvc0->screen->base.device->drm_version < 0x01000101) + return NULL; + + if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) && + (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST)) + return NULL; + + hsq = CALLOC_STRUCT(nvc0_hw_sm_query); + if (!hsq) + return NULL; + + hq = &hsq->base; + hq->funcs = &hw_sm_query_funcs; + hq->base.type = type; + + if (screen->base.class_3d >= NVE4_3D_CLASS) { + /* for each MP: + * [00] = WS0.C0 + * [04] = WS0.C1 + * [08] = WS0.C2 + * [0c] = WS0.C3 + * [24] = WS2.C1 + * [28] = WS2.C2 + * [2c] = WS2.C3 + * [30] = WS3.C0 + * [34] = WS3.C1 + * [38] = WS3.C2 + * [3c] = WS3.C3 + * [40] = MP.C4 + * [44] = MP.C5 + * [48] = MP.C6 + * [4c] = MP.C7 + * [50] = WS0.sequence + * [54] = WS1.sequence + * [58] = WS2.sequence + * [5c] = WS3.sequence + */ + space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * 
sizeof(uint32_t); + } else { + /* + * Note that padding is used to align memory access to 128 bits. + * + * for each MP: + * [00] = MP.C0 + * [04] = MP.C1 + * [08] = MP.C2 + * [0c] = MP.C3 + * [10] = MP.C4 + * [14] = MP.C5 + * [18] = MP.C6 + * [1c] = MP.C7 + * [20] = MP.sequence + * [24] = padding + * [28] = padding + * [2c] = padding + */ + space = (8 + 1 + 3) * nvc0->screen->mp_count * sizeof(uint32_t); + } + + if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) { + FREE(hq); + return NULL; + } + + return hq; +} + +static int +nvc0_hw_sm_get_next_query_id(const struct nvc0_hw_sm_query_cfg **queries, + unsigned id) +{ + unsigned i, next = 0; + + for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) { + if (!queries[i]) { + next++; + } else + if (i >= id && queries[id + next]) { + break; + } + } + return id + next; +} + +int +nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, + struct pipe_driver_query_info *info) +{ + int count = 0; + + if (screen->base.device->drm_version >= 0x01000101) { + if (screen->compute) { + if (screen->base.class_3d == NVE4_3D_CLASS) { + count += NVE4_HW_SM_QUERY_COUNT; + } else + if (screen->base.class_3d < NVE4_3D_CLASS) { + const struct nvc0_hw_sm_query_cfg **queries = + nvc0_hw_sm_get_queries(screen); + unsigned i; + + for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) { + if (queries[i]) + count++; + } + } + } + } + + if (!info) + return count; + + if (id < count) { + if (screen->compute) { + if (screen->base.class_3d == NVE4_3D_CLASS) { + info->name = nve4_hw_sm_query_names[id]; + info->query_type = NVE4_HW_SM_QUERY(id); + info->max_value.u64 = + (id < NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY) ? 
0 : 100; + info->group_id = NVC0_HW_SM_QUERY_GROUP; + return 1; + } else + if (screen->base.class_3d < NVE4_3D_CLASS) { + const struct nvc0_hw_sm_query_cfg **queries = + nvc0_hw_sm_get_queries(screen); + + id = nvc0_hw_sm_get_next_query_id(queries, id); + info->name = nvc0_hw_sm_query_names[id]; + info->query_type = NVC0_HW_SM_QUERY(id); + info->group_id = NVC0_HW_SM_QUERY_GROUP; + return 1; + } + } + } + return 0; +} diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h new file mode 100644 index 00000000000..26bde0c3e0d --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h @@ -0,0 +1,120 @@ +#ifndef __NVC0_QUERY_HW_SM_H__ +#define __NVC0_QUERY_HW_SM_H__ + +#include "nvc0_query_hw.h" + +struct nvc0_hw_sm_query { + struct nvc0_hw_query base; + uint8_t ctr[8]; +}; + +static inline struct nvc0_hw_sm_query * +nvc0_hw_sm_query(struct nvc0_hw_query *hq) +{ + return (struct nvc0_hw_sm_query *)hq; +} + +/* + * Performance counter queries: + */ +#define NVE4_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) +#define NVE4_HW_SM_QUERY_LAST NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT - 1) +enum nve4_hw_sm_queries +{ + NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0, + NVE4_HW_SM_QUERY_ACTIVE_WARPS, + NVE4_HW_SM_QUERY_ATOM_COUNT, + NVE4_HW_SM_QUERY_BRANCH, + NVE4_HW_SM_QUERY_DIVERGENT_BRANCH, + NVE4_HW_SM_QUERY_GLD_REQUEST, + NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, + NVE4_HW_SM_QUERY_GST_TRANSACTIONS, + NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY, + NVE4_HW_SM_QUERY_GRED_COUNT, + NVE4_HW_SM_QUERY_GST_REQUEST, + NVE4_HW_SM_QUERY_INST_EXECUTED, + NVE4_HW_SM_QUERY_INST_ISSUED, + NVE4_HW_SM_QUERY_INST_ISSUED1, + NVE4_HW_SM_QUERY_INST_ISSUED2, + NVE4_HW_SM_QUERY_L1_GLD_HIT, + NVE4_HW_SM_QUERY_L1_GLD_MISS, + NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT, + NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS, + NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT, + NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS, + NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, + 
NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, + NVE4_HW_SM_QUERY_LOCAL_LD, + NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, + NVE4_HW_SM_QUERY_LOCAL_ST, + NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, + NVE4_HW_SM_QUERY_PROF_TRIGGER_0, + NVE4_HW_SM_QUERY_PROF_TRIGGER_1, + NVE4_HW_SM_QUERY_PROF_TRIGGER_2, + NVE4_HW_SM_QUERY_PROF_TRIGGER_3, + NVE4_HW_SM_QUERY_PROF_TRIGGER_4, + NVE4_HW_SM_QUERY_PROF_TRIGGER_5, + NVE4_HW_SM_QUERY_PROF_TRIGGER_6, + NVE4_HW_SM_QUERY_PROF_TRIGGER_7, + NVE4_HW_SM_QUERY_SHARED_LD, + NVE4_HW_SM_QUERY_SHARED_LD_REPLAY, + NVE4_HW_SM_QUERY_SHARED_ST, + NVE4_HW_SM_QUERY_SHARED_ST_REPLAY, + NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED, + NVE4_HW_SM_QUERY_THREADS_LAUNCHED, + NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, + NVE4_HW_SM_QUERY_WARPS_LAUNCHED, + NVE4_HW_SM_QUERY_METRIC_IPC, + NVE4_HW_SM_QUERY_METRIC_IPAC, + NVE4_HW_SM_QUERY_METRIC_IPEC, + NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY, + NVE4_HW_SM_QUERY_METRIC_MP_EFFICIENCY, + NVE4_HW_SM_QUERY_METRIC_INST_REPLAY_OHEAD, + NVE4_HW_SM_QUERY_COUNT +}; + +#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) +#define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1) +enum nvc0_hw_sm_queries +{ + NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0, + NVC0_HW_SM_QUERY_ACTIVE_WARPS, + NVC0_HW_SM_QUERY_ATOM_COUNT, + NVC0_HW_SM_QUERY_BRANCH, + NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, + NVC0_HW_SM_QUERY_GLD_REQUEST, + NVC0_HW_SM_QUERY_GRED_COUNT, + NVC0_HW_SM_QUERY_GST_REQUEST, + NVC0_HW_SM_QUERY_INST_EXECUTED, + NVC0_HW_SM_QUERY_INST_ISSUED, + NVC0_HW_SM_QUERY_INST_ISSUED1_0, + NVC0_HW_SM_QUERY_INST_ISSUED1_1, + NVC0_HW_SM_QUERY_INST_ISSUED2_0, + NVC0_HW_SM_QUERY_INST_ISSUED2_1, + NVC0_HW_SM_QUERY_LOCAL_LD, + NVC0_HW_SM_QUERY_LOCAL_ST, + NVC0_HW_SM_QUERY_PROF_TRIGGER_0, + NVC0_HW_SM_QUERY_PROF_TRIGGER_1, + NVC0_HW_SM_QUERY_PROF_TRIGGER_2, + NVC0_HW_SM_QUERY_PROF_TRIGGER_3, + NVC0_HW_SM_QUERY_PROF_TRIGGER_4, + NVC0_HW_SM_QUERY_PROF_TRIGGER_5, + NVC0_HW_SM_QUERY_PROF_TRIGGER_6, + NVC0_HW_SM_QUERY_PROF_TRIGGER_7, + 
NVC0_HW_SM_QUERY_SHARED_LD, + NVC0_HW_SM_QUERY_SHARED_ST, + NVC0_HW_SM_QUERY_THREADS_LAUNCHED, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, + NVC0_HW_SM_QUERY_WARPS_LAUNCHED, + NVC0_HW_SM_QUERY_COUNT +}; + +struct nvc0_hw_query * +nvc0_hw_sm_create_query(struct nvc0_context *, unsigned); +int +nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *, unsigned, + struct pipe_driver_query_info *); +#endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.c new file mode 100644 index 00000000000..cd24618d564 --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.c @@ -0,0 +1,162 @@ +/* + * Copyright 2011 Christoph Bumiller + * Copyright 2015 Samuel Pitoiset + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "nvc0/nvc0_context.h" + +#include "nvc0_query_sw.h" + +/* === DRIVER STATISTICS === */ + +#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS + +static const char *nvc0_sw_query_drv_stat_names[] = +{ + "drv-tex_obj_current_count", + "drv-tex_obj_current_bytes", + "drv-buf_obj_current_count", + "drv-buf_obj_current_bytes_vid", + "drv-buf_obj_current_bytes_sys", + "drv-tex_transfers_rd", + "drv-tex_transfers_wr", + "drv-tex_copy_count", + "drv-tex_blit_count", + "drv-tex_cache_flush_count", + "drv-buf_transfers_rd", + "drv-buf_transfers_wr", + "drv-buf_read_bytes_staging_vid", + "drv-buf_write_bytes_direct", + "drv-buf_write_bytes_staging_vid", + "drv-buf_write_bytes_staging_sys", + "drv-buf_copy_bytes", + "drv-buf_non_kernel_fence_sync_count", + "drv-any_non_kernel_fence_sync_count", + "drv-query_sync_count", + "drv-gpu_serialize_count", + "drv-draw_calls_array", + "drv-draw_calls_indexed", + "drv-draw_calls_fallback_count", + "drv-user_buffer_upload_bytes", + "drv-constbuf_upload_count", + "drv-constbuf_upload_bytes", + "drv-pushbuf_count", + "drv-resource_validate_count" +}; + +#endif /* NOUVEAU_ENABLE_DRIVER_STATISTICS */ + +static void +nvc0_sw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nvc0_sw_query *sq = nvc0_sw_query(q); + FREE(sq); +} + +static boolean +nvc0_sw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q) +{ +#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS + struct nvc0_sw_query *sq = nvc0_sw_query(q); + + if (q->index >= 5) { + sq->value = nvc0->screen->base.stats.v[q->index]; + } else { + sq->value = 0; + } +#endif + return true; +} + +static void +nvc0_sw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q) +{ +#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS + struct nvc0_sw_query *sq = nvc0_sw_query(q); + sq->value = nvc0->screen->base.stats.v[q->index] - sq->value; +#endif +} + +static boolean +nvc0_sw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, + boolean wait, union pipe_query_result 
*result) +{ +#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS + struct nvc0_sw_query *sq = nvc0_sw_query(q); + uint64_t *res64 = (uint64_t *)result; + + res64[0] = sq->value; +#endif + return true; +} + +static const struct nvc0_query_funcs sw_query_funcs = { + .destroy_query = nvc0_sw_destroy_query, + .begin_query = nvc0_sw_begin_query, + .end_query = nvc0_sw_end_query, + .get_query_result = nvc0_sw_get_query_result, +}; + +struct nvc0_query * +nvc0_sw_create_query(struct nvc0_context *nvcO, unsigned type, unsigned index) +{ + struct nvc0_sw_query *sq; + struct nvc0_query *q; + + if (type < NVC0_SW_QUERY_DRV_STAT(0) || type > NVC0_SW_QUERY_DRV_STAT_LAST) + return NULL; + + sq = CALLOC_STRUCT(nvc0_sw_query); + if (!sq) + return NULL; + + q = &sq->base; + q->funcs = &sw_query_funcs; + q->type = type; + q->index = type - NVC0_SW_QUERY_DRV_STAT(0); + + return q; +} + +int +nvc0_sw_get_driver_query_info(struct nvc0_screen *screen, unsigned id, + struct pipe_driver_query_info *info) +{ + int count = 0; + + count += NVC0_SW_QUERY_DRV_STAT_COUNT; + if (!info) + return count; + +#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS + if (id < count) { + info->name = nvc0_sw_query_drv_stat_names[id]; + info->query_type = NVC0_SW_QUERY_DRV_STAT(id); + info->type = PIPE_DRIVER_QUERY_TYPE_UINT64; + info->max_value.u64 = 0; + if (strstr(info->name, "bytes")) + info->type = PIPE_DRIVER_QUERY_TYPE_BYTES; + info->group_id = NVC0_SW_QUERY_DRV_STAT_GROUP; + return 1; + } +#endif + return 0; +} diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.h new file mode 100644 index 00000000000..eaa890e4fc0 --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.h @@ -0,0 +1,64 @@ +#ifndef __NVC0_QUERY_SW_H__ +#define __NVC0_QUERY_SW_H__ + +#include "nvc0_query.h" + +struct nvc0_sw_query { + struct nvc0_query base; + uint64_t value; +}; + +static inline struct nvc0_sw_query * +nvc0_sw_query(struct nvc0_query *q) +{ + return (struct nvc0_sw_query 
*)q; +} + +/* + * Driver statistics queries: + */ +#define NVC0_SW_QUERY_DRV_STAT(i) (PIPE_QUERY_DRIVER_SPECIFIC + 1024 + (i)) +#define NVC0_SW_QUERY_DRV_STAT_LAST NVC0_SW_QUERY_DRV_STAT(NVC0_SW_QUERY_DRV_STAT_COUNT - 1) +enum nvc0_sw_query_drv_stat +{ +#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS + NVC0_SW_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_COUNT = 0, + NVC0_SW_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_BYTES, + NVC0_SW_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_COUNT, + NVC0_SW_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_VID, + NVC0_SW_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_SYS, + NVC0_SW_QUERY_DRV_STAT_TEX_TRANSFERS_READ, + NVC0_SW_QUERY_DRV_STAT_TEX_TRANSFERS_WRITE, + NVC0_SW_QUERY_DRV_STAT_TEX_COPY_COUNT, + NVC0_SW_QUERY_DRV_STAT_TEX_BLIT_COUNT, + NVC0_SW_QUERY_DRV_STAT_TEX_CACHE_FLUSH_COUNT, + NVC0_SW_QUERY_DRV_STAT_BUF_TRANSFERS_READ, + NVC0_SW_QUERY_DRV_STAT_BUF_TRANSFERS_WRITE, + NVC0_SW_QUERY_DRV_STAT_BUF_READ_BYTES_STAGING_VID, + NVC0_SW_QUERY_DRV_STAT_BUF_WRITE_BYTES_DIRECT, + NVC0_SW_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_VID, + NVC0_SW_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_SYS, + NVC0_SW_QUERY_DRV_STAT_BUF_COPY_BYTES, + NVC0_SW_QUERY_DRV_STAT_BUF_NON_KERNEL_FENCE_SYNC_COUNT, + NVC0_SW_QUERY_DRV_STAT_ANY_NON_KERNEL_FENCE_SYNC_COUNT, + NVC0_SW_QUERY_DRV_STAT_QUERY_SYNC_COUNT, + NVC0_SW_QUERY_DRV_STAT_GPU_SERIALIZE_COUNT, + NVC0_SW_QUERY_DRV_STAT_DRAW_CALLS_ARRAY, + NVC0_SW_QUERY_DRV_STAT_DRAW_CALLS_INDEXED, + NVC0_SW_QUERY_DRV_STAT_DRAW_CALLS_FALLBACK_COUNT, + NVC0_SW_QUERY_DRV_STAT_USER_BUFFER_UPLOAD_BYTES, + NVC0_SW_QUERY_DRV_STAT_CONSTBUF_UPLOAD_COUNT, + NVC0_SW_QUERY_DRV_STAT_CONSTBUF_UPLOAD_BYTES, + NVC0_SW_QUERY_DRV_STAT_PUSHBUF_COUNT, + NVC0_SW_QUERY_DRV_STAT_RESOURCE_VALIDATE_COUNT, +#endif + NVC0_SW_QUERY_DRV_STAT_COUNT +}; + +struct nvc0_query * +nvc0_sw_create_query(struct nvc0_context *, unsigned, unsigned); +int +nvc0_sw_get_driver_query_info(struct nvc0_screen *, unsigned, + struct pipe_driver_query_info *); + +#endif diff --git 
a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index afd91e6feee..f34ad0ed5d1 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -561,12 +561,7 @@ nvc0_screen_init_compute(struct nvc0_screen *screen) switch (screen->base.device->chipset & ~0xf) { case 0xc0: case 0xd0: - /* Using COMPUTE has weird effects on 3D state, we need to - * investigate this further before enabling it by default. - */ - if (debug_get_bool_option("NVC0_COMPUTE", false)) - return nvc0_screen_compute_setup(screen, screen->base.pushbuf); - return 0; + return nvc0_screen_compute_setup(screen, screen->base.pushbuf); case 0xe0: return nve4_screen_compute_setup(screen, screen->base.pushbuf); case 0xf0: @@ -914,6 +909,7 @@ nvc0_screen_create(struct nouveau_device *dev) else value = (16 << 8) | 4; } + screen->gpc_count = value & 0x000000ff; screen->mp_count = value >> 8; screen->mp_count_compute = screen->mp_count; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h index f57a316f01e..857eb0316c7 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h @@ -67,6 +67,7 @@ struct nvc0_screen { struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */ struct nouveau_bo *poly_cache; + uint8_t gpc_count; uint16_t mp_count; uint16_t mp_count_compute; /* magic reg can make compute use fewer MPs */ @@ -94,7 +95,7 @@ struct nvc0_screen { struct { struct nvc0_program *prog; /* compute state object to read MP counters */ - struct pipe_query *mp_counter[8]; /* counter to query allocation */ + struct nvc0_hw_sm_query *mp_counter[8]; /* counter to query allocation */ uint8_t num_hw_sm_active[2]; bool mp_counters_enabled; } pm; @@ -112,148 +113,6 @@ nvc0_screen(struct pipe_screen *screen) return (struct nvc0_screen *)screen; } -/* - * Performance counters groups: - */ -#define 
NVC0_QUERY_MP_COUNTER_GROUP 0 -#define NVC0_QUERY_DRV_STAT_GROUP 1 - -/* Performance counter queries: - */ -#define NVE4_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) -#define NVE4_HW_SM_QUERY_LAST NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT - 1) -enum nve4_pm_queries -{ - NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0, - NVE4_HW_SM_QUERY_ACTIVE_WARPS, - NVE4_HW_SM_QUERY_ATOM_COUNT, - NVE4_HW_SM_QUERY_BRANCH, - NVE4_HW_SM_QUERY_DIVERGENT_BRANCH, - NVE4_HW_SM_QUERY_GLD_REQUEST, - NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GST_TRANSACTIONS, - NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GRED_COUNT, - NVE4_HW_SM_QUERY_GST_REQUEST, - NVE4_HW_SM_QUERY_INST_EXECUTED, - NVE4_HW_SM_QUERY_INST_ISSUED, - NVE4_HW_SM_QUERY_INST_ISSUED1, - NVE4_HW_SM_QUERY_INST_ISSUED2, - NVE4_HW_SM_QUERY_L1_GLD_HIT, - NVE4_HW_SM_QUERY_L1_GLD_MISS, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS, - NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_LD, - NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_ST, - NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_PROF_TRIGGER_0, - NVE4_HW_SM_QUERY_PROF_TRIGGER_1, - NVE4_HW_SM_QUERY_PROF_TRIGGER_2, - NVE4_HW_SM_QUERY_PROF_TRIGGER_3, - NVE4_HW_SM_QUERY_PROF_TRIGGER_4, - NVE4_HW_SM_QUERY_PROF_TRIGGER_5, - NVE4_HW_SM_QUERY_PROF_TRIGGER_6, - NVE4_HW_SM_QUERY_PROF_TRIGGER_7, - NVE4_HW_SM_QUERY_SHARED_LD, - NVE4_HW_SM_QUERY_SHARED_LD_REPLAY, - NVE4_HW_SM_QUERY_SHARED_ST, - NVE4_HW_SM_QUERY_SHARED_ST_REPLAY, - NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED, - NVE4_HW_SM_QUERY_THREADS_LAUNCHED, - NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, - NVE4_HW_SM_QUERY_WARPS_LAUNCHED, - NVE4_HW_SM_QUERY_METRIC_IPC, - NVE4_HW_SM_QUERY_METRIC_IPAC, - NVE4_HW_SM_QUERY_METRIC_IPEC, - NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY, - NVE4_HW_SM_QUERY_METRIC_MP_EFFICIENCY, - 
NVE4_HW_SM_QUERY_METRIC_INST_REPLAY_OHEAD, - NVE4_HW_SM_QUERY_COUNT -}; - -#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) -#define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1) -enum nvc0_pm_queries -{ - NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0, - NVC0_HW_SM_QUERY_ACTIVE_WARPS, - NVC0_HW_SM_QUERY_ATOM_COUNT, - NVC0_HW_SM_QUERY_BRANCH, - NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, - NVC0_HW_SM_QUERY_GLD_REQUEST, - NVC0_HW_SM_QUERY_GRED_COUNT, - NVC0_HW_SM_QUERY_GST_REQUEST, - NVC0_HW_SM_QUERY_INST_EXECUTED, - NVC0_HW_SM_QUERY_INST_ISSUED1_0, - NVC0_HW_SM_QUERY_INST_ISSUED1_1, - NVC0_HW_SM_QUERY_INST_ISSUED2_0, - NVC0_HW_SM_QUERY_INST_ISSUED2_1, - NVC0_HW_SM_QUERY_LOCAL_LD, - NVC0_HW_SM_QUERY_LOCAL_ST, - NVC0_HW_SM_QUERY_PROF_TRIGGER_0, - NVC0_HW_SM_QUERY_PROF_TRIGGER_1, - NVC0_HW_SM_QUERY_PROF_TRIGGER_2, - NVC0_HW_SM_QUERY_PROF_TRIGGER_3, - NVC0_HW_SM_QUERY_PROF_TRIGGER_4, - NVC0_HW_SM_QUERY_PROF_TRIGGER_5, - NVC0_HW_SM_QUERY_PROF_TRIGGER_6, - NVC0_HW_SM_QUERY_PROF_TRIGGER_7, - NVC0_HW_SM_QUERY_SHARED_LD, - NVC0_HW_SM_QUERY_SHARED_ST, - NVC0_HW_SM_QUERY_THREADS_LAUNCHED, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, - NVC0_HW_SM_QUERY_WARPS_LAUNCHED, - NVC0_HW_SM_QUERY_COUNT -}; - -/* Driver statistics queries: - */ -#define NVC0_QUERY_DRV_STAT(i) (PIPE_QUERY_DRIVER_SPECIFIC + 1024 + (i)) -#define NVC0_QUERY_DRV_STAT_LAST NVC0_QUERY_DRV_STAT(NVC0_QUERY_DRV_STAT_COUNT - 1) -enum nvc0_drv_stats_queries -{ -#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_COUNT = 0, - NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_BYTES, - NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_COUNT, - NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_VID, - NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_SYS, - NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_READ, - NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_WRITE, - NVC0_QUERY_DRV_STAT_TEX_COPY_COUNT, - 
NVC0_QUERY_DRV_STAT_TEX_BLIT_COUNT, - NVC0_QUERY_DRV_STAT_TEX_CACHE_FLUSH_COUNT, - NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_READ, - NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_WRITE, - NVC0_QUERY_DRV_STAT_BUF_READ_BYTES_STAGING_VID, - NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_DIRECT, - NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_VID, - NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_SYS, - NVC0_QUERY_DRV_STAT_BUF_COPY_BYTES, - NVC0_QUERY_DRV_STAT_BUF_NON_KERNEL_FENCE_SYNC_COUNT, - NVC0_QUERY_DRV_STAT_ANY_NON_KERNEL_FENCE_SYNC_COUNT, - NVC0_QUERY_DRV_STAT_QUERY_SYNC_COUNT, - NVC0_QUERY_DRV_STAT_GPU_SERIALIZE_COUNT, - NVC0_QUERY_DRV_STAT_DRAW_CALLS_ARRAY, - NVC0_QUERY_DRV_STAT_DRAW_CALLS_INDEXED, - NVC0_QUERY_DRV_STAT_DRAW_CALLS_FALLBACK_COUNT, - NVC0_QUERY_DRV_STAT_USER_BUFFER_UPLOAD_BYTES, - NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_COUNT, - NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_BYTES, - NVC0_QUERY_DRV_STAT_PUSHBUF_COUNT, - NVC0_QUERY_DRV_STAT_RESOURCE_VALIDATE_COUNT, -#endif - NVC0_QUERY_DRV_STAT_COUNT -}; - int nvc0_screen_get_driver_query_info(struct pipe_screen *, unsigned, struct pipe_driver_query_info *); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c index 8f8ac2d34b9..af837fc4a33 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c @@ -26,6 +26,7 @@ #include "util/u_inlines.h" #include "nvc0/nvc0_context.h" +#include "nvc0/nvc0_query_hw.h" static inline void nvc0_program_update_context_state(struct nvc0_context *nvc0, @@ -272,14 +273,14 @@ nvc0_tfb_validate(struct nvc0_context *nvc0) continue; if (!targ->clean) - nvc0_query_fifo_wait(push, targ->pq); + nvc0_hw_query_fifo_wait(push, nvc0_query(targ->pq)); BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5); PUSH_DATA (push, 1); PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset); PUSH_DATA (push, buf->address + targ->pipe.buffer_offset); PUSH_DATA (push, targ->pipe.buffer_size); if 
(!targ->clean) { - nvc0_query_pushbuf_submit(push, targ->pq, 0x4); + nvc0_hw_query_pushbuf_submit(push, nvc0_query(targ->pq), 0x4); } else { PUSH_DATA(push, 0); /* TFB_BUFFER_OFFSET */ targ->clean = false; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index c5bfd03956d..742bef39247 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -29,6 +29,7 @@ #include "nvc0/nvc0_stateobj.h" #include "nvc0/nvc0_context.h" +#include "nvc0/nvc0_query_hw.h" #include "nvc0/nvc0_3d.xml.h" #include "nv50/nv50_texture.xml.h" @@ -1070,7 +1071,7 @@ nvc0_so_target_create(struct pipe_context *pipe, if (!targ) return NULL; - targ->pq = pipe->create_query(pipe, NVC0_QUERY_TFB_BUFFER_OFFSET, 0); + targ->pq = pipe->create_query(pipe, NVC0_HW_QUERY_TFB_BUFFER_OFFSET, 0); if (!targ->pq) { FREE(targ); return NULL; @@ -1091,6 +1092,25 @@ nvc0_so_target_create(struct pipe_context *pipe, } static void +nvc0_so_target_save_offset(struct pipe_context *pipe, + struct pipe_stream_output_target *ptarg, + unsigned index, bool *serialize) +{ + struct nvc0_so_target *targ = nvc0_so_target(ptarg); + + if (*serialize) { + *serialize = false; + PUSH_SPACE(nvc0_context(pipe)->base.pushbuf, 1); + IMMED_NVC0(nvc0_context(pipe)->base.pushbuf, NVC0_3D(SERIALIZE), 0); + + NOUVEAU_DRV_STAT(nouveau_screen(pipe->screen), gpu_serialize_count, 1); + } + + nvc0_query(targ->pq)->index = index; + pipe->end_query(pipe, targ->pq); +} + +static void nvc0_so_target_destroy(struct pipe_context *pipe, struct pipe_stream_output_target *ptarg) { diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c index aaec60a5ac2..d459dd61c19 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c @@ -188,14 +188,10 @@ nvc0_m2mf_push_linear(struct nouveau_context *nv, nouveau_pushbuf_validate(push); while 
(count) { - unsigned nr; + unsigned nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN); - if (!PUSH_SPACE(push, 16)) + if (!PUSH_SPACE(push, nr + 9)) break; - nr = PUSH_AVAIL(push); - assert(nr >= 16); - nr = MIN2(count, nr - 9); - nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN); BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2); PUSH_DATAh(push, dst->offset + offset); @@ -234,14 +230,10 @@ nve4_p2mf_push_linear(struct nouveau_context *nv, nouveau_pushbuf_validate(push); while (count) { - unsigned nr; + unsigned nr = MIN2(count, (NV04_PFIFO_MAX_PACKET_LEN - 1)); - if (!PUSH_SPACE(push, 16)) + if (!PUSH_SPACE(push, nr + 10)) break; - nr = PUSH_AVAIL(push); - assert(nr >= 16); - nr = MIN2(count, nr - 8); - nr = MIN2(nr, (NV04_PFIFO_MAX_PACKET_LEN - 1)); BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_DST_ADDRESS_HIGH), 2); PUSH_DATAh(push, dst->offset + offset); @@ -571,9 +563,7 @@ nvc0_cb_bo_push(struct nouveau_context *nv, PUSH_DATA (push, bo->offset + base); while (words) { - unsigned nr = PUSH_AVAIL(push); - nr = MIN2(nr, words); - nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN - 1); + unsigned nr = MIN2(words, NV04_PFIFO_MAX_PACKET_LEN - 1); PUSH_SPACE(push, nr + 2); PUSH_REFN (push, bo, NOUVEAU_BO_WR | domain); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index 188c7d7cdc8..c464904d6d4 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -29,6 +29,7 @@ #include "translate/translate.h" #include "nvc0/nvc0_context.h" +#include "nvc0/nvc0_query_hw.h" #include "nvc0/nvc0_resource.h" #include "nvc0/nvc0_3d.xml.h" @@ -775,7 +776,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0, res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING; PUSH_SPACE(push, 2); IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0); - nvc0_query_fifo_wait(push, so->pq); + nvc0_hw_query_fifo_wait(push, nvc0_query(so->pq)); if (nvc0->screen->eng3d->oclass < GM107_3D_CLASS) IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0); @@ 
-791,7 +792,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0, BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_STRIDE), 1); PUSH_DATA (push, so->stride); BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_BYTES), 1); - nvc0_query_pushbuf_submit(push, so->pq, 0x4); + nvc0_hw_query_pushbuf_submit(push, nvc0_query(so->pq), 0x4); IMMED_NVC0(push, NVC0_3D(VERTEX_END_GL), 0); mode |= NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT; diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index efb4889e562..32ce76a9e07 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -305,7 +305,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_GLSL_FEATURE_LEVEL: if (family >= CHIP_CEDAR) - return 330; + return 410; /* pre-evergreen geom shaders need newer kernel */ if (rscreen->b.info.drm_minor >= 37) return 330; diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 1d905822cde..8efe902a329 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -166,8 +166,6 @@ int r600_pipe_shader_create(struct pipe_context *ctx, if (rctx->b.chip_class <= R700) { use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY); } - /* disable SB for shaders using CF_INDEX_0/1 (sampler/ubo array indexing) as it doesn't handle those currently */ - use_sb &= !shader->shader.uses_index_registers; /* disable SB for shaders using doubles */ use_sb &= !shader->shader.uses_doubles; @@ -1250,9 +1248,6 @@ static int tgsi_split_constant(struct r600_shader_ctx *ctx) continue; } - if (ctx->src[i].kc_rel) - ctx->shader->uses_index_registers = true; - if (ctx->src[i].rel) { int chan = inst->Src[i].Indirect.Swizzle; int treg = r600_get_temp(ctx); @@ -1912,7 +1907,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, shader->uses_doubles = ctx.info.uses_doubles; - indirect_gprs = ctx.info.indirect_files & ~(1 << 
TGSI_FILE_CONSTANT); + indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER)); tgsi_parse_init(&ctx.parse, tokens); ctx.type = ctx.info.processor; shader->processor_type = ctx.type; @@ -1936,7 +1931,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.gs_next_vertex = 0; ctx.gs_stream_output_info = &so; - shader->uses_index_registers = false; ctx.face_gpr = -1; ctx.fixed_pt_position_gpr = -1; ctx.fragcoord_input = -1; @@ -5703,8 +5697,6 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) sampler_src_reg = 3; sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE - if (sampler_index_mode) - ctx->shader->uses_index_registers = true; src_gpr = tgsi_tex_get_src_gpr(ctx, 0); diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h index 48de9cdb156..c240e7110c1 100644 --- a/src/gallium/drivers/r600/r600_shader.h +++ b/src/gallium/drivers/r600/r600_shader.h @@ -75,8 +75,6 @@ struct r600_shader { boolean has_txq_cube_array_z_comp; boolean uses_tex_buffers; boolean gs_prim_id_input; - /* Temporarily workaround SB not handling CF_INDEX_[01] index registers */ - boolean uses_index_registers; /* Size in bytes of a data item in the ring(s) (single vertex data). Stages with only one ring items 123 will be set to 0. 
*/ diff --git a/src/gallium/drivers/r600/r600_uvd.c b/src/gallium/drivers/r600/r600_uvd.c index 357e9017a65..e2e9033ea2c 100644 --- a/src/gallium/drivers/r600/r600_uvd.c +++ b/src/gallium/drivers/r600/r600_uvd.c @@ -47,8 +47,11 @@ #include "r600_pipe.h" #include "radeon/radeon_video.h" #include "radeon/radeon_uvd.h" +#include "radeon/radeon_vce.h" #include "r600d.h" +#define R600_UVD_ENABLE_TILING 0 + /** * creates an video buffer with an UVD compatible memory layout */ @@ -77,7 +80,7 @@ struct pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe, template.height = align(tmpl->height / array_size, VL_MACROBLOCK_HEIGHT); vl_video_buffer_template(&templ, &template, resource_formats[0], 1, array_size, PIPE_USAGE_DEFAULT, 0); - if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced) + if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced || !R600_UVD_ENABLE_TILING) templ.bind = PIPE_BIND_LINEAR; resources[0] = (struct r600_texture *) pipe->screen->resource_create(pipe->screen, &templ); @@ -86,7 +89,7 @@ struct pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe, if (resource_formats[1] != PIPE_FORMAT_NONE) { vl_video_buffer_template(&templ, &template, resource_formats[1], 1, array_size, PIPE_USAGE_DEFAULT, 1); - if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced) + if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced || !R600_UVD_ENABLE_TILING) templ.bind = PIPE_BIND_LINEAR; resources[1] = (struct r600_texture *) pipe->screen->resource_create(pipe->screen, &templ); @@ -96,7 +99,7 @@ struct pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe, if (resource_formats[2] != PIPE_FORMAT_NONE) { vl_video_buffer_template(&templ, &template, resource_formats[2], 1, array_size, PIPE_USAGE_DEFAULT, 2); - if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced) + if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced || !R600_UVD_ENABLE_TILING) templ.bind = PIPE_BIND_LINEAR; resources[2] = (struct r600_texture *) 
pipe->screen->resource_create(pipe->screen, &templ); @@ -166,9 +169,28 @@ static struct radeon_winsys_cs_handle* r600_uvd_set_dtb(struct ruvd_msg *msg, st return luma->resource.cs_buf; } +/* get the radeon resources for VCE */ +static void r600_vce_get_buffer(struct pipe_resource *resource, + struct radeon_winsys_cs_handle **handle, + struct radeon_surf **surface) +{ + struct r600_texture *res = (struct r600_texture *)resource; + + if (handle) + *handle = res->resource.cs_buf; + + if (surface) + *surface = &res->surface; +} + /* create decoder */ struct pipe_video_codec *r600_uvd_create_decoder(struct pipe_context *context, - const struct pipe_video_codec *templat) + const struct pipe_video_codec *templat) { + struct r600_context *ctx = (struct r600_context *)context; + + if (templat->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) + return rvce_create_encoder(context, templat, ctx->b.ws, r600_vce_get_buffer); + return ruvd_create_decoder(context, templat, r600_uvd_set_dtb); } diff --git a/src/gallium/drivers/r600/sb/sb_bc.h b/src/gallium/drivers/r600/sb/sb_bc.h index ab988f8716d..9c2a9170436 100644 --- a/src/gallium/drivers/r600/sb/sb_bc.h +++ b/src/gallium/drivers/r600/sb/sb_bc.h @@ -48,6 +48,7 @@ class fetch_node; class alu_group_node; class region_node; class shader; +class value; class sb_ostream { public: @@ -477,7 +478,9 @@ struct bc_cf { bool is_alu_extended() { assert(op_ptr->flags & CF_ALU); - return kc[2].mode != KC_LOCK_NONE || kc[3].mode != KC_LOCK_NONE; + return kc[2].mode != KC_LOCK_NONE || kc[3].mode != KC_LOCK_NONE || + kc[0].index_mode != KC_INDEX_NONE || kc[1].index_mode != KC_INDEX_NONE || + kc[2].index_mode != KC_INDEX_NONE || kc[3].index_mode != KC_INDEX_NONE; } }; @@ -818,13 +821,16 @@ class bc_parser { bool gpr_reladdr; + // Note: currently relies on input emitting SET_CF in same basic block as uses + value *cf_index_value[2]; + alu_node *mova; public: bc_parser(sb_context &sctx, r600_bytecode *bc, r600_shader* pshader) : ctx(sctx), dec(), 
bc(bc), pshader(pshader), dw(), bc_ndw(), max_cf(), sh(), error(), slots(), cgroup(), - cf_map(), loop_stack(), gpr_reladdr() { } + cf_map(), loop_stack(), gpr_reladdr(), cf_index_value(), mova() { } int decode(); int prepare(); @@ -852,6 +858,10 @@ private: int prepare_loop(cf_node *c); int prepare_if(cf_node *c); + void save_set_cf_index(value *val, unsigned idx); + value *get_cf_index_value(unsigned idx); + void save_mova(alu_node *mova); + alu_node *get_mova(); }; diff --git a/src/gallium/drivers/r600/sb/sb_bc_dump.cpp b/src/gallium/drivers/r600/sb/sb_bc_dump.cpp index 0fc73c419a6..3c70ea7cd3d 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_dump.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_dump.cpp @@ -27,6 +27,7 @@ #include "sb_bc.h" #include "sb_shader.h" #include "sb_pass.h" +#include "eg_sq.h" // V_SQ_CF_INDEX_0/1 namespace r600_sb { @@ -354,6 +355,14 @@ void bc_dump::dump(alu_node& n) { s << " " << vec_bs[n.bc.bank_swizzle]; } + if (ctx.is_cayman()) { + if (n.bc.op == ALU_OP1_MOVA_INT) { + static const char *mova_str[] = { " AR_X", " PC", " CF_IDX0", " CF_IDX1", + " Unknown MOVA_INT dest" }; + s << mova_str[std::min(n.bc.dst_gpr, 4u)]; // CM_V_SQ_MOVA_DST_AR_* + } + } + sblog << s.str() << "\n"; } @@ -450,9 +459,9 @@ void bc_dump::dump(fetch_node& n) { if (n.bc.fetch_whole_quad) s << " FWQ"; if (ctx.is_egcm() && n.bc.resource_index_mode) - s << " RIM:SQ_CF_INDEX_" << n.bc.resource_index_mode; + s << " RIM:SQ_CF_INDEX_" << (n.bc.resource_index_mode - V_SQ_CF_INDEX_0); if (ctx.is_egcm() && n.bc.sampler_index_mode) - s << " SID:SQ_CF_INDEX_" << n.bc.sampler_index_mode; + s << " SID:SQ_CF_INDEX_" << (n.bc.sampler_index_mode - V_SQ_CF_INDEX_0); s << " UCF:" << n.bc.use_const_fields << " FMT(DTA:" << n.bc.data_format @@ -470,9 +479,9 @@ void bc_dump::dump(fetch_node& n) { if (n.bc.offset[k]) s << " O" << chans[k] << ":" << n.bc.offset[k]; if (ctx.is_egcm() && n.bc.resource_index_mode) - s << " RIM:SQ_CF_INDEX_" << n.bc.resource_index_mode; + s << " 
RIM:SQ_CF_INDEX_" << (n.bc.resource_index_mode - V_SQ_CF_INDEX_0); if (ctx.is_egcm() && n.bc.sampler_index_mode) - s << " SID:SQ_CF_INDEX_" << n.bc.sampler_index_mode; + s << " SID:SQ_CF_INDEX_" << (n.bc.sampler_index_mode - V_SQ_CF_INDEX_0); } sblog << s.str() << "\n"; diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp index 522ff9d956e..82826a90921 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp @@ -303,7 +303,8 @@ void bc_finalizer::finalize_alu_group(alu_group_node* g, node *prev_node) { assert(fdst.chan() == slot || slot == SLOT_TRANS); } - n->bc.dst_gpr = fdst.sel(); + if (!(n->bc.op_ptr->flags & AF_MOVA && ctx.is_cayman())) + n->bc.dst_gpr = fdst.sel(); n->bc.dst_chan = d ? fdst.chan() : slot < SLOT_TRANS ? slot : 0; @@ -514,7 +515,7 @@ void bc_finalizer::copy_fetch_src(fetch_node &dst, fetch_node &src, unsigned arg void bc_finalizer::emit_set_grad(fetch_node* f) { - assert(f->src.size() == 12); + assert(f->src.size() == 12 || f->src.size() == 13); unsigned ops[2] = { FETCH_OP_SET_GRADIENTS_V, FETCH_OP_SET_GRADIENTS_H }; unsigned arg_start = 0; @@ -809,8 +810,8 @@ void bc_finalizer::finalize_cf(cf_node* c) { } sel_chan bc_finalizer::translate_kcache(cf_node* alu, value* v) { - unsigned sel = v->select.sel(); - unsigned bank = sel >> 12; + unsigned sel = v->select.kcache_sel(); + unsigned bank = v->select.kcache_bank(); unsigned chan = v->select.chan(); static const unsigned kc_base[] = {128, 160, 256, 288}; diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp index 19bd0784a61..28ebfa2ce62 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp @@ -34,6 +34,7 @@ #include "r600_pipe.h" #include "r600_shader.h" +#include "eg_sq.h" // CM_V_SQ_MOVA_DST_CF_IDX0/1 #include <stack> @@ -121,7 +122,7 @@ int bc_parser::parse_decls() { return 0; 
} - if (pshader->indirect_files & ~(1 << TGSI_FILE_CONSTANT)) { + if (pshader->indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER))) { assert(pshader->num_arrays); @@ -328,6 +329,29 @@ int bc_parser::prepare_alu_clause(cf_node* cf) { return 0; } +void bc_parser::save_set_cf_index(value *val, unsigned idx) +{ + assert(idx <= 1); + assert(val); + cf_index_value[idx] = val; +} +value *bc_parser::get_cf_index_value(unsigned idx) +{ + assert(idx <= 1); + assert(cf_index_value[idx]); + return cf_index_value[idx]; +} +void bc_parser::save_mova(alu_node *mova) +{ + assert(mova); + this->mova = mova; +} +alu_node *bc_parser::get_mova() +{ + assert(mova); + return mova; +} + int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { alu_node *n; @@ -338,6 +362,7 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { for (node_iterator I = g->begin(), E = g->end(); I != E; ++I) { n = static_cast<alu_node*>(*I); + bool ubo_indexing[2] = {}; if (!sh->assign_slot(n, slots[cgroup])) { assert(!"alu slot assignment failed"); @@ -375,9 +400,14 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { n->dst.resize(1); } - if (flags & AF_MOVA) { + if (n->bc.op == ALU_OP0_SET_CF_IDX0 || n->bc.op == ALU_OP0_SET_CF_IDX1) { + // Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX + // DCE will kill this op + save_set_cf_index(get_mova()->src[0], n->bc.op == ALU_OP0_SET_CF_IDX1); + } else if (flags & AF_MOVA) { n->dst[0] = sh->get_special_value(SV_AR_INDEX); + save_mova(n); n->flags |= NF_DONT_HOIST; @@ -432,7 +462,12 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { bc_kcache &kc = cf->bc.kc[kc_set]; kc_addr = (kc.addr << 4) + (sel & 0x1F); - n->src[s] = sh->get_kcache_value(kc.bank, kc_addr, src.chan); + n->src[s] = sh->get_kcache_value(kc.bank, kc_addr, src.chan, (alu_kcache_index_mode)kc.index_mode); + + if (kc.index_mode != KC_INDEX_NONE) { + assert(kc.index_mode != 
KC_LOCK_LOOP); + ubo_indexing[kc.index_mode - KC_INDEX_0] = true; + } } else if (src.sel < MAX_GPR) { value *v = sh->get_gpr_value(true, src.sel, src.chan, src.rel); @@ -469,6 +504,19 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { } } } + + // add UBO index values if any as dependencies + if (ubo_indexing[0]) { + n->src.push_back(get_cf_index_value(0)); + } + if (ubo_indexing[1]) { + n->src.push_back(get_cf_index_value(1)); + } + + if ((n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX0 || n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1) && + ctx.is_cayman()) + // Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX + save_set_cf_index(n->src[0], n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1); } // pack multislot instructions into alu_packed_node @@ -608,6 +656,13 @@ int bc_parser::prepare_fetch_clause(cf_node *cf) { n->bc.src_sel[s], false); } + // Scheduler will emit the appropriate instructions to set CF_IDX0/1 + if (n->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) { + n->src.push_back(get_cf_index_value(n->bc.sampler_index_mode == V_SQ_CF_INDEX_1)); + } + if (n->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) { + n->src.push_back(get_cf_index_value(n->bc.resource_index_mode == V_SQ_CF_INDEX_1)); + } } } diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp b/src/gallium/drivers/r600/sb/sb_expr.cpp index 9c2274e65a3..556a05da395 100644 --- a/src/gallium/drivers/r600/sb/sb_expr.cpp +++ b/src/gallium/drivers/r600/sb/sb_expr.cpp @@ -403,7 +403,8 @@ bool expr_handler::fold_alu_op1(alu_node& n) { if ((n.bc.op == ALU_OP1_MOV || n.bc.op == ALU_OP1_MOVA_INT || n.bc.op == ALU_OP1_MOVA_GPR_INT) && n.bc.clamp == 0 && n.bc.omod == 0 - && n.bc.src[0].abs == 0 && n.bc.src[0].neg == 0) { + && n.bc.src[0].abs == 0 && n.bc.src[0].neg == 0 && + n.src.size() == 1 /* RIM/SIM can be appended as additional values */) { assign_source(n.dst[0], v0); return true; } diff --git a/src/gallium/drivers/r600/sb/sb_gcm.cpp 
b/src/gallium/drivers/r600/sb/sb_gcm.cpp index bccb6713967..236b2ea0031 100644 --- a/src/gallium/drivers/r600/sb/sb_gcm.cpp +++ b/src/gallium/drivers/r600/sb/sb_gcm.cpp @@ -37,6 +37,7 @@ #include "sb_bc.h" #include "sb_shader.h" #include "sb_pass.h" +#include "eg_sq.h" // V_SQ_CF_INDEX_NONE namespace r600_sb { @@ -406,6 +407,14 @@ void gcm::bu_sched_bb(bb_node* bb) { ncnt = 3; } + bool sampler_indexing = false; + if (n->is_fetch_inst() && + static_cast<fetch_node *>(n)->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) + { + sampler_indexing = true; // Give sampler indexed ops get their own clause + ncnt = sh.get_ctx().is_cayman() ? 2 : 3; // MOVA + SET_CF_IDX0/1 + } + if ((sq == SQ_TEX || sq == SQ_VTX) && ((last_count >= ctx.max_fetch/2 && check_alu_ready_count(24)) || @@ -418,7 +427,7 @@ void gcm::bu_sched_bb(bb_node* bb) { bu_ready[sq].pop_front(); if (sq != SQ_CF) { - if (!clause) { + if (!clause || sampler_indexing) { clause = sh.create_clause(sq == SQ_ALU ? NST_ALU_CLAUSE : sq == SQ_TEX ? 
NST_TEX_CLAUSE : diff --git a/src/gallium/drivers/r600/sb/sb_ir.h b/src/gallium/drivers/r600/sb/sb_ir.h index 560a4a9b284..c612e6c4ec6 100644 --- a/src/gallium/drivers/r600/sb/sb_ir.h +++ b/src/gallium/drivers/r600/sb/sb_ir.h @@ -62,6 +62,13 @@ struct sel_chan static unsigned sel(unsigned idx) { return (idx-1) >> 2; } static unsigned chan(unsigned idx) { return (idx-1) & 3; } + + sel_chan(unsigned bank, unsigned index, + unsigned chan, alu_kcache_index_mode index_mode) + : id(sel_chan((bank << 12) | index | ((unsigned)index_mode << 28), chan).id) {} + unsigned kcache_index_mode() const { return sel() >> 28; } + unsigned kcache_sel() const { return sel() & 0x0fffffffu; } + unsigned kcache_bank() const { return kcache_sel() >> 12; } }; inline sb_ostream& operator <<(sb_ostream& o, sel_chan r) { diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp index c98b8fff764..5113b756847 100644 --- a/src/gallium/drivers/r600/sb/sb_sched.cpp +++ b/src/gallium/drivers/r600/sb/sb_sched.cpp @@ -36,6 +36,7 @@ #include "sb_shader.h" #include "sb_pass.h" #include "sb_sched.h" +#include "eg_sq.h" // V_SQ_CF_INDEX_NONE/0/1 namespace r600_sb { @@ -781,7 +782,14 @@ void post_scheduler::schedule_bb(bb_node* bb) { sblog << "\n"; ); - if (n->subtype == NST_ALU_CLAUSE) { + // May require emitting ALU ops to load index registers + if (n->is_fetch_clause()) { + n->remove(); + process_fetch(static_cast<container_node *>(n)); + continue; + } + + if (n->is_alu_clause()) { n->remove(); process_alu(static_cast<container_node*>(n)); continue; @@ -823,6 +831,108 @@ void post_scheduler::init_regmap() { } } +static alu_node *create_set_idx(shader &sh, unsigned ar_idx) { + alu_node *a = sh.create_alu(); + + assert(ar_idx == V_SQ_CF_INDEX_0 || ar_idx == V_SQ_CF_INDEX_1); + if (ar_idx == V_SQ_CF_INDEX_0) + a->bc.set_op(ALU_OP0_SET_CF_IDX0); + else + a->bc.set_op(ALU_OP0_SET_CF_IDX1); + a->bc.slot = SLOT_X; + a->dst.resize(1); // Dummy needed for recolor + + 
PSC_DUMP( + sblog << "created IDX load: "; + dump::dump_op(a); + sblog << "\n"; + ); + + return a; +} + +void post_scheduler::load_index_register(value *v, unsigned ar_idx) +{ + alu.reset(); + + if (!sh.get_ctx().is_cayman()) { + // Evergreen has to first load address register, then use CF_SET_IDX0/1 + alu_group_tracker &rt = alu.grp(); + alu_node *set_idx = create_set_idx(sh, ar_idx); + if (!rt.try_reserve(set_idx)) { + sblog << "can't emit SET_CF_IDX"; + dump::dump_op(set_idx); + sblog << "\n"; + } + process_group(); + + if (!alu.check_clause_limits()) { + // Can't happen since clause only contains MOVA/CF_SET_IDX0/1 + } + alu.emit_group(); + } + + alu_group_tracker &rt = alu.grp(); + alu_node *a = alu.create_ar_load(v, ar_idx == V_SQ_CF_INDEX_1 ? SEL_Z : SEL_Y); + + if (!rt.try_reserve(a)) { + sblog << "can't emit AR load : "; + dump::dump_op(a); + sblog << "\n"; + } + + process_group(); + + if (!alu.check_clause_limits()) { + // Can't happen since clause only contains MOVA/CF_SET_IDX0/1 + } + + alu.emit_group(); + alu.emit_clause(cur_bb); +} + +void post_scheduler::process_fetch(container_node *c) { + if (c->empty()) + return; + + for (node_iterator N, I = c->begin(), E = c->end(); I != E; I = N) { + N = I; + ++N; + + node *n = *I; + + fetch_node *f = static_cast<fetch_node*>(n); + + PSC_DUMP( + sblog << "process_tex "; + dump::dump_op(n); + sblog << " "; + ); + + // TODO: If same values used can avoid reloading index register + if (f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE || + f->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) { + unsigned index_mode = f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ? 
+ f->bc.sampler_index_mode : f->bc.resource_index_mode; + + // Currently require prior opt passes to use one TEX per indexed op + assert(f->parent->count() == 1); + + value *v = f->src.back(); // Last src is index offset + assert(v); + + cur_bb->push_front(c); + + load_index_register(v, index_mode); + f->src.pop_back(); // Don't need index value any more + + return; + } + } + + cur_bb->push_front(c); +} + void post_scheduler::process_alu(container_node *c) { if (c->empty()) @@ -855,6 +965,7 @@ void post_scheduler::process_alu(container_node *c) { if (uc) { n->remove(); + pending.push_back(n); PSC_DUMP( sblog << "pending\n"; ); } else { @@ -997,6 +1108,18 @@ void post_scheduler::init_globals(val_set &s, bool prealloc) { } } +void post_scheduler::emit_index_registers() { + for (unsigned i = 0; i < 2; i++) { + if (alu.current_idx[i]) { + regmap = prev_regmap; + alu.discard_current_group(); + + load_index_register(alu.current_idx[i], KC_INDEX_0 + i); + alu.current_idx[i] = NULL; + } + } +} + void post_scheduler::emit_clause() { if (alu.current_ar) { @@ -1005,7 +1128,11 @@ void post_scheduler::emit_clause() { alu.emit_group(); } - alu.emit_clause(cur_bb); + if (!alu.is_empty()) { + alu.emit_clause(cur_bb); + } + + emit_index_registers(); } void post_scheduler::schedule_alu(container_node *c) { @@ -1017,6 +1144,14 @@ void post_scheduler::schedule_alu(container_node *c) { prev_regmap = regmap; if (!prepare_alu_group()) { + if (alu.current_idx[0] || alu.current_idx[1]) { + regmap = prev_regmap; + emit_clause(); + init_globals(live, false); + + continue; + } + if (alu.current_ar) { emit_load_ar(); continue; @@ -1028,6 +1163,7 @@ void post_scheduler::schedule_alu(container_node *c) { regmap = prev_regmap; emit_clause(); init_globals(live, false); + continue; } @@ -1180,7 +1316,7 @@ void post_scheduler::emit_load_ar() { alu.discard_current_group(); alu_group_tracker &rt = alu.grp(); - alu_node *a = alu.create_ar_load(); + alu_node *a = alu.create_ar_load(alu.current_ar, 
SEL_X); if (!rt.try_reserve(a)) { sblog << "can't emit AR load : "; @@ -1287,6 +1423,42 @@ bool post_scheduler::map_src_val(value *v) { } bool post_scheduler::map_src_vec(vvec &vv, bool src) { + if (src) { + // Handle possible UBO indexing + bool ubo_indexing[2] = { false, false }; + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (!v) + continue; + + if (v->is_kcache()) { + unsigned index_mode = v->select.kcache_index_mode(); + if (index_mode == KC_INDEX_0 || index_mode == KC_INDEX_1) { + ubo_indexing[index_mode - KC_INDEX_0] = true; + } + } + } + + // idx values stored at end of src vec, see bc_parser::prepare_alu_group + for (unsigned i = 2; i != 0; i--) { + if (ubo_indexing[i-1]) { + // TODO: skip adding value to kcache reservation somehow, causes + // unnecessary group breaks and cache line locks + value *v = vv.back(); + if (alu.current_idx[i-1] && alu.current_idx[i-1] != v) { + PSC_DUMP( + sblog << "IDX" << i-1 << " already set to " << + *alu.current_idx[i-1] << ", trying to set " << *v << "\n"; + ); + return false; + } + + alu.current_idx[i-1] = v; + PSC_DUMP(sblog << "IDX" << i-1 << " set to " << *v << "\n";); + } + } + } + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { value *v = *I; if (!v) @@ -1352,6 +1524,10 @@ void post_scheduler::dump_regmap() { sblog << " current_AR: " << *alu.current_ar << "\n"; if (alu.current_pr) sblog << " current_PR: " << *alu.current_pr << "\n"; + if (alu.current_idx[0]) + sblog << " current IDX0: " << *alu.current_idx[0] << "\n"; + if (alu.current_idx[1]) + sblog << " current IDX1: " << *alu.current_idx[1] << "\n"; } void post_scheduler::recolor_locals() { @@ -1441,6 +1617,13 @@ unsigned post_scheduler::try_add_instruction(node *n) { unsigned avail_slots = rt.avail_slots(); + // Cannot schedule in same clause as instructions using this index value + if (!n->dst.empty() && n->dst[0] && + (n->dst[0] == alu.current_idx[0] || n->dst[0] == alu.current_idx[1])) { + 
PSC_DUMP(sblog << " CF_IDX source: " << *n->dst[0] << "\n";); + return 0; + } + if (n->is_alu_packed()) { alu_packed_node *p = static_cast<alu_packed_node*>(n); unsigned slots = p->get_slot_mask(); @@ -1770,7 +1953,7 @@ alu_clause_tracker::alu_clause_tracker(shader &sh) grp0(sh), grp1(sh), group(), clause(), push_exec_mask(), - current_ar(), current_pr() {} + current_ar(), current_pr(), current_idx() {} void alu_clause_tracker::emit_group() { @@ -1827,6 +2010,8 @@ bool alu_clause_tracker::check_clause_limits() { // reserving slots to load AR and PR values unsigned reserve_slots = (current_ar ? 1 : 0) + (current_pr ? 1 : 0); + // ...and index registers + reserve_slots += (current_idx[0] != NULL) + (current_idx[1] != NULL); if (slot_count + slots > MAX_ALU_SLOTS - reserve_slots) return false; @@ -1892,13 +2077,15 @@ unsigned rp_kcache_tracker::get_lines(kc_lines& lines) { unsigned cnt = 0; for (unsigned i = 0; i < sel_count; ++i) { - unsigned line = rp[i]; + unsigned line = rp[i] & 0x1fffffffu; + unsigned index_mode = rp[i] >> 29; if (!line) return cnt; --line; line = (sel_count == 2) ? 
line >> 5 : line >> 6; + line |= index_mode << 29; if (lines.insert(line).second) ++cnt; @@ -1913,14 +2100,18 @@ bool alu_kcache_tracker::update_kc() { memcpy(old_kc, kc, sizeof(kc)); for (kc_lines::iterator I = lines.begin(), E = lines.end(); I != E; ++I) { - unsigned line = *I; + unsigned index_mode = *I >> 29; + unsigned line = *I & 0x1fffffffu; unsigned bank = line >> 8; + assert(index_mode <= KC_INDEX_INVALID); line &= 0xFF; - if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line)) - ++kc[c-1].mode; - else { + if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line) && + kc[c-1].index_mode == index_mode) + { + kc[c-1].mode = KC_LOCK_2; + } else { if (c == max_kcs) { memcpy(kc, old_kc, sizeof(kc)); return false; @@ -1930,17 +2121,16 @@ bool alu_kcache_tracker::update_kc() { kc[c].bank = bank; kc[c].addr = line; + kc[c].index_mode = index_mode; ++c; } } return true; } -alu_node* alu_clause_tracker::create_ar_load() { +alu_node* alu_clause_tracker::create_ar_load(value *v, chan_select ar_channel) { alu_node *a = sh.create_alu(); - // FIXME use MOVA_GPR on R6xx - if (sh.get_ctx().uses_mova_gpr) { a->bc.set_op(ALU_OP1_MOVA_GPR_INT); a->bc.slot = SLOT_TRANS; @@ -1948,9 +2138,13 @@ alu_node* alu_clause_tracker::create_ar_load() { a->bc.set_op(ALU_OP1_MOVA_INT); a->bc.slot = SLOT_X; } + a->bc.dst_chan = ar_channel; + if (ar_channel != SEL_X && sh.get_ctx().is_cayman()) { + a->bc.dst_gpr = ar_channel == SEL_Y ? 
CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1; + } a->dst.resize(1); - a->src.push_back(current_ar); + a->src.push_back(v); PSC_DUMP( sblog << "created AR load: "; diff --git a/src/gallium/drivers/r600/sb/sb_sched.h b/src/gallium/drivers/r600/sb/sb_sched.h index 87c45867e16..05b428ca884 100644 --- a/src/gallium/drivers/r600/sb/sb_sched.h +++ b/src/gallium/drivers/r600/sb/sb_sched.h @@ -66,6 +66,7 @@ public: class literal_tracker { literal lt[4]; unsigned uc[4]; + public: literal_tracker() : lt(), uc() {} @@ -219,6 +220,8 @@ public: // bottom-up) value *current_ar; value *current_pr; + // current values of CF_IDX registers that need preloading + value *current_idx[2]; alu_clause_tracker(shader &sh); @@ -235,7 +238,7 @@ public: void new_group(); bool is_empty(); - alu_node* create_ar_load(); + alu_node* create_ar_load(value *v, chan_select ar_channel); void discard_current_group(); @@ -256,6 +259,7 @@ class post_scheduler : public pass { val_set cleared_interf; + void emit_index_registers(); public: post_scheduler(shader &sh) : pass(sh), @@ -266,6 +270,9 @@ public: void run_on(container_node *n); void schedule_bb(bb_node *bb); + void load_index_register(value *v, unsigned idx); + void process_fetch(container_node *c); + void process_alu(container_node *c); void schedule_alu(container_node *c); bool prepare_alu_group(); diff --git a/src/gallium/drivers/r600/sb/sb_shader.cpp b/src/gallium/drivers/r600/sb/sb_shader.cpp index f996c0786d1..87e28e98157 100644 --- a/src/gallium/drivers/r600/sb/sb_shader.cpp +++ b/src/gallium/drivers/r600/sb/sb_shader.cpp @@ -188,9 +188,9 @@ value* shader::create_temp_value() { return get_value(VLK_TEMP, id, 0); } -value* shader::get_kcache_value(unsigned bank, unsigned index, unsigned chan) { +value* shader::get_kcache_value(unsigned bank, unsigned index, unsigned chan, alu_kcache_index_mode index_mode) { return get_ro_value(kcache_values, VLK_KCACHE, - sel_chan((bank << 12) | index, chan)); + sel_chan(bank, index, chan, index_mode)); 
} void shader::add_input(unsigned gpr, bool preloaded, unsigned comp_mask) { diff --git a/src/gallium/drivers/r600/sb/sb_shader.h b/src/gallium/drivers/r600/sb/sb_shader.h index 7955bba9b67..70bea891b76 100644 --- a/src/gallium/drivers/r600/sb/sb_shader.h +++ b/src/gallium/drivers/r600/sb/sb_shader.h @@ -323,7 +323,7 @@ public: value* get_special_ro_value(unsigned sel); - value* get_kcache_value(unsigned bank, unsigned index, unsigned chan); + value* get_kcache_value(unsigned bank, unsigned index, unsigned chan, alu_kcache_index_mode index_mode); value* get_value_version(value* v, unsigned ver); diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index 2e9a0135647..ac99e732c94 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -272,6 +272,15 @@ static LLVMValueRef fetch_system_value( return bitcast(bld_base, type, cval); } +static LLVMValueRef si_build_alloca_undef(struct gallivm_state *gallivm, + LLVMTypeRef type, + const char *name) +{ + LLVMValueRef ptr = lp_build_alloca(gallivm, type, name); + LLVMBuildStore(gallivm->builder, LLVMGetUndef(type), ptr); + return ptr; +} + static void emit_declaration( struct lp_build_tgsi_context * bld_base, const struct tgsi_full_declaration *decl) @@ -285,7 +294,7 @@ static void emit_declaration( for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) { unsigned chan; for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - ctx->soa.addr[idx][chan] = lp_build_alloca( + ctx->soa.addr[idx][chan] = si_build_alloca_undef( &ctx->gallivm, ctx->soa.bld_base.uint_bld.elem_type, ""); } @@ -315,8 +324,9 @@ static void emit_declaration( for (idx = first; idx <= last; idx++) { for (i = 0; i < TGSI_NUM_CHANNELS; i++) { ctx->temps[idx * TGSI_NUM_CHANNELS + i] = - lp_build_alloca(bld_base->base.gallivm, bld_base->base.vec_type, - "temp"); + si_build_alloca_undef(bld_base->base.gallivm, + 
bld_base->base.vec_type, + "temp"); } } break; @@ -347,7 +357,8 @@ static void emit_declaration( unsigned chan; assert(idx < RADEON_LLVM_MAX_OUTPUTS); for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - ctx->soa.outputs[idx][chan] = lp_build_alloca(&ctx->gallivm, + ctx->soa.outputs[idx][chan] = si_build_alloca_undef( + &ctx->gallivm, ctx->soa.bld_base.base.elem_type, ""); } } @@ -908,7 +919,21 @@ static void emit_ucmp( LLVMBuildSelect(builder, v, emit_data->args[1], emit_data->args[2], ""); } -static void emit_cmp( +static void emit_cmp(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef cond, *args = emit_data->args; + + cond = LLVMBuildFCmp(builder, LLVMRealOLT, args[0], + bld_base->base.zero, ""); + + emit_data->output[emit_data->chan] = + LLVMBuildSelect(builder, cond, args[1], args[2], ""); +} + +static void emit_set_cond( const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context * bld_base, struct lp_build_emit_data * emit_data) @@ -1382,6 +1407,51 @@ static void emit_imsb(const struct lp_build_tgsi_action * action, LLVMBuildSelect(builder, cond, all_ones, msb, ""); } +static void emit_iabs(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + + emit_data->output[emit_data->chan] = + lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_IMAX, + emit_data->args[0], + LLVMBuildNeg(builder, + emit_data->args[0], "")); +} + +static void emit_minmax_int(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMIntPredicate op; + + switch (emit_data->info->opcode) { + default: + assert(0); + case TGSI_OPCODE_IMAX: + op = LLVMIntSGT; + break; 
+ case TGSI_OPCODE_IMIN: + op = LLVMIntSLT; + break; + case TGSI_OPCODE_UMAX: + op = LLVMIntUGT; + break; + case TGSI_OPCODE_UMIN: + op = LLVMIntULT; + break; + } + + emit_data->output[emit_data->chan] = + LLVMBuildSelect(builder, + LLVMBuildICmp(builder, op, emit_data->args[0], + emit_data->args[1], ""), + emit_data->args[0], + emit_data->args[1], ""); +} + void radeon_llvm_context_init(struct radeon_llvm_context * ctx) { struct lp_type type; @@ -1447,8 +1517,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_CEIL].intr_name = "llvm.ceil.f32"; bld_base->op_actions[TGSI_OPCODE_CLAMP].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_CLAMP].intr_name = "llvm.AMDIL.clamp."; - bld_base->op_actions[TGSI_OPCODE_CMP].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_CMP].intr_name = "llvm.AMDGPU.cndlt"; + bld_base->op_actions[TGSI_OPCODE_CMP].emit = emit_cmp; bld_base->op_actions[TGSI_OPCODE_CONT].emit = cont_emit; bld_base->op_actions[TGSI_OPCODE_COS].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_COS].intr_name = "llvm.cos.f32"; @@ -1470,7 +1539,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_ENDIF].emit = endif_emit; bld_base->op_actions[TGSI_OPCODE_ENDLOOP].emit = endloop_emit; bld_base->op_actions[TGSI_OPCODE_EX2].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.AMDIL.exp."; + bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.exp2.f32"; bld_base->op_actions[TGSI_OPCODE_FLR].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "llvm.floor.f32"; bld_base->op_actions[TGSI_OPCODE_FMA].emit = build_tgsi_intrinsic_nomem; @@ -1482,17 +1551,14 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_FSGE].emit = emit_fcmp; bld_base->op_actions[TGSI_OPCODE_FSLT].emit = emit_fcmp; 
bld_base->op_actions[TGSI_OPCODE_FSNE].emit = emit_fcmp; - bld_base->op_actions[TGSI_OPCODE_IABS].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_IABS].intr_name = "llvm.AMDIL.abs."; + bld_base->op_actions[TGSI_OPCODE_IABS].emit = emit_iabs; bld_base->op_actions[TGSI_OPCODE_IBFE].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_IBFE].intr_name = "llvm.AMDGPU.bfe.i32"; bld_base->op_actions[TGSI_OPCODE_IDIV].emit = emit_idiv; bld_base->op_actions[TGSI_OPCODE_IF].emit = if_emit; bld_base->op_actions[TGSI_OPCODE_UIF].emit = uif_emit; - bld_base->op_actions[TGSI_OPCODE_IMAX].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_IMAX].intr_name = "llvm.AMDGPU.imax"; - bld_base->op_actions[TGSI_OPCODE_IMIN].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_IMIN].intr_name = "llvm.AMDGPU.imin"; + bld_base->op_actions[TGSI_OPCODE_IMAX].emit = emit_minmax_int; + bld_base->op_actions[TGSI_OPCODE_IMIN].emit = emit_minmax_int; bld_base->op_actions[TGSI_OPCODE_IMSB].emit = emit_imsb; bld_base->op_actions[TGSI_OPCODE_INEG].emit = emit_ineg; bld_base->op_actions[TGSI_OPCODE_ISHR].emit = emit_ishr; @@ -1508,8 +1574,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_LSB].emit = emit_lsb; bld_base->op_actions[TGSI_OPCODE_LG2].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_LG2].intr_name = "llvm.log2.f32"; - bld_base->op_actions[TGSI_OPCODE_LRP].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_LRP].intr_name = "llvm.AMDGPU.lrp"; bld_base->op_actions[TGSI_OPCODE_MOD].emit = emit_mod; bld_base->op_actions[TGSI_OPCODE_UMSB].emit = emit_umsb; bld_base->op_actions[TGSI_OPCODE_NOT].emit = emit_not; @@ -1519,31 +1583,29 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_POW].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_POW].intr_name = "llvm.pow.f32"; 
bld_base->op_actions[TGSI_OPCODE_ROUND].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.AMDIL.round.nearest."; + bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.rint.f32"; bld_base->op_actions[TGSI_OPCODE_RSQ].intr_name = "llvm.AMDGPU.rsq.clamped.f32"; bld_base->op_actions[TGSI_OPCODE_RSQ].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_SGE].emit = emit_cmp; - bld_base->op_actions[TGSI_OPCODE_SEQ].emit = emit_cmp; + bld_base->op_actions[TGSI_OPCODE_SGE].emit = emit_set_cond; + bld_base->op_actions[TGSI_OPCODE_SEQ].emit = emit_set_cond; bld_base->op_actions[TGSI_OPCODE_SHL].emit = emit_shl; - bld_base->op_actions[TGSI_OPCODE_SLE].emit = emit_cmp; - bld_base->op_actions[TGSI_OPCODE_SLT].emit = emit_cmp; - bld_base->op_actions[TGSI_OPCODE_SNE].emit = emit_cmp; - bld_base->op_actions[TGSI_OPCODE_SGT].emit = emit_cmp; + bld_base->op_actions[TGSI_OPCODE_SLE].emit = emit_set_cond; + bld_base->op_actions[TGSI_OPCODE_SLT].emit = emit_set_cond; + bld_base->op_actions[TGSI_OPCODE_SNE].emit = emit_set_cond; + bld_base->op_actions[TGSI_OPCODE_SGT].emit = emit_set_cond; bld_base->op_actions[TGSI_OPCODE_SIN].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_SIN].intr_name = "llvm.sin.f32"; bld_base->op_actions[TGSI_OPCODE_SQRT].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_SQRT].intr_name = "llvm.sqrt.f32"; bld_base->op_actions[TGSI_OPCODE_SSG].emit = emit_ssg; bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.AMDGPU.trunc"; + bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.trunc.f32"; bld_base->op_actions[TGSI_OPCODE_UADD].emit = emit_uadd; bld_base->op_actions[TGSI_OPCODE_UBFE].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_UBFE].intr_name = "llvm.AMDGPU.bfe.u32"; bld_base->op_actions[TGSI_OPCODE_UDIV].emit = emit_udiv; - 
bld_base->op_actions[TGSI_OPCODE_UMAX].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_UMAX].intr_name = "llvm.AMDGPU.umax"; - bld_base->op_actions[TGSI_OPCODE_UMIN].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_UMIN].intr_name = "llvm.AMDGPU.umin"; + bld_base->op_actions[TGSI_OPCODE_UMAX].emit = emit_minmax_int; + bld_base->op_actions[TGSI_OPCODE_UMIN].emit = emit_minmax_int; bld_base->op_actions[TGSI_OPCODE_UMOD].emit = emit_umod; bld_base->op_actions[TGSI_OPCODE_USEQ].emit = emit_icmp; bld_base->op_actions[TGSI_OPCODE_USGE].emit = emit_icmp; diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index c6605346771..697e60a50d9 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -469,7 +469,7 @@ static void si_delete_compute_state(struct pipe_context *ctx, void* state){ if (program->kernels) { for (int i = 0; i < program->num_kernels; i++){ if (program->kernels[i].bo){ - si_shader_destroy(ctx, &program->kernels[i]); + si_shader_destroy(&program->kernels[i]); } } FREE(program->kernels); @@ -482,7 +482,7 @@ static void si_delete_compute_state(struct pipe_context *ctx, void* state){ FREE(program->shader.binary.config); FREE(program->shader.binary.rodata); FREE(program->shader.binary.global_symbol_offsets); - si_shader_destroy(ctx, &program->shader); + si_shader_destroy(&program->shader); #endif pipe_resource_reference( diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index a0283b7c966..53c80dba602 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -271,6 +271,8 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_START_INSTANCE: case PIPE_CAP_NPOT_TEXTURES: case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: + case PIPE_CAP_VERTEX_COLOR_CLAMPED: + case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: case 
PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: case PIPE_CAP_TGSI_INSTANCEID: case PIPE_CAP_COMPUTE: @@ -330,8 +332,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) /* Unsupported features. */ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: - case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: - case PIPE_CAP_VERTEX_COLOR_CLAMPED: case PIPE_CAP_USER_VERTEX_BUFFERS: case PIPE_CAP_FAKE_SW_MSAA: case PIPE_CAP_TEXTURE_GATHER_OFFSETS: diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 32a702fcdf5..a119cbdc16c 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1306,6 +1306,23 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, unsigned compressed = 0; unsigned chan; + /* XXX: This controls which components of the output + * registers actually get exported. (e.g bit 0 means export + * X component, bit 1 means export Y component, etc.) I'm + * hard coding this to 0xf for now. In the future, we might + * want to do something else. + */ + args[0] = lp_build_const_int32(base->gallivm, 0xf); + + /* Specify whether the EXEC mask represents the valid mask */ + args[1] = uint->zero; + + /* Specify whether this is the last export */ + args[2] = uint->zero; + + /* Specify the target we are exporting */ + args[3] = lp_build_const_int32(base->gallivm, target); + if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) { int cbuf = target - V_008DFC_SQ_EXP_MRT; @@ -1323,55 +1340,31 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, } } + /* Set COMPR flag */ + args[4] = compressed ? 
uint->one : uint->zero; + if (compressed) { /* Pixel shader needs to pack output values before export */ - for (chan = 0; chan < 2; chan++ ) { - args[0] = values[2 * chan]; - args[1] = values[2 * chan + 1]; - args[chan + 5] = - lp_build_intrinsic(base->gallivm->builder, - "llvm.SI.packf16", - LLVMInt32TypeInContext(base->gallivm->context), - args, 2, - LLVMReadNoneAttribute | LLVMNoUnwindAttribute); + for (chan = 0; chan < 2; chan++) { + LLVMValueRef pack_args[2] = { + values[2 * chan], + values[2 * chan + 1] + }; + LLVMValueRef packed; + + packed = lp_build_intrinsic(base->gallivm->builder, + "llvm.SI.packf16", + LLVMInt32TypeInContext(base->gallivm->context), + pack_args, 2, + LLVMReadNoneAttribute | LLVMNoUnwindAttribute); args[chan + 7] = args[chan + 5] = LLVMBuildBitCast(base->gallivm->builder, - args[chan + 5], + packed, LLVMFloatTypeInContext(base->gallivm->context), ""); } - - /* Set COMPR flag */ - args[4] = uint->one; - } else { - for (chan = 0; chan < 4; chan++ ) - /* +5 because the first output value will be - * the 6th argument to the intrinsic. */ - args[chan + 5] = values[chan]; - - /* Clear COMPR flag */ - args[4] = uint->zero; - } - - /* XXX: This controls which components of the output - * registers actually get exported. (e.g bit 0 means export - * X component, bit 1 means export Y component, etc.) I'm - * hard coding this to 0xf for now. In the future, we might - * want to do something else. */ - args[0] = lp_build_const_int32(base->gallivm, 0xf); - - /* Specify whether the EXEC mask represents the valid mask */ - args[1] = uint->zero; - - /* Specify whether this is the last export */ - args[2] = uint->zero; - - /* Specify the target we are exporting */ - args[3] = lp_build_const_int32(base->gallivm, target); - - /* XXX: We probably need to keep track of the output - * values, so we know what we are passing to the next - * stage. 
*/ + } else + memcpy(&args[5], values, sizeof(values[0]) * 4); } /* Load from output pointers and initialize arguments for the shader export intrinsic */ @@ -2083,6 +2076,45 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base) outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); + /* Vertex color clamping. + * + * This uses a state constant loaded in a user data SGPR and + * an IF statement is added that clamps all colors if the constant + * is true. + */ + if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && + !si_shader_ctx->shader->is_gs_copy_shader) { + struct lp_build_if_state if_ctx; + LLVMValueRef cond = NULL; + LLVMValueRef addr, val; + + for (i = 0; i < info->num_outputs; i++) { + if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR && + info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR) + continue; + + /* We've found a color. */ + if (!cond) { + /* The state is in the first bit of the user SGPR. */ + cond = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + SI_PARAM_VS_STATE_BITS); + cond = LLVMBuildTrunc(gallivm->builder, cond, + LLVMInt1TypeInContext(gallivm->context), ""); + lp_build_if(&if_ctx, gallivm, cond); + } + + for (j = 0; j < 4; j++) { + addr = si_shader_ctx->radeon_bld.soa.outputs[i][j]; + val = LLVMBuildLoad(gallivm->builder, addr, ""); + val = radeon_llvm_saturate(bld_base, val); + LLVMBuildStore(gallivm->builder, val, addr); + } + } + + if (cond) + lp_build_endif(&if_ctx); + } + for (i = 0; i < info->num_outputs; i++) { outputs[i].name = info->output_semantic_name[i]; outputs[i].sid = info->output_semantic_index[i]; @@ -2117,6 +2149,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) struct lp_build_context * base = &bld_base->base; struct lp_build_context * uint = &bld_base->uint_bld; struct tgsi_shader_info *info = &shader->selector->info; + LLVMBuilderRef builder = base->gallivm->builder; LLVMValueRef args[9]; LLVMValueRef last_args[9] = { 0 }; int depth_index = 
-1, stencil_index = -1, samplemask_index = -1; @@ -2143,6 +2176,16 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) target = V_008DFC_SQ_EXP_MRT + semantic_index; alpha_ptr = si_shader_ctx->radeon_bld.soa.outputs[i][3]; + if (si_shader_ctx->shader->key.ps.clamp_color) { + for (int j = 0; j < 4; j++) { + LLVMValueRef ptr = si_shader_ctx->radeon_bld.soa.outputs[i][j]; + LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); + + result = radeon_llvm_saturate(bld_base, result); + LLVMBuildStore(builder, result, ptr); + } + } + if (si_shader_ctx->shader->key.ps.alpha_to_one) LLVMBuildStore(base->gallivm->builder, base->one, alpha_ptr); @@ -2153,6 +2196,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) if (si_shader_ctx->shader->key.ps.poly_line_smoothing) si_scale_alpha_by_sample_mask(bld_base, alpha_ptr); + break; default: target = 0; @@ -3440,6 +3484,9 @@ static void create_function(struct si_shader_context *si_shader_ctx) if (shader->is_gs_copy_shader) { last_array_pointer = SI_PARAM_CONST; num_params = SI_PARAM_CONST+1; + } else { + params[SI_PARAM_VS_STATE_BITS] = i32; + num_params = SI_PARAM_VS_STATE_BITS+1; } /* The locations of the other parameters are assigned dynamically. 
*/ @@ -3982,6 +4029,7 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) key->vs.es_enabled_outputs); fprintf(f, " as_es = %u\n", key->vs.as_es); fprintf(f, " as_ls = %u\n", key->vs.as_ls); + fprintf(f, " export_prim_id = %u\n", key->vs.export_prim_id); break; case PIPE_SHADER_TESS_CTRL: @@ -3993,6 +4041,7 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) fprintf(f, " es_enabled_outputs = 0x%"PRIx64"\n", key->tes.es_enabled_outputs); fprintf(f, " as_es = %u\n", key->tes.as_es); + fprintf(f, " export_prim_id = %u\n", key->tes.export_prim_id); break; case PIPE_SHADER_GEOMETRY: @@ -4005,6 +4054,7 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) fprintf(f, " alpha_func = %u\n", key->ps.alpha_func); fprintf(f, " alpha_to_one = %u\n", key->ps.alpha_to_one); fprintf(f, " poly_stipple = %u\n", key->ps.poly_stipple); + fprintf(f, " clamp_color = %u\n", key->ps.clamp_color); break; default: @@ -4196,10 +4246,12 @@ out: return r; } -void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader) +void si_shader_destroy(struct si_shader *shader) { - if (shader->gs_copy_shader) - si_shader_destroy(ctx, shader->gs_copy_shader); + if (shader->gs_copy_shader) { + si_shader_destroy(shader->gs_copy_shader); + FREE(shader->gs_copy_shader); + } if (shader->scratch_bo) r600_resource_reference(&shader->scratch_bo, NULL); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index b92fa02a171..54dad726d01 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -83,6 +83,7 @@ struct radeon_shader_reloc; #define SI_SGPR_VERTEX_BUFFER 8 /* VS only */ #define SI_SGPR_BASE_VERTEX 10 /* VS only */ #define SI_SGPR_START_INSTANCE 11 /* VS only */ +#define SI_SGPR_VS_STATE_BITS 12 /* VS(VS) only */ #define SI_SGPR_LS_OUT_LAYOUT 12 /* VS(LS) only */ #define SI_SGPR_TCS_OUT_OFFSETS 8 /* TCS & TES only */ #define 
SI_SGPR_TCS_OUT_LAYOUT 9 /* TCS & TES only */ @@ -90,8 +91,9 @@ struct radeon_shader_reloc; #define SI_SGPR_ALPHA_REF 8 /* PS only */ #define SI_SGPR_PS_STATE_BITS 9 /* PS only */ -#define SI_VS_NUM_USER_SGPR 12 -#define SI_LS_NUM_USER_SGPR 13 +#define SI_VS_NUM_USER_SGPR 13 /* API VS */ +#define SI_ES_NUM_USER_SGPR 12 /* API VS */ +#define SI_LS_NUM_USER_SGPR 13 /* API VS */ #define SI_TCS_NUM_USER_SGPR 11 #define SI_TES_NUM_USER_SGPR 10 #define SI_GS_NUM_USER_SGPR 8 @@ -108,6 +110,8 @@ struct radeon_shader_reloc; #define SI_PARAM_VERTEX_BUFFER 4 #define SI_PARAM_BASE_VERTEX 5 #define SI_PARAM_START_INSTANCE 6 +/* [0] = clamp vertex color */ +#define SI_PARAM_VS_STATE_BITS 7 /* the other VS parameters are assigned dynamically */ /* Offsets where TCS outputs and TCS patch outputs live in LDS: @@ -227,6 +231,7 @@ union si_shader_key { unsigned alpha_to_one:1; unsigned poly_stipple:1; unsigned poly_line_smoothing:1; + unsigned clamp_color:1; } ps; struct { unsigned instance_divisors[SI_NUM_VERTEX_BUFFERS]; @@ -324,7 +329,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f); int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, LLVMTargetMachineRef tm, LLVMModuleRef mod); -void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader); +void si_shader_destroy(struct si_shader *shader); unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index); int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader); int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader); diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 00d4bc1fbc2..e6475364f98 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -694,7 +694,7 @@ static void *si_create_rs_state(struct pipe_context *ctx, rs->poly_smooth = 
state->poly_smooth; rs->uses_poly_offset = state->offset_point || state->offset_line || state->offset_tri; - + rs->clamp_fragment_color = state->clamp_fragment_color; rs->flatshade = state->flatshade; rs->sprite_coord_enable = state->sprite_coord_enable; rs->pa_sc_line_stipple = state->line_stipple_enable ? @@ -760,6 +760,8 @@ static void *si_create_rs_state(struct pipe_context *ctx, state->fill_back != PIPE_POLYGON_MODE_FILL) | S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back))); + si_pm4_set_reg(pm4, R_00B130_SPI_SHADER_USER_DATA_VS_0 + + SI_SGPR_VS_STATE_BITS * 4, state->clamp_vertex_color); /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */ for (i = 0; i < 3; i++) { diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 6a567688ee4..fba6619d2fd 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -60,6 +60,7 @@ struct si_state_rasterizer { bool line_smooth; bool poly_smooth; bool uses_poly_offset; + bool clamp_fragment_color; }; struct si_dsa_stencil_ref_part { diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index f673388b121..c98509bb0b9 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -122,7 +122,8 @@ static void si_shader_ls(struct si_shader *shader) shader->ls_rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) | S_00B528_SGPRS((num_sgprs - 1) / 8) | - S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt); + S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) | + S_00B528_DX10_CLAMP(shader->dx10_clamp_mode); shader->ls_rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) | S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0); } @@ -154,7 +155,8 @@ static void si_shader_hs(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 
40); si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS, S_00B428_VGPRS((shader->num_vgprs - 1) / 4) | - S_00B428_SGPRS((num_sgprs - 1) / 8)); + S_00B428_SGPRS((num_sgprs - 1) / 8) | + S_00B428_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, S_00B42C_USER_SGPR(num_user_sgprs) | S_00B42C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); @@ -177,7 +179,7 @@ static void si_shader_es(struct si_shader *shader) if (shader->selector->type == PIPE_SHADER_VERTEX) { vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0; - num_user_sgprs = SI_VS_NUM_USER_SGPR; + num_user_sgprs = SI_ES_NUM_USER_SGPR; } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { vgpr_comp_cnt = 3; /* all components are needed for TES */ num_user_sgprs = SI_TES_NUM_USER_SGPR; @@ -570,6 +572,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, key->ps.poly_line_smoothing = ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && sctx->framebuffer.nr_samples <= 1; + key->ps.clamp_color = rs->clamp_fragment_color; } key->ps.alpha_func = PIPE_FUNC_ALWAYS; @@ -645,9 +648,8 @@ static int si_shader_select(struct pipe_context *ctx, return 0; } -static void *si_create_shader_state(struct pipe_context *ctx, - const struct pipe_shader_state *state, - unsigned pipe_shader_type) +static void *si_create_shader_selector(struct pipe_context *ctx, + const struct pipe_shader_state *state) { struct si_screen *sscreen = (struct si_screen *)ctx->screen; struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector); @@ -656,7 +658,6 @@ static void *si_create_shader_state(struct pipe_context *ctx, if (!sel) return NULL; - sel->type = pipe_shader_type; sel->tokens = tgsi_dup_tokens(state->tokens); if (!sel->tokens) { FREE(sel); @@ -665,6 +666,7 @@ static void *si_create_shader_state(struct pipe_context *ctx, sel->so = state->stream_output; tgsi_scan_shader(state->tokens, &sel->info); + sel->type = 
util_pipe_shader_from_tgsi_processor(sel->info.processor); p_atomic_inc(&sscreen->b.num_shaders_created); /* First set which opcode uses which (i,j) pair. */ @@ -695,7 +697,7 @@ static void *si_create_shader_state(struct pipe_context *ctx, sel->info.uses_linear_centroid + sel->info.uses_linear_sample >= 2; - switch (pipe_shader_type) { + switch (sel->type) { case PIPE_SHADER_GEOMETRY: sel->gs_output_prim = sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM]; @@ -761,36 +763,6 @@ static void *si_create_shader_state(struct pipe_context *ctx, return sel; } -static void *si_create_fs_state(struct pipe_context *ctx, - const struct pipe_shader_state *state) -{ - return si_create_shader_state(ctx, state, PIPE_SHADER_FRAGMENT); -} - -static void *si_create_gs_state(struct pipe_context *ctx, - const struct pipe_shader_state *state) -{ - return si_create_shader_state(ctx, state, PIPE_SHADER_GEOMETRY); -} - -static void *si_create_vs_state(struct pipe_context *ctx, - const struct pipe_shader_state *state) -{ - return si_create_shader_state(ctx, state, PIPE_SHADER_VERTEX); -} - -static void *si_create_tcs_state(struct pipe_context *ctx, - const struct pipe_shader_state *state) -{ - return si_create_shader_state(ctx, state, PIPE_SHADER_TESS_CTRL); -} - -static void *si_create_tes_state(struct pipe_context *ctx, - const struct pipe_shader_state *state) -{ - return si_create_shader_state(ctx, state, PIPE_SHADER_TESS_EVAL); -} - /** * Normally, we only emit 1 viewport and 1 scissor if no shader is using * the VIEWPORT_INDEX output, and emitting the other viewports and scissors @@ -905,11 +877,21 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) si_mark_atom_dirty(sctx, &sctx->cb_target_mask); } -static void si_delete_shader_selector(struct pipe_context *ctx, - struct si_shader_selector *sel) +static void si_delete_shader_selector(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = 
(struct si_shader_selector *)state; struct si_shader *p = sel->current, *c; + struct si_shader_selector **current_shader[SI_NUM_SHADERS] = { + [PIPE_SHADER_VERTEX] = &sctx->vs_shader, + [PIPE_SHADER_TESS_CTRL] = &sctx->tcs_shader, + [PIPE_SHADER_TESS_EVAL] = &sctx->tes_shader, + [PIPE_SHADER_GEOMETRY] = &sctx->gs_shader, + [PIPE_SHADER_FRAGMENT] = &sctx->ps_shader, + }; + + if (*current_shader[sel->type] == sel) + *current_shader[sel->type] = NULL; while (p) { c = p->next_variant; @@ -940,7 +922,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, break; } - si_shader_destroy(ctx, p); + si_shader_destroy(p); free(p); p = c; } @@ -949,66 +931,6 @@ static void si_delete_shader_selector(struct pipe_context *ctx, free(sel); } -static void si_delete_vs_shader(struct pipe_context *ctx, void *state) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *sel = (struct si_shader_selector *)state; - - if (sctx->vs_shader == sel) { - sctx->vs_shader = NULL; - } - - si_delete_shader_selector(ctx, sel); -} - -static void si_delete_gs_shader(struct pipe_context *ctx, void *state) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *sel = (struct si_shader_selector *)state; - - if (sctx->gs_shader == sel) { - sctx->gs_shader = NULL; - } - - si_delete_shader_selector(ctx, sel); -} - -static void si_delete_ps_shader(struct pipe_context *ctx, void *state) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *sel = (struct si_shader_selector *)state; - - if (sctx->ps_shader == sel) { - sctx->ps_shader = NULL; - } - - si_delete_shader_selector(ctx, sel); -} - -static void si_delete_tcs_shader(struct pipe_context *ctx, void *state) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *sel = (struct si_shader_selector *)state; - - if (sctx->tcs_shader == sel) { - sctx->tcs_shader = NULL; - } - - si_delete_shader_selector(ctx, sel); -} - -static 
void si_delete_tes_shader(struct pipe_context *ctx, void *state) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *sel = (struct si_shader_selector *)state; - - if (sctx->tes_shader == sel) { - sctx->tes_shader = NULL; - } - - si_delete_shader_selector(ctx, sel); -} - static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom) { struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; @@ -1284,30 +1206,23 @@ static int si_update_scratch_buffer(struct si_context *sctx, static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx) { - if (!sctx->scratch_buffer) - return 0; - - return sctx->scratch_buffer->b.b.width0; + return sctx->scratch_buffer ? sctx->scratch_buffer->b.b.width0 : 0; } -static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_context *sctx, - struct si_shader_selector *sel) +static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader_selector *sel) { - if (!sel) - return 0; - - return sel->current->scratch_bytes_per_wave; + return sel ? 
sel->current->scratch_bytes_per_wave : 0; } static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx) { unsigned bytes = 0; - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->gs_shader)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->vs_shader)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->tcs_shader)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->tes_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tcs_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader)); return bytes; } @@ -1322,7 +1237,6 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) int r; if (scratch_needed_size > 0) { - if (scratch_needed_size > current_scratch_buffer_size) { /* Create a bigger scratch buffer */ pipe_resource_reference( @@ -1361,38 +1275,26 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4); /* VS can be bound as LS, ES, or VS. 
*/ - if (sctx->tes_shader) { - r = si_update_scratch_buffer(sctx, sctx->vs_shader); - if (r < 0) - return false; - if (r == 1) + r = si_update_scratch_buffer(sctx, sctx->vs_shader); + if (r < 0) + return false; + if (r == 1) { + if (sctx->tes_shader) si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4); - } else if (sctx->gs_shader) { - r = si_update_scratch_buffer(sctx, sctx->vs_shader); - if (r < 0) - return false; - if (r == 1) + else if (sctx->gs_shader) si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4); - } else { - r = si_update_scratch_buffer(sctx, sctx->vs_shader); - if (r < 0) - return false; - if (r == 1) + else si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4); } /* TES can be bound as ES or VS. */ - if (sctx->gs_shader) { - r = si_update_scratch_buffer(sctx, sctx->tes_shader); - if (r < 0) - return false; - if (r == 1) + r = si_update_scratch_buffer(sctx, sctx->tes_shader); + if (r < 0) + return false; + if (r == 1) { + if (sctx->gs_shader) si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4); - } else { - r = si_update_scratch_buffer(sctx, sctx->tes_shader); - if (r < 0) - return false; - if (r == 1) + else si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4); } } @@ -1661,11 +1563,11 @@ void si_init_shader_functions(struct si_context *sctx) si_init_atom(sctx, &sctx->spi_map, &sctx->atoms.s.spi_map, si_emit_spi_map); si_init_atom(sctx, &sctx->spi_ps_input, &sctx->atoms.s.spi_ps_input, si_emit_spi_ps_input); - sctx->b.b.create_vs_state = si_create_vs_state; - sctx->b.b.create_tcs_state = si_create_tcs_state; - sctx->b.b.create_tes_state = si_create_tes_state; - sctx->b.b.create_gs_state = si_create_gs_state; - sctx->b.b.create_fs_state = si_create_fs_state; + sctx->b.b.create_vs_state = si_create_shader_selector; + sctx->b.b.create_tcs_state = si_create_shader_selector; + sctx->b.b.create_tes_state = si_create_shader_selector; + sctx->b.b.create_gs_state = si_create_shader_selector; + sctx->b.b.create_fs_state = 
si_create_shader_selector; sctx->b.b.bind_vs_state = si_bind_vs_shader; sctx->b.b.bind_tcs_state = si_bind_tcs_shader; @@ -1673,9 +1575,9 @@ void si_init_shader_functions(struct si_context *sctx) sctx->b.b.bind_gs_state = si_bind_gs_shader; sctx->b.b.bind_fs_state = si_bind_ps_shader; - sctx->b.b.delete_vs_state = si_delete_vs_shader; - sctx->b.b.delete_tcs_state = si_delete_tcs_shader; - sctx->b.b.delete_tes_state = si_delete_tes_shader; - sctx->b.b.delete_gs_state = si_delete_gs_shader; - sctx->b.b.delete_fs_state = si_delete_ps_shader; + sctx->b.b.delete_vs_state = si_delete_shader_selector; + sctx->b.b.delete_tcs_state = si_delete_shader_selector; + sctx->b.b.delete_tes_state = si_delete_shader_selector; + sctx->b.b.delete_gs_state = si_delete_shader_selector; + sctx->b.b.delete_fs_state = si_delete_shader_selector; } diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c index 2bf795de22d..f8622b96f45 100644 --- a/src/gallium/drivers/svga/svga_context.c +++ b/src/gallium/drivers/svga/svga_context.c @@ -312,6 +312,8 @@ void svga_context_flush( struct svga_context *svga, */ svga->swc->flush(svga->swc, &fence); + svga->hud.num_flushes++; + svga_screen_cache_flush(svgascreen, fence); /* To force the re-emission of rendertargets and texture sampler bindings on diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h index e8575f36c3b..bcce18a3502 100644 --- a/src/gallium/drivers/svga/svga_context.h +++ b/src/gallium/drivers/svga/svga_context.h @@ -44,10 +44,21 @@ /** Non-GPU queries for gallium HUD */ -#define SVGA_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0) -#define SVGA_QUERY_FALLBACKS (PIPE_QUERY_DRIVER_SPECIFIC + 1) -#define SVGA_QUERY_MEMORY_USED (PIPE_QUERY_DRIVER_SPECIFIC + 2) -#define SVGA_QUERY_MAX (PIPE_QUERY_DRIVER_SPECIFIC + 3) +/* per-frame counters */ +#define SVGA_QUERY_NUM_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0) +#define SVGA_QUERY_NUM_FALLBACKS 
(PIPE_QUERY_DRIVER_SPECIFIC + 1) +#define SVGA_QUERY_NUM_FLUSHES (PIPE_QUERY_DRIVER_SPECIFIC + 2) +#define SVGA_QUERY_NUM_VALIDATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 3) +#define SVGA_QUERY_MAP_BUFFER_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 4) +#define SVGA_QUERY_NUM_RESOURCES_MAPPED (PIPE_QUERY_DRIVER_SPECIFIC + 5) +/* running total counters */ +#define SVGA_QUERY_MEMORY_USED (PIPE_QUERY_DRIVER_SPECIFIC + 6) +#define SVGA_QUERY_NUM_SHADERS (PIPE_QUERY_DRIVER_SPECIFIC + 7) +#define SVGA_QUERY_NUM_RESOURCES (PIPE_QUERY_DRIVER_SPECIFIC + 8) +#define SVGA_QUERY_NUM_STATE_OBJECTS (PIPE_QUERY_DRIVER_SPECIFIC + 9) +#define SVGA_QUERY_NUM_SURFACE_VIEWS (PIPE_QUERY_DRIVER_SPECIFIC + 10) +/*SVGA_QUERY_MAX has to be last because it is size of an array*/ +#define SVGA_QUERY_MAX (PIPE_QUERY_DRIVER_SPECIFIC + 11) /** * Maximum supported number of constant buffers per shader @@ -463,9 +474,18 @@ struct svga_context /** List of buffers with queued transfers */ struct list_head dirty_buffers; - /** performance / info queries */ - uint64_t num_draw_calls; /**< SVGA_QUERY_DRAW_CALLS */ - uint64_t num_fallbacks; /**< SVGA_QUERY_FALLBACKS */ + /** performance / info queries for HUD */ + struct { + uint64_t num_draw_calls; /**< SVGA_QUERY_DRAW_CALLS */ + uint64_t num_fallbacks; /**< SVGA_QUERY_NUM_FALLBACKS */ + uint64_t num_flushes; /**< SVGA_QUERY_NUM_FLUSHES */ + uint64_t num_validations; /**< SVGA_QUERY_NUM_VALIDATIONS */ + uint64_t map_buffer_time; /**< SVGA_QUERY_MAP_BUFFER_TIME */ + uint64_t num_resources_mapped; /**< SVGA_QUERY_NUM_RESOURCES_MAPPED */ + uint64_t num_shaders; /**< SVGA_QUERY_NUM_SHADERS */ + uint64_t num_state_objects; /**< SVGA_QUERY_NUM_STATE_OBJECTS */ + uint64_t num_surface_views; /**< SVGA_QUERY_NUM_SURFACE_VIEWS */ + } hud; /** The currently bound stream output targets */ unsigned num_so_targets; diff --git a/src/gallium/drivers/svga/svga_pipe_blend.c b/src/gallium/drivers/svga/svga_pipe_blend.c index 06bb3e3bd7e..0c9d6129b53 100644 --- 
a/src/gallium/drivers/svga/svga_pipe_blend.c +++ b/src/gallium/drivers/svga/svga_pipe_blend.c @@ -321,6 +321,8 @@ svga_create_blend_state(struct pipe_context *pipe, define_blend_state_object(svga, blend); } + svga->hud.num_state_objects++; + return blend; } @@ -359,6 +361,7 @@ static void svga_delete_blend_state(struct pipe_context *pipe, } FREE(blend); + svga->hud.num_state_objects--; } static void svga_set_blend_color( struct pipe_context *pipe, diff --git a/src/gallium/drivers/svga/svga_pipe_depthstencil.c b/src/gallium/drivers/svga/svga_pipe_depthstencil.c index 5ea623be4d9..d84ed1df48e 100644 --- a/src/gallium/drivers/svga/svga_pipe_depthstencil.c +++ b/src/gallium/drivers/svga/svga_pipe_depthstencil.c @@ -202,6 +202,8 @@ svga_create_depth_stencil_state(struct pipe_context *pipe, define_depth_stencil_state_object(svga, ds); } + svga->hud.num_state_objects++; + return ds; } @@ -248,6 +250,7 @@ static void svga_delete_depth_stencil_state(struct pipe_context *pipe, } FREE(depth_stencil); + svga->hud.num_state_objects--; } diff --git a/src/gallium/drivers/svga/svga_pipe_draw.c b/src/gallium/drivers/svga/svga_pipe_draw.c index 303d4565cdb..50ebb53df90 100644 --- a/src/gallium/drivers/svga/svga_pipe_draw.c +++ b/src/gallium/drivers/svga/svga_pipe_draw.c @@ -177,7 +177,7 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) enum pipe_error ret = 0; boolean needed_swtnl; - svga->num_draw_calls++; /* for SVGA_QUERY_DRAW_CALLS */ + svga->hud.num_draw_calls++; /* for SVGA_QUERY_NUM_DRAW_CALLS */ if (u_reduced_prim(info->mode) == PIPE_PRIM_TRIANGLES && svga->curr.rast->templ.cull_face == PIPE_FACE_FRONT_AND_BACK) @@ -219,7 +219,7 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) #endif if (svga->state.sw.need_swtnl) { - svga->num_fallbacks++; /* for SVGA_QUERY_FALLBACKS */ + svga->hud.num_fallbacks++; /* for SVGA_QUERY_NUM_FALLBACKS */ if (!needed_swtnl) { /* * We're switching from HW to SW TNL. 
SW TNL will require mapping all diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c index 7081e5a1c43..8b9818334ca 100644 --- a/src/gallium/drivers/svga/svga_pipe_query.c +++ b/src/gallium/drivers/svga/svga_pipe_query.c @@ -720,9 +720,17 @@ svga_create_query(struct pipe_context *pipe, define_query_vgpu10(svga, sq, sizeof(SVGADXTimestampQueryResult)); break; - case SVGA_QUERY_DRAW_CALLS: - case SVGA_QUERY_FALLBACKS: + case SVGA_QUERY_NUM_DRAW_CALLS: + case SVGA_QUERY_NUM_FALLBACKS: + case SVGA_QUERY_NUM_FLUSHES: case SVGA_QUERY_MEMORY_USED: + case SVGA_QUERY_NUM_SHADERS: + case SVGA_QUERY_NUM_RESOURCES: + case SVGA_QUERY_NUM_STATE_OBJECTS: + case SVGA_QUERY_NUM_VALIDATIONS: + case SVGA_QUERY_MAP_BUFFER_TIME: + case SVGA_QUERY_NUM_SURFACE_VIEWS: + case SVGA_QUERY_NUM_RESOURCES_MAPPED: break; default: assert(!"unexpected query type in svga_create_query()"); @@ -778,9 +786,17 @@ svga_destroy_query(struct pipe_context *pipe, struct pipe_query *q) destroy_query_vgpu10(svga, sq); sws->fence_reference(sws, &sq->fence, NULL); break; - case SVGA_QUERY_DRAW_CALLS: - case SVGA_QUERY_FALLBACKS: + case SVGA_QUERY_NUM_DRAW_CALLS: + case SVGA_QUERY_NUM_FALLBACKS: + case SVGA_QUERY_NUM_FLUSHES: case SVGA_QUERY_MEMORY_USED: + case SVGA_QUERY_NUM_SHADERS: + case SVGA_QUERY_NUM_RESOURCES: + case SVGA_QUERY_NUM_STATE_OBJECTS: + case SVGA_QUERY_NUM_VALIDATIONS: + case SVGA_QUERY_MAP_BUFFER_TIME: + case SVGA_QUERY_NUM_SURFACE_VIEWS: + case SVGA_QUERY_NUM_RESOURCES_MAPPED: /* nothing */ break; default: @@ -842,13 +858,29 @@ svga_begin_query(struct pipe_context *pipe, struct pipe_query *q) ret = begin_query_vgpu10(svga, sq); assert(ret == PIPE_OK); break; - case SVGA_QUERY_DRAW_CALLS: - sq->begin_count = svga->num_draw_calls; + case SVGA_QUERY_NUM_DRAW_CALLS: + sq->begin_count = svga->hud.num_draw_calls; break; - case SVGA_QUERY_FALLBACKS: - sq->begin_count = svga->num_fallbacks; + case SVGA_QUERY_NUM_FALLBACKS: + sq->begin_count = 
svga->hud.num_fallbacks; + break; + case SVGA_QUERY_NUM_FLUSHES: + sq->begin_count = svga->hud.num_flushes; + break; + case SVGA_QUERY_NUM_VALIDATIONS: + sq->begin_count = svga->hud.num_validations; + break; + case SVGA_QUERY_MAP_BUFFER_TIME: + sq->begin_count = svga->hud.map_buffer_time; + break; + case SVGA_QUERY_NUM_RESOURCES_MAPPED: + sq->begin_count = svga->hud.num_resources_mapped; break; case SVGA_QUERY_MEMORY_USED: + case SVGA_QUERY_NUM_SHADERS: + case SVGA_QUERY_NUM_RESOURCES: + case SVGA_QUERY_NUM_STATE_OBJECTS: + case SVGA_QUERY_NUM_SURFACE_VIEWS: /* nothing */ break; default: @@ -916,13 +948,29 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q) ret = end_query_vgpu10(svga, sq); assert(ret == PIPE_OK); break; - case SVGA_QUERY_DRAW_CALLS: - sq->end_count = svga->num_draw_calls; + case SVGA_QUERY_NUM_DRAW_CALLS: + sq->end_count = svga->hud.num_draw_calls; + break; + case SVGA_QUERY_NUM_FALLBACKS: + sq->end_count = svga->hud.num_fallbacks; + break; + case SVGA_QUERY_NUM_FLUSHES: + sq->end_count = svga->hud.num_flushes; break; - case SVGA_QUERY_FALLBACKS: - sq->end_count = svga->num_fallbacks; + case SVGA_QUERY_NUM_VALIDATIONS: + sq->end_count = svga->hud.num_validations; + break; + case SVGA_QUERY_MAP_BUFFER_TIME: + sq->end_count = svga->hud.map_buffer_time; + break; + case SVGA_QUERY_NUM_RESOURCES_MAPPED: + sq->end_count = svga->hud.num_resources_mapped; break; case SVGA_QUERY_MEMORY_USED: + case SVGA_QUERY_NUM_SHADERS: + case SVGA_QUERY_NUM_RESOURCES: + case SVGA_QUERY_NUM_STATE_OBJECTS: + case SVGA_QUERY_NUM_SURFACE_VIEWS: /* nothing */ break; default: @@ -1007,13 +1055,30 @@ svga_get_query_result(struct pipe_context *pipe, *result = (uint64_t)sResult.numPrimitivesWritten; break; } - case SVGA_QUERY_DRAW_CALLS: - /* fall-through */ - case SVGA_QUERY_FALLBACKS: + /* These are per-frame counters */ + case SVGA_QUERY_NUM_DRAW_CALLS: + case SVGA_QUERY_NUM_FALLBACKS: + case SVGA_QUERY_NUM_FLUSHES: + case SVGA_QUERY_NUM_VALIDATIONS: + case 
SVGA_QUERY_NUM_RESOURCES_MAPPED: + case SVGA_QUERY_MAP_BUFFER_TIME: vresult->u64 = sq->end_count - sq->begin_count; break; + /* These are running total counters */ case SVGA_QUERY_MEMORY_USED: - vresult->u64 = svgascreen->total_resource_bytes; + vresult->u64 = svgascreen->hud.total_resource_bytes; + break; + case SVGA_QUERY_NUM_SHADERS: + vresult->u64 = svga->hud.num_shaders; + break; + case SVGA_QUERY_NUM_RESOURCES: + vresult->u64 = svgascreen->hud.num_resources; + break; + case SVGA_QUERY_NUM_STATE_OBJECTS: + vresult->u64 = svga->hud.num_state_objects; + break; + case SVGA_QUERY_NUM_SURFACE_VIEWS: + vresult->u64 = svga->hud.num_surface_views; break; default: assert(!"unexpected query type in svga_get_query_result"); diff --git a/src/gallium/drivers/svga/svga_pipe_rasterizer.c b/src/gallium/drivers/svga/svga_pipe_rasterizer.c index a7aadac0111..6310b7a5e86 100644 --- a/src/gallium/drivers/svga/svga_pipe_rasterizer.c +++ b/src/gallium/drivers/svga/svga_pipe_rasterizer.c @@ -352,6 +352,8 @@ svga_create_rasterizer_state(struct pipe_context *pipe, define_rasterizer_object(svga, rast); } + svga->hud.num_state_objects++; + return rast; } @@ -392,6 +394,7 @@ svga_delete_rasterizer_state(struct pipe_context *pipe, void *state) } FREE(state); + svga->hud.num_state_objects--; } diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c index 60e2d44ace4..95241176510 100644 --- a/src/gallium/drivers/svga/svga_pipe_sampler.c +++ b/src/gallium/drivers/svga/svga_pipe_sampler.c @@ -273,6 +273,8 @@ svga_create_sampler_state(struct pipe_context *pipe, cso->min_lod, cso->view_min_lod, cso->view_max_lod, cso->mipfilter == SVGA3D_TEX_FILTER_NONE ? 
"SVGA3D_TEX_FILTER_NONE" : "SOMETHING"); + svga->hud.num_state_objects++; + return cso; } @@ -328,6 +330,7 @@ static void svga_delete_sampler_state(struct pipe_context *pipe, } FREE(sampler); + svga->hud.num_state_objects--; } diff --git a/src/gallium/drivers/svga/svga_pipe_vertex.c b/src/gallium/drivers/svga/svga_pipe_vertex.c index e0932a9dbc1..b932c568f53 100644 --- a/src/gallium/drivers/svga/svga_pipe_vertex.c +++ b/src/gallium/drivers/svga/svga_pipe_vertex.c @@ -274,6 +274,9 @@ svga_create_vertex_elements_state(struct pipe_context *pipe, translate_vertex_decls(svga, velems); } } + + svga->hud.num_state_objects++; + return velems; } @@ -315,6 +318,7 @@ svga_delete_vertex_elements_state(struct pipe_context *pipe, void *state) } FREE(velems); + svga->hud.num_state_objects--; } void svga_cleanup_vertex_state( struct svga_context *svga ) diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c index 57e37fcfe14..71f2f4f2779 100644 --- a/src/gallium/drivers/svga/svga_resource_buffer.c +++ b/src/gallium/drivers/svga/svga_resource_buffer.c @@ -29,6 +29,7 @@ #include "pipe/p_defines.h" #include "util/u_inlines.h" #include "os/os_thread.h" +#include "os/os_time.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_resource.h" @@ -77,6 +78,7 @@ svga_buffer_transfer_map(struct pipe_context *pipe, struct svga_buffer *sbuf = svga_buffer(resource); struct pipe_transfer *transfer; uint8_t *map; + int64_t begin = os_time_get(); transfer = CALLOC_STRUCT(pipe_transfer); if (transfer == NULL) { @@ -244,6 +246,8 @@ svga_buffer_transfer_map(struct pipe_context *pipe, FREE(transfer); } + svga->hud.map_buffer_time += (os_time_get() - begin); + return map; } @@ -331,7 +335,10 @@ svga_buffer_destroy( struct pipe_screen *screen, if (sbuf->swbuf && !sbuf->user) align_free(sbuf->swbuf); - ss->total_resource_bytes -= sbuf->size; + ss->hud.total_resource_bytes -= sbuf->size; + assert(ss->hud.num_resources > 0); + if 
(ss->hud.num_resources > 0) + ss->hud.num_resources--; FREE(sbuf); } @@ -409,7 +416,9 @@ svga_buffer_create(struct pipe_screen *screen, (debug_reference_descriptor)debug_describe_resource, 0); sbuf->size = util_resource_size(&sbuf->b.b); - ss->total_resource_bytes += sbuf->size; + ss->hud.total_resource_bytes += sbuf->size; + + ss->hud.num_resources++; return &sbuf->b.b; @@ -427,6 +436,7 @@ svga_user_buffer_create(struct pipe_screen *screen, unsigned bind) { struct svga_buffer *sbuf; + struct svga_screen *ss = svga_screen(screen); sbuf = CALLOC_STRUCT(svga_buffer); if (!sbuf) @@ -450,6 +460,8 @@ svga_user_buffer_create(struct pipe_screen *screen, debug_reference(&sbuf->b.b.reference, (debug_reference_descriptor)debug_describe_resource, 0); + ss->hud.num_resources++; + return &sbuf->b.b; no_sbuf: diff --git a/src/gallium/drivers/svga/svga_resource_buffer.h b/src/gallium/drivers/svga/svga_resource_buffer.h index 75e12c3220c..0591f8960b9 100644 --- a/src/gallium/drivers/svga/svga_resource_buffer.h +++ b/src/gallium/drivers/svga/svga_resource_buffer.h @@ -253,6 +253,9 @@ svga_buffer_hw_storage_map(struct svga_context *svga, unsigned flags, boolean *retry) { struct svga_winsys_screen *sws = svga_buffer_winsys_screen(sbuf); + + svga->hud.num_resources_mapped++; + if (sws->have_gb_objects) { return svga->swc->surface_map(svga->swc, sbuf->handle, flags, retry); } else { diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c index 90787be8073..a02d1e495ff 100644 --- a/src/gallium/drivers/svga/svga_resource_texture.c +++ b/src/gallium/drivers/svga/svga_resource_texture.c @@ -29,6 +29,7 @@ #include "pipe/p_state.h" #include "pipe/p_defines.h" #include "os/os_thread.h" +#include "os/os_time.h" #include "util/u_format.h" #include "util/u_inlines.h" #include "util/u_math.h" @@ -229,11 +230,15 @@ svga_texture_destroy(struct pipe_screen *screen, SVGA_DBG(DEBUG_DMA, "unref sid %p (texture)\n", tex->handle); 
svga_screen_surface_destroy(ss, &tex->key, &tex->handle); - ss->total_resource_bytes -= tex->size; + ss->hud.total_resource_bytes -= tex->size; FREE(tex->defined); FREE(tex->rendered_to); FREE(tex); + + assert(ss->hud.num_resources > 0); + if (ss->hud.num_resources > 0) + ss->hud.num_resources--; } @@ -322,6 +327,8 @@ svga_texture_transfer_map(struct pipe_context *pipe, boolean use_direct_map = svga_have_gb_objects(svga) && !svga_have_gb_dma(svga); unsigned d; + void *returnVal; + int64_t begin = os_time_get(); /* We can't map texture storage directly unless we have GB objects */ if (usage & PIPE_TRANSFER_MAP_DIRECTLY) { @@ -464,10 +471,10 @@ svga_texture_transfer_map(struct pipe_context *pipe, * Begin mapping code */ if (st->swbuf) { - return st->swbuf; + returnVal = st->swbuf; } else if (!st->use_direct_map) { - return sws->buffer_map(sws, st->hwbuf, usage); + returnVal = sws->buffer_map(sws, st->hwbuf, usage); } else { SVGA3dSize baseLevelSize; @@ -518,9 +525,13 @@ svga_texture_transfer_map(struct pipe_context *pipe, offset += svga3dsurface_get_pixel_offset(tex->key.format, mip_width, mip_height, xoffset, yoffset, zoffset); - - return (void *) (map + offset); + returnVal = (void *) (map + offset); } + + svga->hud.map_buffer_time += (os_time_get() - begin); + svga->hud.num_resources_mapped++; + + return returnVal; } @@ -889,7 +900,8 @@ svga_texture_create(struct pipe_screen *screen, (debug_reference_descriptor)debug_describe_resource, 0); tex->size = util_resource_size(template); - svgascreen->total_resource_bytes += tex->size; + svgascreen->hud.total_resource_bytes += tex->size; + svgascreen->hud.num_resources++; return &tex->b.b; } @@ -901,6 +913,7 @@ svga_texture_from_handle(struct pipe_screen *screen, struct winsys_handle *whandle) { struct svga_winsys_screen *sws = svga_winsys_screen(screen); + struct svga_screen *ss = svga_screen(screen); struct svga_winsys_surface *srf; struct svga_texture *tex; enum SVGA3dSurfaceFormat format = 0; @@ -970,5 +983,7 @@ 
svga_texture_from_handle(struct pipe_screen *screen, tex->rendered_to = CALLOC(1, sizeof(tex->rendered_to[0])); tex->imported = TRUE; + ss->hud.num_resources++; + return &tex->b.b; } diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index e0a28788238..dab89814334 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -772,9 +772,22 @@ svga_get_driver_query_info(struct pipe_screen *screen, struct pipe_driver_query_info *info) { static const struct pipe_driver_query_info queries[] = { - {"draw-calls", SVGA_QUERY_DRAW_CALLS, {0}}, - {"fallbacks", SVGA_QUERY_FALLBACKS, {0}}, - {"memory-used", SVGA_QUERY_MEMORY_USED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES} + /* per-frame counters */ + {"num-draw-calls", SVGA_QUERY_NUM_DRAW_CALLS, {0}}, + {"num-fallbacks", SVGA_QUERY_NUM_FALLBACKS, {0}}, + {"num-flushes", SVGA_QUERY_NUM_FLUSHES, {0}}, + {"num-validations", SVGA_QUERY_NUM_VALIDATIONS, {0}}, + {"map-buffer-time", SVGA_QUERY_MAP_BUFFER_TIME, {0}, + PIPE_DRIVER_QUERY_TYPE_MICROSECONDS}, + {"num-resources-mapped", SVGA_QUERY_NUM_RESOURCES_MAPPED, {0}}, + + /* running total counters */ + {"memory-used", SVGA_QUERY_MEMORY_USED, {0}, + PIPE_DRIVER_QUERY_TYPE_BYTES}, + {"num-shaders", SVGA_QUERY_NUM_SHADERS, {0}}, + {"num-resources", SVGA_QUERY_NUM_RESOURCES, {0}}, + {"num-state-objects", SVGA_QUERY_NUM_STATE_OBJECTS, {0}}, + {"num-surface-views", SVGA_QUERY_NUM_SURFACE_VIEWS, {0}}, }; if (!info) diff --git a/src/gallium/drivers/svga/svga_screen.h b/src/gallium/drivers/svga/svga_screen.h index 5581d2e1ffd..98b56b2a6d1 100644 --- a/src/gallium/drivers/svga/svga_screen.h +++ b/src/gallium/drivers/svga/svga_screen.h @@ -80,8 +80,12 @@ struct svga_screen struct svga_host_surface_cache cache; - /** Memory used by all resources (buffers and surfaces) */ - uint64_t total_resource_bytes; + /** HUD counters */ + struct { + /** Memory used by all resources (buffers and surfaces) */ + uint64_t 
total_resource_bytes; + uint64_t num_resources; + } hud; }; #ifndef DEBUG diff --git a/src/gallium/drivers/svga/svga_shader.c b/src/gallium/drivers/svga/svga_shader.c index d46e7ebbc38..5c99e16d976 100644 --- a/src/gallium/drivers/svga/svga_shader.c +++ b/src/gallium/drivers/svga/svga_shader.c @@ -414,6 +414,14 @@ svga_set_shader(struct svga_context *svga, } +struct svga_shader_variant * +svga_new_shader_variant(struct svga_context *svga) +{ + svga->hud.num_shaders++; + return CALLOC_STRUCT(svga_shader_variant); +} + + enum pipe_error svga_destroy_shader_variant(struct svga_context *svga, SVGA3dShaderType type, @@ -455,6 +463,8 @@ svga_destroy_shader_variant(struct svga_context *svga, FREE((unsigned *)variant->tokens); FREE(variant); + svga->hud.num_shaders--; + return ret; } diff --git a/src/gallium/drivers/svga/svga_shader.h b/src/gallium/drivers/svga/svga_shader.h index b0800c1ecad..efcac408626 100644 --- a/src/gallium/drivers/svga/svga_shader.h +++ b/src/gallium/drivers/svga/svga_shader.h @@ -273,6 +273,9 @@ svga_set_shader(struct svga_context *svga, SVGA3dShaderType type, struct svga_shader_variant *variant); +struct svga_shader_variant * +svga_new_shader_variant(struct svga_context *svga); + enum pipe_error svga_destroy_shader_variant(struct svga_context *svga, SVGA3dShaderType type, diff --git a/src/gallium/drivers/svga/svga_state.c b/src/gallium/drivers/svga/svga_state.c index 37d16dc9afe..722b369fd4b 100644 --- a/src/gallium/drivers/svga/svga_state.c +++ b/src/gallium/drivers/svga/svga_state.c @@ -225,6 +225,9 @@ svga_update_state(struct svga_context *svga, unsigned max_level) svga->state.dirty[i] |= svga->dirty; svga->dirty = 0; + + svga->hud.num_validations++; + return PIPE_OK; } diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c index 75592d3bf8b..c93d2a5e565 100644 --- a/src/gallium/drivers/svga/svga_state_constants.c +++ b/src/gallium/drivers/svga/svga_state_constants.c @@ -718,7 +718,7 @@ 
emit_consts_vgpu10(struct svga_context *svga, unsigned shader) /* round down to mulitple of 16 (this may cause rendering problems * but should avoid a device error). */ - size &= ~16; + size &= ~15; } } diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c index c244d5352d9..e392778c2fb 100644 --- a/src/gallium/drivers/svga/svga_state_fs.c +++ b/src/gallium/drivers/svga/svga_state_fs.c @@ -90,7 +90,8 @@ translate_fragment_program(struct svga_context *svga, PIPE_SHADER_FRAGMENT); } else { - return svga_tgsi_vgpu9_translate(&fs->base, key, PIPE_SHADER_FRAGMENT); + return svga_tgsi_vgpu9_translate(svga, &fs->base, key, + PIPE_SHADER_FRAGMENT); } } diff --git a/src/gallium/drivers/svga/svga_state_gs.c b/src/gallium/drivers/svga/svga_state_gs.c index 7f75410fb57..0b336baee86 100644 --- a/src/gallium/drivers/svga/svga_state_gs.c +++ b/src/gallium/drivers/svga/svga_state_gs.c @@ -53,13 +53,9 @@ translate_geometry_program(struct svga_context *svga, const struct svga_geometry_shader *gs, const struct svga_compile_key *key) { - if (svga_have_vgpu10(svga)) { - return svga_tgsi_vgpu10_translate(svga, &gs->base, key, - PIPE_SHADER_GEOMETRY); - } - else { - return svga_tgsi_vgpu9_translate(&gs->base, key, PIPE_SHADER_GEOMETRY); - } + assert(svga_have_vgpu10(svga)); + return svga_tgsi_vgpu10_translate(svga, &gs->base, key, + PIPE_SHADER_GEOMETRY); } diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c index a846b779e70..24574c1bf85 100644 --- a/src/gallium/drivers/svga/svga_state_vs.c +++ b/src/gallium/drivers/svga/svga_state_vs.c @@ -81,7 +81,8 @@ translate_vertex_program(struct svga_context *svga, PIPE_SHADER_VERTEX); } else { - return svga_tgsi_vgpu9_translate(&vs->base, key, PIPE_SHADER_VERTEX); + return svga_tgsi_vgpu9_translate(svga, &vs->base, key, + PIPE_SHADER_VERTEX); } } diff --git a/src/gallium/drivers/svga/svga_surface.c b/src/gallium/drivers/svga/svga_surface.c index 
aca5abcdfce..9f09311116e 100644 --- a/src/gallium/drivers/svga/svga_surface.c +++ b/src/gallium/drivers/svga/svga_surface.c @@ -317,6 +317,8 @@ svga_create_surface_view(struct pipe_context *pipe, s->real_level = surf_tmpl->u.tex.level; } + svga->hud.num_surface_views++; + return &s->base; } @@ -509,6 +511,8 @@ svga_surface_destroy(struct pipe_context *pipe, pipe_resource_reference(&surf->texture, NULL); FREE(surf); + + svga->hud.num_surface_views--; } diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c index 9a6fb465ccb..202eee276b7 100644 --- a/src/gallium/drivers/svga/svga_tgsi.c +++ b/src/gallium/drivers/svga/svga_tgsi.c @@ -175,7 +175,8 @@ svga_shader_emit_header(struct svga_shader_emitter *emit) * it is, it will be copied to a hardware buffer for upload. */ struct svga_shader_variant * -svga_tgsi_vgpu9_translate(const struct svga_shader *shader, +svga_tgsi_vgpu9_translate(struct svga_context *svga, + const struct svga_shader *shader, const struct svga_compile_key *key, unsigned unit) { struct svga_shader_variant *variant = NULL; @@ -227,7 +228,7 @@ svga_tgsi_vgpu9_translate(const struct svga_shader *shader, goto fail; } - variant = CALLOC_STRUCT(svga_shader_variant); + variant = svga_new_shader_variant(svga); if (variant == NULL) goto fail; diff --git a/src/gallium/drivers/svga/svga_tgsi.h b/src/gallium/drivers/svga/svga_tgsi.h index 207a3f0a845..2581135701f 100644 --- a/src/gallium/drivers/svga/svga_tgsi.h +++ b/src/gallium/drivers/svga/svga_tgsi.h @@ -63,7 +63,8 @@ static inline void svga_generate_vdecl_semantics( unsigned idx, struct svga_shader_variant * -svga_tgsi_vgpu9_translate(const struct svga_shader *shader, +svga_tgsi_vgpu9_translate(struct svga_context *svga, + const struct svga_shader *shader, const struct svga_compile_key *key, unsigned unit); struct svga_shader_variant * diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c index e4f027b9567..d62f2bbcc96 100644 
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c +++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c @@ -6735,7 +6735,7 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga, /* * Create, initialize the 'variant' object. */ - variant = CALLOC_STRUCT(svga_shader_variant); + variant = svga_new_shader_variant(svga); if (!variant) goto cleanup; diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c index 022240df84f..b37a9714437 100644 --- a/src/gallium/state_trackers/va/image.c +++ b/src/gallium/state_trackers/va/image.c @@ -116,7 +116,7 @@ vlVaCreateImage(VADriverContextP ctx, VAImageFormat *format, int width, int heig img->width = width; img->height = height; w = align(width, 2); - h = align(width, 2); + h = align(height, 2); switch (format->fourcc) { case VA_FOURCC('N','V','1','2'): @@ -240,9 +240,11 @@ vlVaGetImage(VADriverContextP ctx, VASurfaceID surface, int x, int y, return VA_STATUS_ERROR_OPERATION_FAILED; if (format != surf->buffer->buffer_format) { - /* support NV12 to YV12 conversion now only */ - if (format == PIPE_FORMAT_YV12 && - surf->buffer->buffer_format == PIPE_FORMAT_NV12) + /* support NV12 to YV12 and IYUV conversion now only */ + if ((format == PIPE_FORMAT_YV12 && + surf->buffer->buffer_format == PIPE_FORMAT_NV12) || + (format == PIPE_FORMAT_IYUV && + surf->buffer->buffer_format == PIPE_FORMAT_NV12)) convert = true; else return VA_STATUS_ERROR_OPERATION_FAILED; diff --git a/src/gallium/targets/d3dadapter9/Makefile.am b/src/gallium/targets/d3dadapter9/Makefile.am index e26ca33a521..b5221472ef0 100644 --- a/src/gallium/targets/d3dadapter9/Makefile.am +++ b/src/gallium/targets/d3dadapter9/Makefile.am @@ -76,7 +76,6 @@ d3dadapter9_la_LIBADD = \ $(top_builddir)/src/gallium/auxiliary/libgalliumvl_stub.la \ $(top_builddir)/src/gallium/auxiliary/libgallium.la \ $(top_builddir)/src/glsl/libnir.la \ - $(top_builddir)/src/libglsl_util.la \ $(top_builddir)/src/gallium/state_trackers/nine/libninetracker.la \ 
$(top_builddir)/src/util/libmesautil.la \ $(top_builddir)/src/gallium/winsys/sw/wrapper/libwsw.la \ diff --git a/src/gallium/targets/pipe-loader/Makefile.am b/src/gallium/targets/pipe-loader/Makefile.am index 4d9f7be2ec9..4f25b4f6073 100644 --- a/src/gallium/targets/pipe-loader/Makefile.am +++ b/src/gallium/targets/pipe-loader/Makefile.am @@ -53,7 +53,6 @@ endif PIPE_LIBS += \ $(top_builddir)/src/gallium/auxiliary/libgallium.la \ $(top_builddir)/src/glsl/libnir.la \ - $(top_builddir)/src/libglsl_util.la \ $(top_builddir)/src/util/libmesautil.la \ $(top_builddir)/src/gallium/drivers/rbug/librbug.la \ $(top_builddir)/src/gallium/drivers/trace/libtrace.la \ diff --git a/src/gallium/targets/xa/Makefile.am b/src/gallium/targets/xa/Makefile.am index 92173dedce3..02c42c665ed 100644 --- a/src/gallium/targets/xa/Makefile.am +++ b/src/gallium/targets/xa/Makefile.am @@ -38,7 +38,6 @@ libxatracker_la_LIBADD = \ $(top_builddir)/src/gallium/auxiliary/libgalliumvl_stub.la \ $(top_builddir)/src/gallium/auxiliary/libgallium.la \ $(top_builddir)/src/glsl/libnir.la \ - $(top_builddir)/src/libglsl_util.la \ $(top_builddir)/src/util/libmesautil.la \ $(LIBDRM_LIBS) \ $(GALLIUM_COMMON_LIB_DEPS) diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am index 08368311b8a..8b0a73b250a 100644 --- a/src/glsl/Makefile.am +++ b/src/glsl/Makefile.am @@ -148,9 +148,6 @@ libglsl_la_SOURCES = \ libnir_la_SOURCES = \ - glsl_types.cpp \ - builtin_types.cpp \ - glsl_symbol_table.cpp \ $(NIR_FILES) \ $(NIR_GENERATED_FILES) @@ -160,6 +157,7 @@ glsl_compiler_SOURCES = \ glsl_compiler_LDADD = \ libglsl.la \ $(top_builddir)/src/libglsl_util.la \ + $(top_builddir)/src/util/libmesautil.la \ $(PTHREAD_LIBS) spirv2nir_SOURCES = \ @@ -284,6 +282,5 @@ nir_tests_control_flow_tests_CFLAGS = \ nir_tests_control_flow_tests_LDADD = \ $(top_builddir)/src/gtest/libgtest.la \ $(top_builddir)/src/glsl/libnir.la \ - $(top_builddir)/src/libglsl_util.la \ $(top_builddir)/src/util/libmesautil.la \ $(PTHREAD_LIBS) diff --git 
a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources index 65a26268c2e..47dc628101d 100644 --- a/src/glsl/Makefile.sources +++ b/src/glsl/Makefile.sources @@ -20,6 +20,8 @@ NIR_GENERATED_FILES = \ NIR_FILES = \ nir/glsl_to_nir.cpp \ nir/glsl_to_nir.h \ + nir/glsl_types.cpp \ + nir/glsl_types.h \ nir/nir.c \ nir/nir.h \ nir/nir_array.h \ @@ -33,6 +35,8 @@ NIR_FILES = \ nir/nir_gs_count_vertices.c \ nir/nir_intrinsics.c \ nir/nir_intrinsics.h \ + nir/nir_instr_set.c \ + nir/nir_instr_set.h \ nir/nir_live_variables.c \ nir/nir_lower_alu_to_scalar.c \ nir/nir_lower_atomics.c \ @@ -81,6 +85,8 @@ NIR_FILES = \ nir/nir_worklist.c \ nir/nir_worklist.h \ nir/nir_types.cpp \ + nir/shader_enums.h \ + nir/shader_enums.c \ nir/spirv_to_nir.c \ nir/spirv_glsl450_to_nir.c @@ -103,8 +109,6 @@ LIBGLSL_FILES = \ glsl_parser_extras.h \ glsl_symbol_table.cpp \ glsl_symbol_table.h \ - glsl_types.cpp \ - glsl_types.h \ hir_field_selection.cpp \ ir_basic_block.cpp \ ir_basic_block.h \ @@ -206,8 +210,7 @@ LIBGLSL_FILES = \ opt_vectorize.cpp \ program.h \ s_expression.cpp \ - s_expression.h \ - shader_enums.h + s_expression.h # glsl_compiler diff --git a/src/glsl/SConscript b/src/glsl/SConscript index 89c603580a5..70bf5b09c3c 100644 --- a/src/glsl/SConscript +++ b/src/glsl/SConscript @@ -16,6 +16,7 @@ env.Prepend(CPPPATH = [ '#src/gallium/include', '#src/gallium/auxiliary', '#src/glsl', + '#src/glsl/nir', '#src/glsl/glcpp', ]) @@ -60,6 +61,12 @@ source_lists = env.ParseSourceList('Makefile.sources') for l in ('LIBGLCPP_FILES', 'LIBGLSL_FILES'): glsl_sources += source_lists[l] +# add nir/glsl_types.cpp manually, because SCons still doesn't know about NIR. +# XXX: Remove this once we build NIR and NIR_FILES. 
+glsl_sources += [ + 'nir/glsl_types.cpp', +] + if env['msvc']: env.Prepend(CPPPATH = ['#/src/getopt']) env.PrependUnique(LIBS = [getopt]) diff --git a/src/glsl/ast.h b/src/glsl/ast.h index 4c314366133..e803e6d7675 100644 --- a/src/glsl/ast.h +++ b/src/glsl/ast.h @@ -62,6 +62,8 @@ public: virtual ir_rvalue *hir(exec_list *instructions, struct _mesa_glsl_parse_state *state); + virtual bool has_sequence_subexpression() const; + /** * Retrieve the source location of an AST node * @@ -181,6 +183,7 @@ enum ast_operators { ast_post_dec, ast_field_selection, ast_array_index, + ast_unsized_array_dim, ast_function_call, @@ -221,6 +224,8 @@ public: virtual void hir_no_rvalue(exec_list *instructions, struct _mesa_glsl_parse_state *state); + virtual bool has_sequence_subexpression() const; + ir_rvalue *do_hir(exec_list *instructions, struct _mesa_glsl_parse_state *state, bool needs_rvalue); @@ -299,6 +304,8 @@ public: virtual void hir_no_rvalue(exec_list *instructions, struct _mesa_glsl_parse_state *state); + virtual bool has_sequence_subexpression() const; + private: /** * Is this function call actually a constructor? @@ -318,16 +325,7 @@ public: class ast_array_specifier : public ast_node { public: - /** Unsized array specifier ([]) */ - explicit ast_array_specifier(const struct YYLTYPE &locp) - : is_unsized_array(true) - { - set_location(locp); - } - - /** Sized array specifier ([dim]) */ ast_array_specifier(const struct YYLTYPE &locp, ast_expression *dim) - : is_unsized_array(false) { set_location(locp); array_dimensions.push_tail(&dim->link); @@ -338,13 +336,16 @@ public: array_dimensions.push_tail(&dim->link); } - virtual void print(void) const; + const bool is_single_dimension() + { + return this->array_dimensions.tail_pred->prev != NULL && + this->array_dimensions.tail_pred->prev->is_head_sentinel(); + } - /* If true, this means that the array has an unsized outermost dimension. 
*/ - bool is_unsized_array; + virtual void print(void) const; /* This list contains objects of type ast_node containing the - * sized dimensions only, in outermost-to-innermost order. + * array dimensions in outermost-to-innermost order. */ exec_list array_dimensions; }; diff --git a/src/glsl/ast_array_index.cpp b/src/glsl/ast_array_index.cpp index 5e8f49d70b0..74d403fdb65 100644 --- a/src/glsl/ast_array_index.cpp +++ b/src/glsl/ast_array_index.cpp @@ -28,13 +28,10 @@ void ast_array_specifier::print(void) const { - if (this->is_unsized_array) { - printf("[ ] "); - } - foreach_list_typed (ast_node, array_dimension, link, &this->array_dimensions) { printf("[ "); - array_dimension->print(); + if (((ast_expression*)array_dimension)->oper != ast_unsized_array_dim) + array_dimension->print(); printf("] "); } } @@ -64,21 +61,29 @@ update_max_array_access(ir_rvalue *ir, int idx, YYLTYPE *loc, } } else if (ir_dereference_record *deref_record = ir->as_dereference_record()) { - /* There are two possibilities we need to consider: + /* There are three possibilities we need to consider: * * - Accessing an element of an array that is a member of a named * interface block (e.g. ifc.foo[i]) * * - Accessing an element of an array that is a member of a named * interface block array (e.g. ifc[j].foo[i]). + * + * - Accessing an element of an array that is a member of a named + * interface block array of arrays (e.g. ifc[j][k].foo[i]). 
*/ ir_dereference_variable *deref_var = deref_record->record->as_dereference_variable(); if (deref_var == NULL) { - if (ir_dereference_array *deref_array = - deref_record->record->as_dereference_array()) { - deref_var = deref_array->array->as_dereference_variable(); + ir_dereference_array *deref_array = + deref_record->record->as_dereference_array(); + ir_dereference_array *deref_array_prev = NULL; + while (deref_array != NULL) { + deref_array_prev = deref_array; + deref_array = deref_array->array->as_dereference_array(); } + if (deref_array_prev != NULL) + deref_var = deref_array_prev->array->as_dereference_variable(); } if (deref_var != NULL) { @@ -230,7 +235,7 @@ _mesa_ast_array_index_to_hir(void *mem_ctx, ir_var_shader_storage) { _mesa_glsl_error(&loc, state, "unsized array index must be constant"); } - } else if (array->type->fields.array->is_interface() + } else if (array->type->without_array()->is_interface() && (array->variable_referenced()->data.mode == ir_var_uniform || array->variable_referenced()->data.mode == ir_var_shader_storage) && !state->is_version(400, 0) && !state->ARB_gpu_shader5_enable) { diff --git a/src/glsl/ast_function.cpp b/src/glsl/ast_function.cpp index 26d4c62ce36..c5c5cae333b 100644 --- a/src/glsl/ast_function.cpp +++ b/src/glsl/ast_function.cpp @@ -437,13 +437,54 @@ generate_call(exec_list *instructions, ir_function_signature *sig, } } - /* If the function call is a constant expression, don't generate any - * instructions; just generate an ir_constant. + /* Section 4.3.2 (Const) of the GLSL 1.10.59 spec says: + * + * "Initializers for const declarations must be formed from literal + * values, other const variables (not including function call + * paramaters), or expressions of these. + * + * Constructors may be used in such expressions, but function calls may + * not." + * + * Section 4.3.3 (Constant Expressions) of the GLSL 1.20.8 spec says: + * + * "A constant expression is one of + * + * ... 
+ * + * - a built-in function call whose arguments are all constant + * expressions, with the exception of the texture lookup + * functions, the noise functions, and ftransform. The built-in + * functions dFdx, dFdy, and fwidth must return 0 when evaluated + * inside an initializer with an argument that is a constant + * expression." + * + * Section 5.10 (Constant Expressions) of the GLSL ES 1.00.17 spec says: + * + * "A constant expression is one of * - * Function calls were first allowed to be constant expressions in GLSL - * 1.20 and GLSL ES 3.00. + * ... + * + * - a built-in function call whose arguments are all constant + * expressions, with the exception of the texture lookup + * functions." + * + * Section 4.3.3 (Constant Expressions) of the GLSL ES 3.00.4 spec says: + * + * "A constant expression is one of + * + * ... + * + * - a built-in function call whose arguments are all constant + * expressions, with the exception of the texture lookup + * functions. The built-in functions dFdx, dFdy, and fwidth must + * return 0 when evaluated inside an initializer with an argument + * that is a constant expression." + * + * If the function call is a constant expression, don't generate any + * instructions; just generate an ir_constant. */ - if (state->is_version(120, 300)) { + if (state->is_version(120, 100)) { ir_constant *value = sig->constant_expression_value(actual_parameters, NULL); if (value != NULL) { return value; @@ -950,6 +991,7 @@ process_array_constructor(exec_list *instructions, } bool all_parameters_are_constant = true; + const glsl_type *element_type = constructor_type->fields.array; /* Type cast each parameter and, if possible, fold constants. 
*/ foreach_in_list_safe(ir_rvalue, ir, &actual_parameters) { @@ -976,12 +1018,34 @@ process_array_constructor(exec_list *instructions, } } - if (result->type != constructor_type->fields.array) { + if (constructor_type->fields.array->is_unsized_array()) { + /* As the inner parameters of the constructor are created without + * knowledge of each other we need to check to make sure unsized + * parameters of unsized constructors all end up with the same size. + * + * e.g. we make sure to fail for a constructor like this: + * vec4[][] a = vec4[][](vec4[](vec4(0.0), vec4(1.0)), + * vec4[](vec4(0.0), vec4(1.0), vec4(1.0)), + * vec4[](vec4(0.0), vec4(1.0))); + */ + if (element_type->is_unsized_array()) { + /* This is the first parameter so just get the type */ + element_type = result->type; + } else if (element_type != result->type) { + _mesa_glsl_error(loc, state, "type error in array constructor: " + "expected: %s, found %s", + element_type->name, + result->type->name); + return ir_rvalue::error_value(ctx); + } + } else if (result->type != constructor_type->fields.array) { _mesa_glsl_error(loc, state, "type error in array constructor: " "expected: %s, found %s", constructor_type->fields.array->name, result->type->name); return ir_rvalue::error_value(ctx); + } else { + element_type = result->type; } /* Attempt to convert the parameter to a constant valued expression. 
@@ -998,6 +1062,14 @@ process_array_constructor(exec_list *instructions, ir->replace_with(result); } + if (constructor_type->fields.array->is_unsized_array()) { + constructor_type = + glsl_type::get_array_instance(element_type, + parameter_count); + assert(constructor_type != NULL); + assert(constructor_type->length == parameter_count); + } + if (all_parameters_are_constant) return new(ctx) ir_constant(constructor_type, &actual_parameters); @@ -1958,6 +2030,17 @@ ast_function_expression::hir(exec_list *instructions, unreachable("not reached"); } +bool +ast_function_expression::has_sequence_subexpression() const +{ + foreach_list_typed(const ast_node, ast, link, &this->expressions) { + if (ast->has_sequence_subexpression()) + return true; + } + + return false; +} + ir_rvalue * ast_aggregate_initializer::hir(exec_list *instructions, struct _mesa_glsl_parse_state *state) diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index f38ca84d129..0c11ec58d20 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -782,8 +782,30 @@ validate_assignment(struct _mesa_glsl_parse_state *state, * Note: Whole-array assignments are not permitted in GLSL 1.10, but this * is handled by ir_dereference::is_lvalue. 
*/ - if (lhs->type->is_unsized_array() && rhs->type->is_array() - && (lhs->type->fields.array == rhs->type->fields.array)) { + const glsl_type *lhs_t = lhs->type; + const glsl_type *rhs_t = rhs->type; + bool unsized_array = false; + while(lhs_t->is_array()) { + if (rhs_t == lhs_t) + break; /* the rest of the inner arrays match so break out early */ + if (!rhs_t->is_array()) { + unsized_array = false; + break; /* number of dimensions mismatch */ + } + if (lhs_t->length == rhs_t->length) { + lhs_t = lhs_t->fields.array; + rhs_t = rhs_t->fields.array; + continue; + } else if (lhs_t->is_unsized_array()) { + unsized_array = true; + } else { + unsized_array = false; + break; /* sized array mismatch */ + } + lhs_t = lhs_t->fields.array; + rhs_t = rhs_t->fields.array; + } + if (unsized_array) { if (is_initializer) { return rhs; } else { @@ -1004,6 +1026,12 @@ ast_node::hir(exec_list *instructions, struct _mesa_glsl_parse_state *state) return NULL; } +bool +ast_node::has_sequence_subexpression() const +{ + return false; +} + void ast_function_expression::hir_no_rvalue(exec_list *instructions, struct _mesa_glsl_parse_state *state) @@ -1805,6 +1833,10 @@ ast_expression::do_hir(exec_list *instructions, break; } + case ast_unsized_array_dim: + assert(!"ast_unsized_array_dim: Should never get here."); + break; + case ast_function_call: /* Should *NEVER* get here. ast_function_call should always be handled * by ast_function_expression::hir. 
@@ -1916,6 +1948,83 @@ ast_expression::do_hir(exec_list *instructions, return result; } +bool +ast_expression::has_sequence_subexpression() const +{ + switch (this->oper) { + case ast_plus: + case ast_neg: + case ast_bit_not: + case ast_logic_not: + case ast_pre_inc: + case ast_pre_dec: + case ast_post_inc: + case ast_post_dec: + return this->subexpressions[0]->has_sequence_subexpression(); + + case ast_assign: + case ast_add: + case ast_sub: + case ast_mul: + case ast_div: + case ast_mod: + case ast_lshift: + case ast_rshift: + case ast_less: + case ast_greater: + case ast_lequal: + case ast_gequal: + case ast_nequal: + case ast_equal: + case ast_bit_and: + case ast_bit_xor: + case ast_bit_or: + case ast_logic_and: + case ast_logic_or: + case ast_logic_xor: + case ast_array_index: + case ast_mul_assign: + case ast_div_assign: + case ast_add_assign: + case ast_sub_assign: + case ast_mod_assign: + case ast_ls_assign: + case ast_rs_assign: + case ast_and_assign: + case ast_xor_assign: + case ast_or_assign: + return this->subexpressions[0]->has_sequence_subexpression() || + this->subexpressions[1]->has_sequence_subexpression(); + + case ast_conditional: + return this->subexpressions[0]->has_sequence_subexpression() || + this->subexpressions[1]->has_sequence_subexpression() || + this->subexpressions[2]->has_sequence_subexpression(); + + case ast_sequence: + return true; + + case ast_field_selection: + case ast_identifier: + case ast_int_constant: + case ast_uint_constant: + case ast_float_constant: + case ast_bool_constant: + case ast_double_constant: + return false; + + case ast_aggregate: + unreachable("ast_aggregate: Should never get here."); + + case ast_function_call: + unreachable("should be handled by ast_function_expression::hir"); + + case ast_unsized_array_dim: + unreachable("ast_unsized_array_dim: Should never get here."); + } + + return false; +} ir_rvalue * ast_expression_statement::hir(exec_list *instructions, @@ -1968,6 +2077,14 @@ 
process_array_size(exec_node *node, exec_list dummy_instructions; ast_node *array_size = exec_node_data(ast_node, node, link); + + /** + * Dimensions other than the outermost dimension can be unsized if they + * are immediately sized by a constructor or initializer. + */ + if (((ast_expression*)array_size)->oper == ast_unsized_array_dim) + return 0; + ir_rvalue *const ir = array_size->hir(& dummy_instructions, state); YYLTYPE loc = array_size->get_location(); @@ -1990,7 +2107,7 @@ process_array_size(exec_node *node, } ir_constant *const size = ir->constant_expression_value(); - if (size == NULL) { + if (size == NULL || array_size->has_sequence_subexpression()) { _mesa_glsl_error(& loc, state, "array size must be a " "constant valued expression"); return 0; @@ -2028,20 +2145,7 @@ process_array_type(YYLTYPE *loc, const glsl_type *base, * * "Only one-dimensional arrays may be declared." */ - if (!state->ARB_arrays_of_arrays_enable) { - _mesa_glsl_error(loc, state, - "invalid array of `%s'" - "GL_ARB_arrays_of_arrays " - "required for defining arrays of arrays", - base->name); - return glsl_type::error_type; - } - - if (base->length == 0) { - _mesa_glsl_error(loc, state, - "only the outermost array dimension can " - "be unsized", - base->name); + if (!state->check_arrays_of_arrays_allowed(loc)) { return glsl_type::error_type; } } @@ -2051,9 +2155,6 @@ process_array_type(YYLTYPE *loc, const glsl_type *base, unsigned array_size = process_array_size(node, state); array_type = glsl_type::get_array_instance(array_type, array_size); } - - if (array_specifier->is_unsized_array) - array_type = glsl_type::get_array_instance(array_type, 0); } return array_type; @@ -2592,6 +2693,25 @@ is_conflicting_fragcoord_redeclaration(struct _mesa_glsl_parse_state *state, return false; } +static inline void +validate_array_dimensions(const glsl_type *t, + struct _mesa_glsl_parse_state *state, + YYLTYPE *loc) { + if (t->is_array()) { + t = t->fields.array; + while (t->is_array()) { + if 
(t->is_unsized_array()) { + _mesa_glsl_error(loc, state, + "only the outermost array dimension can " + "be unsized", + t->name); + break; + } + t = t->fields.array; + } + } +} + static void apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual, ir_variable *var, @@ -3171,7 +3291,8 @@ process_initializer(ir_variable *var, ast_declaration *decl, */ if (var->data.mode == ir_var_uniform) { state->check_version(120, 0, &initializer_loc, - "cannot initialize uniforms"); + "cannot initialize uniform %s", + var->name); } /* Section 4.3.7 "Buffer Variables" of the GLSL 4.30 spec: @@ -3179,8 +3300,9 @@ process_initializer(ir_variable *var, ast_declaration *decl, * "Buffer variables cannot have initializers." */ if (var->data.mode == ir_var_shader_storage) { - _mesa_glsl_error(& initializer_loc, state, - "SSBO variables cannot have initializers"); + _mesa_glsl_error(&initializer_loc, state, + "cannot initialize buffer variable %s", + var->name); } /* From section 4.1.7 of the GLSL 4.40 spec: @@ -3190,16 +3312,25 @@ process_initializer(ir_variable *var, ast_declaration *decl, * shader." */ if (var->type->contains_opaque()) { - _mesa_glsl_error(& initializer_loc, state, - "cannot initialize opaque variable"); + _mesa_glsl_error(&initializer_loc, state, + "cannot initialize opaque variable %s", + var->name); } if ((var->data.mode == ir_var_shader_in) && (state->current_function == NULL)) { - _mesa_glsl_error(& initializer_loc, state, - "cannot initialize %s shader input / %s", - _mesa_shader_stage_to_string(state->stage), - (state->stage == MESA_SHADER_VERTEX) - ? "attribute" : "varying"); + _mesa_glsl_error(&initializer_loc, state, + "cannot initialize %s shader input / %s %s", + _mesa_shader_stage_to_string(state->stage), + (state->stage == MESA_SHADER_VERTEX) + ? 
"attribute" : "varying", + var->name); + } + + if (var->data.mode == ir_var_shader_out && state->current_function == NULL) { + _mesa_glsl_error(&initializer_loc, state, + "cannot initialize %s shader output %s", + _mesa_shader_stage_to_string(state->stage), + var->name); } /* If the initializer is an ast_aggregate_initializer, recursively store @@ -3214,16 +3345,72 @@ process_initializer(ir_variable *var, ast_declaration *decl, /* Calculate the constant value if this is a const or uniform * declaration. + * + * Section 4.3 (Storage Qualifiers) of the GLSL ES 1.00.17 spec says: + * + * "Declarations of globals without a storage qualifier, or with + * just the const qualifier, may include initializers, in which case + * they will be initialized before the first line of main() is + * executed. Such initializers must be a constant expression." + * + * The same section of the GLSL ES 3.00.4 spec has similar language. */ if (type->qualifier.flags.q.constant - || type->qualifier.flags.q.uniform) { + || type->qualifier.flags.q.uniform + || (state->es_shader && state->current_function == NULL)) { ir_rvalue *new_rhs = validate_assignment(state, initializer_loc, lhs, rhs, true); if (new_rhs != NULL) { rhs = new_rhs; + /* Section 4.3.3 (Constant Expressions) of the GLSL ES 3.00.4 spec + * says: + * + * "A constant expression is one of + * + * ... + * + * - an expression formed by an operator on operands that are + * all constant expressions, including getting an element of + * a constant array, or a field of a constant structure, or + * components of a constant vector. However, the sequence + * operator ( , ) and the assignment operators ( =, +=, ...) + * are not included in the operators that can create a + * constant expression." + * + * Section 12.43 (Sequence operator and constant expressions) says: + * + * "Should the following construct be allowed? 
+ * + * float a[2,3]; + * + * The expression within the brackets uses the sequence operator + * (',') and returns the integer 3 so the construct is declaring + * a single-dimensional array of size 3. In some languages, the + * construct declares a two-dimensional array. It would be + * preferable to make this construct illegal to avoid confusion. + * + * One possibility is to change the definition of the sequence + * operator so that it does not return a constant-expression and + * hence cannot be used to declare an array size. + * + * RESOLUTION: The result of a sequence operator is not a + * constant-expression." + * + * Section 4.3.3 (Constant Expressions) of the GLSL 4.30.9 spec + * contains language almost identical to the section 4.3.3 in the + * GLSL ES 3.00.4 spec. This is a new limitation for these GLSL + * versions. + */ ir_constant *constant_value = rhs->constant_expression_value(); - if (!constant_value) { + if (!constant_value || + (state->is_version(430, 300) && + decl->initializer->has_sequence_subexpression())) { + const char *const variable_mode = + (type->qualifier.flags.q.constant) + ? "const" + : ((type->qualifier.flags.q.uniform) ? "uniform" : "global"); + /* If ARB_shading_language_420pack is enabled, initializers of * const-qualified local variables do not have to be constant * expressions. Const-qualified global variables must still be @@ -3234,22 +3421,24 @@ process_initializer(ir_variable *var, ast_declaration *decl, _mesa_glsl_error(& initializer_loc, state, "initializer of %s variable `%s' must be a " "constant expression", - (type->qualifier.flags.q.constant) - ? "const" : "uniform", + variable_mode, decl->identifier); if (var->type->is_numeric()) { /* Reduce cascading errors. */ - var->constant_value = ir_constant::zero(state, var->type); + var->constant_value = type->qualifier.flags.q.constant + ? 
ir_constant::zero(state, var->type) : NULL; } } } else { rhs = constant_value; - var->constant_value = constant_value; + var->constant_value = type->qualifier.flags.q.constant + ? constant_value : NULL; } } else { if (var->type->is_numeric()) { /* Reduce cascading errors. */ - var->constant_value = ir_constant::zero(state, var->type); + var->constant_value = type->qualifier.flags.q.constant + ? ir_constant::zero(state, var->type) : NULL; } } } @@ -4265,6 +4454,8 @@ ast_declarator_list::hir(exec_list *instructions, result = process_initializer((earlier == NULL) ? var : earlier, decl, this->type, &initializer_instructions, state); + } else { + validate_array_dimensions(var_type, state, &loc); } /* From page 23 (page 29 of the PDF) of the GLSL 1.10 spec: @@ -5790,6 +5981,7 @@ ast_process_structure_or_interface_block(exec_list *instructions, const struct glsl_type *field_type = process_array_type(&loc, decl_type, decl->array_specifier, state); + validate_array_dimensions(field_type, state, &loc); fields[i].type = field_type; fields[i].name = decl->identifier; fields[i].location = -1; @@ -6142,7 +6334,8 @@ ast_interface_block::hir(exec_list *instructions, _mesa_shader_stage_to_string(state->stage)); } if (this->instance_name == NULL || - strcmp(this->instance_name, "gl_in") != 0 || this->array_specifier == NULL) { + strcmp(this->instance_name, "gl_in") != 0 || this->array_specifier == NULL || + !this->array_specifier->is_single_dimension()) { _mesa_glsl_error(&loc, state, "gl_PerVertex input must be redeclared as " "gl_in[]"); @@ -6305,6 +6498,9 @@ ast_interface_block::hir(exec_list *instructions, ir_variable *var; if (this->array_specifier != NULL) { + const glsl_type *block_array_type = + process_array_type(&loc, block_type, this->array_specifier, state); + /* Section 4.3.7 (Interface Blocks) of the GLSL 1.50 spec says: * * For uniform blocks declared an array, each individual array @@ -6328,7 +6524,7 @@ ast_interface_block::hir(exec_list *instructions, * tessellation 
control shader output, and tessellation evaluation * shader input. */ - if (this->array_specifier->is_unsized_array) { + if (block_array_type->is_unsized_array()) { bool allow_inputs = state->stage == MESA_SHADER_GEOMETRY || state->stage == MESA_SHADER_TESS_CTRL || state->stage == MESA_SHADER_TESS_EVAL; @@ -6355,9 +6551,6 @@ ast_interface_block::hir(exec_list *instructions, } } - const glsl_type *block_array_type = - process_array_type(&loc, block_type, this->array_specifier, state); - /* From section 4.3.9 (Interface Blocks) of the GLSL ES 3.10 spec: * * * Arrays of arrays of blocks are not allowed diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp index f0f6be21b7d..aae25f893e8 100644 --- a/src/glsl/builtin_functions.cpp +++ b/src/glsl/builtin_functions.cpp @@ -403,7 +403,7 @@ shader_atomic_counters(const _mesa_glsl_parse_state *state) static bool shader_storage_buffer_object(const _mesa_glsl_parse_state *state) { - return state->ARB_shader_storage_buffer_object_enable; + return state->has_shader_storage_buffer_objects(); } static bool diff --git a/src/glsl/builtin_types.cpp b/src/glsl/builtin_types.cpp index 0aedbb3546a..bbdcd199e92 100644 --- a/src/glsl/builtin_types.cpp +++ b/src/glsl/builtin_types.cpp @@ -43,9 +43,7 @@ * convenience pointers (glsl_type::foo_type). * @{ */ -#define DECL_TYPE(NAME, ...) \ - const glsl_type glsl_type::_##NAME##_type = glsl_type(__VA_ARGS__, #NAME); \ - const glsl_type *const glsl_type::NAME##_type = &glsl_type::_##NAME##_type; +#define DECL_TYPE(NAME, ...) 
#define STRUCT_TYPE(NAME) \ const glsl_type glsl_type::_struct_##NAME##_type = \ diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy index c1bcccc34f4..cd00f6e085b 100644 --- a/src/glsl/glsl_parser.yy +++ b/src/glsl/glsl_parser.yy @@ -1962,7 +1962,9 @@ array_specifier: '[' ']' { void *ctx = state; - $$ = new(ctx) ast_array_specifier(@1); + $$ = new(ctx) ast_array_specifier(@1, new(ctx) ast_expression( + ast_unsized_array_dim, NULL, + NULL, NULL)); $$->set_location_range(@1, @2); } | '[' constant_expression ']' @@ -1973,29 +1975,21 @@ array_specifier: } | array_specifier '[' ']' { + void *ctx = state; $$ = $1; - if (!state->ARB_arrays_of_arrays_enable) { - _mesa_glsl_error(& @1, state, - "GL_ARB_arrays_of_arrays " - "required for defining arrays of arrays"); - } else { - _mesa_glsl_error(& @1, state, - "only the outermost array dimension can " - "be unsized"); + if (state->check_arrays_of_arrays_allowed(& @1)) { + $$->add_dimension(new(ctx) ast_expression(ast_unsized_array_dim, NULL, + NULL, NULL)); } } | array_specifier '[' constant_expression ']' { $$ = $1; - if (!state->ARB_arrays_of_arrays_enable) { - _mesa_glsl_error(& @1, state, - "GL_ARB_arrays_of_arrays " - "required for defining arrays of arrays"); + if (state->check_arrays_of_arrays_allowed(& @1)) { + $$->add_dimension($3); } - - $$->add_dimension($3); } ; diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h index 7fee43ece52..e8740f9ecb9 100644 --- a/src/glsl/glsl_parser_extras.h +++ b/src/glsl/glsl_parser_extras.h @@ -115,6 +115,20 @@ struct _mesa_glsl_parse_state { unsigned required_glsl_es_version, YYLTYPE *locp, const char *fmt, ...) PRINTFLIKE(5, 6); + bool check_arrays_of_arrays_allowed(YYLTYPE *locp) + { + if (!(ARB_arrays_of_arrays_enable || is_version(430, 310))) { + const char *const requirement = this->es_shader + ? 
"GLSL ES 3.10" + : "GL_ARB_arrays_of_arrays or GLSL 4.30"; + _mesa_glsl_error(locp, this, + "%s required for defining arrays of arrays.", + requirement); + return false; + } + return true; + } + bool check_precision_qualifiers_allowed(YYLTYPE *locp) { return check_version(130, 100, locp, diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp index 2c45b9edc0f..8933b230177 100644 --- a/src/glsl/ir.cpp +++ b/src/glsl/ir.cpp @@ -662,6 +662,22 @@ ir_expression::get_operator(const char *str) return (ir_expression_operation) -1; } +ir_variable * +ir_expression::variable_referenced() const +{ + switch (operation) { + case ir_binop_vector_extract: + case ir_triop_vector_insert: + /* We get these for things like a[0] where a is a vector type. In these + * cases we want variable_referenced() to return the actual vector + * variable this is wrapping. + */ + return operands[0]->variable_referenced(); + default: + return ir_rvalue::variable_referenced(); + } +} + ir_constant::ir_constant() : ir_rvalue(ir_type_constant) { @@ -1673,8 +1689,8 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name, if (type->is_interface()) this->init_interface_type(type); - else if (type->is_array() && type->fields.array->is_interface()) - this->init_interface_type(type->fields.array); + else if (type->without_array()->is_interface()) + this->init_interface_type(type->without_array()); } } diff --git a/src/glsl/ir.h b/src/glsl/ir.h index 43a2bf0ae1c..9c9f22d018b 100644 --- a/src/glsl/ir.h +++ b/src/glsl/ir.h @@ -1731,6 +1731,8 @@ public: virtual ir_visitor_status accept(ir_hierarchical_visitor *); + virtual ir_variable *variable_referenced() const; + ir_expression_operation operation; ir_rvalue *operands[4]; }; diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp index 309b6b72b5b..67ed3605a8c 100644 --- a/src/glsl/ir_constant_expression.cpp +++ b/src/glsl/ir_constant_expression.cpp @@ -36,6 +36,7 @@ #include <math.h> #include "main/core.h" /* for 
MAX2, MIN2, CLAMP */ #include "util/rounding.h" /* for _mesa_roundeven */ +#include "util/half_float.h" #include "ir.h" #include "glsl_types.h" #include "program/hash_table.h" diff --git a/src/glsl/ir_set_program_inouts.cpp b/src/glsl/ir_set_program_inouts.cpp index b7a0f6e95ba..d7c29b00f88 100644 --- a/src/glsl/ir_set_program_inouts.cpp +++ b/src/glsl/ir_set_program_inouts.cpp @@ -242,6 +242,12 @@ ir_set_program_inouts_visitor::try_mark_partial_variable(ir_variable *var, type = type->fields.array; } + /* TODO: implement proper arrays of arrays support + * for now let the caller mark whole variable as used. + */ + if (type->is_array() && type->fields.array->is_array()) + return false; + /* The code below only handles: * * - Indexing into matrices diff --git a/src/glsl/ir_uniform.h b/src/glsl/ir_uniform.h index 50fe76b7ea2..1854279925b 100644 --- a/src/glsl/ir_uniform.h +++ b/src/glsl/ir_uniform.h @@ -162,6 +162,22 @@ struct gl_uniform_storage { /** @} */ /** + * This is a compiler-generated uniform that should not be advertised + * via the API. + */ + bool hidden; + + /** + * This is a built-in uniform that should not be modified through any gl API. + */ + bool builtin; + + /** + * This is a shader storage buffer variable, not an uniform. + */ + bool is_shader_storage; + + /** * Index within gl_shader_program::AtomicBuffers[] of the atomic * counter buffer this uniform is stored in, or -1 if this is not * an atomic counter. @@ -181,20 +197,16 @@ struct gl_uniform_storage { unsigned num_compatible_subroutines; /** - * This is a compiler-generated uniform that should not be advertised - * via the API. + * A single integer identifying the number of active array elements of + * the top-level shader storage block member (GL_TOP_LEVEL_ARRAY_SIZE). */ - bool hidden; + unsigned top_level_array_size; /** - * This is a built-in uniform that should not be modified through any gl API. 
+ * A single integer identifying the stride between array elements of the + * top-level shader storage block member. (GL_TOP_LEVEL_ARRAY_STRIDE). */ - bool builtin; - - /** - * This is a shader storage buffer variable, not an uniform. - */ - bool is_shader_storage; + unsigned top_level_array_stride; }; #ifdef __cplusplus diff --git a/src/glsl/ir_variable_refcount.cpp b/src/glsl/ir_variable_refcount.cpp index e4d825c454b..790627bd1e3 100644 --- a/src/glsl/ir_variable_refcount.cpp +++ b/src/glsl/ir_variable_refcount.cpp @@ -46,6 +46,15 @@ static void free_entry(struct hash_entry *entry) { ir_variable_refcount_entry *ivre = (ir_variable_refcount_entry *) entry->data; + + /* Free assignment list */ + exec_node *n; + while ((n = ivre->assign_list.pop_head()) != NULL) { + struct assignment_entry *assignment_entry = + exec_node_data(struct assignment_entry, n, link); + free(assignment_entry); + } + delete ivre; } @@ -59,7 +68,6 @@ ir_variable_refcount_visitor::~ir_variable_refcount_visitor() ir_variable_refcount_entry::ir_variable_refcount_entry(ir_variable *var) { this->var = var; - assign = NULL; assigned_count = 0; declaration = false; referenced_count = 0; @@ -125,8 +133,20 @@ ir_variable_refcount_visitor::visit_leave(ir_assignment *ir) entry = this->get_variable_entry(ir->lhs->variable_referenced()); if (entry) { entry->assigned_count++; - if (entry->assign == NULL) - entry->assign = ir; + + /* Build a list for dead code optimisation. Don't add assignment if it + * was declared out of scope (outside the instruction stream). Also don't + * bother adding any more to the list if there are more references than + * assignments as this means the variable is used and won't be optimised + * out. 
+ */ + assert(entry->referenced_count >= entry->assigned_count); + if (entry->referenced_count == entry->assigned_count) { + struct assignment_entry *assignment_entry = + (struct assignment_entry *)calloc(1, sizeof(*assignment_entry)); + assignment_entry->assign = ir; + entry->assign_list.push_head(&assignment_entry->link); + } } return visit_continue; diff --git a/src/glsl/ir_variable_refcount.h b/src/glsl/ir_variable_refcount.h index c15e8110d04..5c74c314781 100644 --- a/src/glsl/ir_variable_refcount.h +++ b/src/glsl/ir_variable_refcount.h @@ -33,13 +33,24 @@ #include "ir_visitor.h" #include "glsl_types.h" +struct assignment_entry { + exec_node link; + ir_assignment *assign; +}; + class ir_variable_refcount_entry { public: ir_variable_refcount_entry(ir_variable *var); ir_variable *var; /* The key: the variable's pointer. */ - ir_assignment *assign; /* An assignment to the variable, if any */ + + /** + * List of assignments to the variable, if any. + * This is intended to be used for dead code optimisation and may + * not be a complete list. + */ + exec_list assign_list; /** Number of times the variable is referenced, including assignments. */ unsigned referenced_count; diff --git a/src/glsl/link_atomics.cpp b/src/glsl/link_atomics.cpp index 100d03c4e8f..70ef0e1c891 100644 --- a/src/glsl/link_atomics.cpp +++ b/src/glsl/link_atomics.cpp @@ -33,7 +33,7 @@ namespace { * Atomic counter as seen by the program. 
*/ struct active_atomic_counter { - unsigned id; + unsigned uniform_loc; ir_variable *var; }; @@ -52,7 +52,7 @@ namespace { free(counters); } - void push_back(unsigned id, ir_variable *var) + void push_back(unsigned uniform_loc, ir_variable *var) { active_atomic_counter *new_counters; @@ -66,7 +66,7 @@ namespace { } counters = new_counters; - counters[num_counters].id = id; + counters[num_counters].uniform_loc = uniform_loc; counters[num_counters].var = var; num_counters++; } @@ -95,6 +95,50 @@ namespace { y->data.atomic.offset < x->data.atomic.offset + x->type->atomic_size())); } + void + process_atomic_variable(const glsl_type *t, struct gl_shader_program *prog, + unsigned *uniform_loc, ir_variable *var, + active_atomic_buffer *const buffers, + unsigned *num_buffers, int *offset, + const unsigned shader_stage) + { + /* FIXME: Arrays of arrays get counted separately. For example: + * x1[3][3][2] = 9 counters + * x2[3][2] = 3 counters + * x3[2] = 1 counter + * + * However this code marks all the counters as active even when they + * might not be used. + */ + if (t->is_array() && t->fields.array->is_array()) { + for (unsigned i = 0; i < t->length; i++) { + process_atomic_variable(t->fields.array, prog, uniform_loc, + var, buffers, num_buffers, offset, + shader_stage); + } + } else { + active_atomic_buffer *buf = &buffers[var->data.binding]; + gl_uniform_storage *const storage = + &prog->UniformStorage[*uniform_loc]; + + /* If this is the first time the buffer is used, increment + * the counter of buffers used. 
+ */ + if (buf->size == 0) + (*num_buffers)++; + + buf->push_back(*uniform_loc, var); + + buf->stage_references[shader_stage]++; + buf->size = MAX2(buf->size, *offset + t->atomic_size()); + + storage->offset = *offset; + *offset += t->atomic_size(); + + (*uniform_loc)++; + } + } + active_atomic_buffer * find_active_atomic_counters(struct gl_context *ctx, struct gl_shader_program *prog, @@ -114,23 +158,10 @@ namespace { ir_variable *var = node->as_variable(); if (var && var->type->contains_atomic()) { - unsigned id = 0; - bool found = prog->UniformHash->get(id, var->name); - assert(found); - (void) found; - active_atomic_buffer *buf = &buffers[var->data.binding]; - - /* If this is the first time the buffer is used, increment - * the counter of buffers used. - */ - if (buf->size == 0) - (*num_buffers)++; - - buf->push_back(id, var); - - buf->stage_references[i]++; - buf->size = MAX2(buf->size, var->data.atomic.offset + - var->type->atomic_size()); + int offset = var->data.atomic.offset; + unsigned uniform_loc = var->data.location; + process_atomic_variable(var->type, prog, &uniform_loc, + var, buffers, num_buffers, &offset, i); } } } @@ -197,10 +228,10 @@ link_assign_atomic_counter_resources(struct gl_context *ctx, /* Assign counter-specific fields. 
*/ for (unsigned j = 0; j < ab.num_counters; j++) { ir_variable *const var = ab.counters[j].var; - const unsigned id = ab.counters[j].id; - gl_uniform_storage *const storage = &prog->UniformStorage[id]; + gl_uniform_storage *const storage = + &prog->UniformStorage[ab.counters[j].uniform_loc]; - mab.Uniforms[j] = id; + mab.Uniforms[j] = ab.counters[j].uniform_loc; if (!var->data.explicit_binding) var->data.binding = i; diff --git a/src/glsl/link_uniform_block_active_visitor.cpp b/src/glsl/link_uniform_block_active_visitor.cpp index bcf17fef758..422739af063 100644 --- a/src/glsl/link_uniform_block_active_visitor.cpp +++ b/src/glsl/link_uniform_block_active_visitor.cpp @@ -71,6 +71,88 @@ process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var) return NULL; } +/* For arrays of arrays this function will give us a middle ground between + * detecting inactive uniform blocks and structuring them in a way that makes + * it easy to calculate the offset for indirect indexing. + * + * For example given the shader: + * + * uniform ArraysOfArraysBlock + * { + * vec4 a; + * } i[3][4][5]; + * + * void main() + * { + * vec4 b = i[0][1][1].a; + * gl_Position = i[2][2][3].a + b; + * } + * + * There are only 2 active blocks above but for the sake of indirect indexing + * and not over complicating the code we will end up with a count of 8. 
+ * Here each dimension has 2 different indices counted so we end up with 2*2*2 + */ +struct uniform_block_array_elements ** +process_arrays(void *mem_ctx, ir_dereference_array *ir, + struct link_uniform_block_active *block) +{ + if (ir) { + struct uniform_block_array_elements **ub_array_ptr = + process_arrays(mem_ctx, ir->array->as_dereference_array(), block); + if (*ub_array_ptr == NULL) { + *ub_array_ptr = rzalloc(mem_ctx, struct uniform_block_array_elements); + (*ub_array_ptr)->ir = ir; + } + + struct uniform_block_array_elements *ub_array = *ub_array_ptr; + ir_constant *c = ir->array_index->as_constant(); + if (c) { + /* Index is a constant, so mark just that element used, + * if not already. + */ + const unsigned idx = c->get_uint_component(0); + + unsigned i; + for (i = 0; i < ub_array->num_array_elements; i++) { + if (ub_array->array_elements[i] == idx) + break; + } + + assert(i <= ub_array->num_array_elements); + + if (i == ub_array->num_array_elements) { + ub_array->array_elements = reralloc(mem_ctx, + ub_array->array_elements, + unsigned, + ub_array->num_array_elements + 1); + + ub_array->array_elements[ub_array->num_array_elements] = idx; + + ub_array->num_array_elements++; + } + } else { + /* The array index is not a constant, + * so mark the entire array used. 
+ */ + assert(ir->array->type->is_array()); + if (ub_array->num_array_elements < ir->array->type->length) { + ub_array->num_array_elements = ir->array->type->length; + ub_array->array_elements = reralloc(mem_ctx, + ub_array->array_elements, + unsigned, + ub_array->num_array_elements); + + for (unsigned i = 0; i < ub_array->num_array_elements; i++) { + ub_array->array_elements[i] = i; + } + } + } + return &ub_array->array; + } else { + return &block->array; + } +} + ir_visitor_status link_uniform_block_active_visitor::visit(ir_variable *var) { @@ -101,24 +183,30 @@ link_uniform_block_active_visitor::visit(ir_variable *var) return visit_stop; } - assert(b->num_array_elements == 0); - assert(b->array_elements == NULL); + assert(b->array == NULL); assert(b->type != NULL); assert(!b->type->is_array() || b->has_instance_name); /* For uniform block arrays declared with a shared or std140 layout * qualifier, mark all its instances as used. */ - if (b->type->is_array() && b->type->length > 0) { - b->num_array_elements = b->type->length; - b->array_elements = reralloc(this->mem_ctx, - b->array_elements, - unsigned, - b->num_array_elements); - - for (unsigned i = 0; i < b->num_array_elements; i++) { - b->array_elements[i] = i; + const glsl_type *type = b->type; + struct uniform_block_array_elements **ub_array = &b->array; + while (type->is_array()) { + assert(b->type->length > 0); + + *ub_array = rzalloc(this->mem_ctx, struct uniform_block_array_elements); + (*ub_array)->num_array_elements = type->length; + (*ub_array)->array_elements = reralloc(this->mem_ctx, + (*ub_array)->array_elements, + unsigned, + (*ub_array)->num_array_elements); + + for (unsigned i = 0; i < (*ub_array)->num_array_elements; i++) { + (*ub_array)->array_elements[i] = i; } + ub_array = &(*ub_array)->array; + type = type->fields.array; } return visit_continue; @@ -127,7 +215,13 @@ link_uniform_block_active_visitor::visit(ir_variable *var) ir_visitor_status 
link_uniform_block_active_visitor::visit_enter(ir_dereference_array *ir) { - ir_dereference_variable *const d = ir->array->as_dereference_variable(); + /* cycle through arrays of arrays */ + ir_dereference_array *base_ir = ir; + while (base_ir->array->ir_type == ir_type_dereference_array) + base_ir = base_ir->array->as_dereference_array(); + + ir_dereference_variable *const d = + base_ir->array->as_dereference_variable(); ir_variable *const var = (d == NULL) ? NULL : d->var; /* If the r-value being dereferenced is not a variable (e.g., a field of a @@ -158,55 +252,16 @@ link_uniform_block_active_visitor::visit_enter(ir_dereference_array *ir) /* Block arrays must be declared with an instance name. */ assert(b->has_instance_name); - assert((b->num_array_elements == 0) == (b->array_elements == NULL)); assert(b->type != NULL); /* If the block array was declared with a shared or * std140 layout qualifier, all its instances have been already marked * as used in link_uniform_block_active_visitor::visit(ir_variable *). */ - if (var->get_interface_type()->interface_packing != - GLSL_INTERFACE_PACKING_PACKED) - return visit_continue_with_parent; - - ir_constant *c = ir->array_index->as_constant(); - - if (c) { - /* Index is a constant, so mark just that element used, if not already */ - const unsigned idx = c->get_uint_component(0); - - unsigned i; - for (i = 0; i < b->num_array_elements; i++) { - if (b->array_elements[i] == idx) - break; - } - - assert(i <= b->num_array_elements); - - if (i == b->num_array_elements) { - b->array_elements = reralloc(this->mem_ctx, - b->array_elements, - unsigned, - b->num_array_elements + 1); - - b->array_elements[b->num_array_elements] = idx; - - b->num_array_elements++; - } - } else { - /* The array index is not a constant, so mark the entire array used. 
*/ - assert(b->type->is_array()); - if (b->num_array_elements < b->type->length) { - b->num_array_elements = b->type->length; - b->array_elements = reralloc(this->mem_ctx, - b->array_elements, - unsigned, - b->num_array_elements); - - for (unsigned i = 0; i < b->num_array_elements; i++) { - b->array_elements[i] = i; - } - } + if (var->get_interface_type()->interface_packing == + GLSL_INTERFACE_PACKING_PACKED) { + b->var = var; + process_arrays(this->mem_ctx, ir, b); } return visit_continue_with_parent; @@ -234,8 +289,7 @@ link_uniform_block_active_visitor::visit(ir_dereference_variable *ir) return visit_stop; } - assert(b->num_array_elements == 0); - assert(b->array_elements == NULL); + assert(b->array == NULL); assert(b->type != NULL); return visit_continue; diff --git a/src/glsl/link_uniform_block_active_visitor.h b/src/glsl/link_uniform_block_active_visitor.h index b663a884db4..afb52c14a37 100644 --- a/src/glsl/link_uniform_block_active_visitor.h +++ b/src/glsl/link_uniform_block_active_visitor.h @@ -28,11 +28,20 @@ #include "ir.h" #include "util/hash_table.h" +struct uniform_block_array_elements { + unsigned *array_elements; + unsigned num_array_elements; + + ir_dereference_array *ir; + + struct uniform_block_array_elements *array; +}; + struct link_uniform_block_active { const glsl_type *type; + ir_variable *var; - unsigned *array_elements; - unsigned num_array_elements; + struct uniform_block_array_elements *array; unsigned binding; diff --git a/src/glsl/link_uniform_blocks.cpp b/src/glsl/link_uniform_blocks.cpp index 7ceffee799e..5285d8d01e4 100644 --- a/src/glsl/link_uniform_blocks.cpp +++ b/src/glsl/link_uniform_blocks.cpp @@ -116,7 +116,7 @@ private: char *open_bracket = strchr(v->IndexName, '['); assert(open_bracket != NULL); - char *close_bracket = strchr(open_bracket, ']'); + char *close_bracket = strchr(open_bracket, '.') - 1; assert(close_bracket != NULL); /* Length of the tail without the ']' but with the NUL. 
@@ -185,6 +185,91 @@ struct block { bool has_instance_name; }; +static void +process_block_array(struct uniform_block_array_elements *ub_array, char **name, + size_t name_length, gl_uniform_block *blocks, + ubo_visitor *parcel, gl_uniform_buffer_variable *variables, + const struct link_uniform_block_active *const b, + unsigned *block_index, unsigned *binding_offset, + struct gl_context *ctx, struct gl_shader_program *prog) +{ + if (ub_array) { + for (unsigned j = 0; j < ub_array->num_array_elements; j++) { + size_t new_length = name_length; + + /* Append the subscript to the current variable name */ + ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", + ub_array->array_elements[j]); + + process_block_array(ub_array->array, name, new_length, blocks, + parcel, variables, b, block_index, + binding_offset, ctx, prog); + } + } else { + unsigned i = *block_index; + const glsl_type *type = b->type->without_array(); + + blocks[i].Name = ralloc_strdup(blocks, *name); + blocks[i].Uniforms = &variables[(*parcel).index]; + + /* The GL_ARB_shading_language_420pack spec says: + * + * "If the binding identifier is used with a uniform block + * instanced as an array then the first element of the array + * takes the specified block binding and each subsequent + * element takes the next consecutive uniform block binding + * point." + */ + blocks[i].Binding = (b->has_binding) ? 
b->binding + *binding_offset : 0; + + blocks[i].UniformBufferSize = 0; + blocks[i]._Packing = gl_uniform_block_packing(type->interface_packing); + + parcel->process(type, blocks[i].Name); + + blocks[i].UniformBufferSize = parcel->buffer_size; + + /* Check SSBO size is lower than maximum supported size for SSBO */ + if (b->is_shader_storage && + parcel->buffer_size > ctx->Const.MaxShaderStorageBlockSize) { + linker_error(prog, "shader storage block `%s' has size %d, " + "which is larger than than the maximum allowed (%d)", + b->type->name, + parcel->buffer_size, + ctx->Const.MaxShaderStorageBlockSize); + } + blocks[i].NumUniforms = + (unsigned)(ptrdiff_t)(&variables[parcel->index] - blocks[i].Uniforms); + blocks[i].IsShaderStorage = b->is_shader_storage; + + *block_index = *block_index + 1; + *binding_offset = *binding_offset + 1; + } +} + +/* This function resizes the array types of the block so that later we can use + * this new size to correctly calculate the offest for indirect indexing. + */ +const glsl_type * +resize_block_array(const glsl_type *type, + struct uniform_block_array_elements *ub_array) +{ + if (type->is_array()) { + struct uniform_block_array_elements *child_array = + type->fields.array->is_array() ? 
ub_array->array : NULL; + const glsl_type *new_child_type = + resize_block_array(type->fields.array, child_array); + + const glsl_type *new_type = + glsl_type::get_array_instance(new_child_type, + ub_array->num_array_elements); + ub_array->ir->array->type = new_type; + return new_type; + } else { + return type; + } +} + unsigned link_uniform_blocks(void *mem_ctx, struct gl_context *ctx, @@ -223,21 +308,25 @@ link_uniform_blocks(void *mem_ctx, struct hash_entry *entry; hash_table_foreach (block_hash, entry) { - const struct link_uniform_block_active *const b = - (const struct link_uniform_block_active *) entry->data; + struct link_uniform_block_active *const b = + (struct link_uniform_block_active *) entry->data; - const glsl_type *const block_type = - b->type->is_array() ? b->type->fields.array : b->type; + assert((b->array != NULL) == b->type->is_array()); - assert((b->num_array_elements > 0) == b->type->is_array()); + if (b->array != NULL && + (b->type->without_array()->interface_packing == + GLSL_INTERFACE_PACKING_PACKED)) { + b->type = resize_block_array(b->type, b->array); + b->var->type = b->type; + } block_size.num_active_uniforms = 0; - block_size.process(block_type, ""); + block_size.process(b->type->without_array(), ""); - if (b->num_array_elements > 0) { - num_blocks += b->num_array_elements; - num_variables += b->num_array_elements - * block_size.num_active_uniforms; + if (b->array != NULL) { + unsigned aoa_size = b->type->arrays_of_arrays_size(); + num_blocks += aoa_size; + num_variables += aoa_size * block_size.num_active_uniforms; } else { num_blocks++; num_variables += block_size.num_active_uniforms; @@ -281,50 +370,15 @@ link_uniform_blocks(void *mem_ctx, (const struct link_uniform_block_active *) entry->data; const glsl_type *block_type = b->type; - if (b->num_array_elements > 0) { - const char *const name = block_type->fields.array->name; + if (b->array != NULL) { + unsigned binding_offset = 0; + char *name = ralloc_strdup(NULL, 
block_type->without_array()->name); + size_t name_length = strlen(name); assert(b->has_instance_name); - for (unsigned j = 0; j < b->num_array_elements; j++) { - blocks[i].Name = ralloc_asprintf(blocks, "%s[%u]", name, - b->array_elements[j]); - blocks[i].Uniforms = &variables[parcel.index]; - - /* The GL_ARB_shading_language_420pack spec says: - * - * "If the binding identifier is used with a uniform block - * instanced as an array then the first element of the array - * takes the specified block binding and each subsequent - * element takes the next consecutive uniform block binding - * point." - */ - blocks[i].Binding = (b->has_binding) ? b->binding + j : 0; - - blocks[i].UniformBufferSize = 0; - blocks[i]._Packing = - gl_uniform_block_packing(block_type->interface_packing); - - parcel.process(block_type->fields.array, - blocks[i].Name); - - blocks[i].UniformBufferSize = parcel.buffer_size; - - /* Check SSBO size is lower than maximum supported size for SSBO */ - if (b->is_shader_storage && - parcel.buffer_size > ctx->Const.MaxShaderStorageBlockSize) { - linker_error(prog, "shader storage block `%s' has size %d, " - "which is larger than than the maximum allowed (%d)", - block_type->name, - parcel.buffer_size, - ctx->Const.MaxShaderStorageBlockSize); - } - blocks[i].NumUniforms = - (unsigned)(ptrdiff_t)(&variables[parcel.index] - blocks[i].Uniforms); - - blocks[i].IsShaderStorage = b->is_shader_storage; - - i++; - } + process_block_array(b->array, &name, name_length, blocks, &parcel, + variables, b, &i, &binding_offset, ctx, prog); + ralloc_free(name); } else { blocks[i].Name = ralloc_strdup(blocks, block_type->name); blocks[i].Uniforms = &variables[parcel.index]; diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp index e9e108a2765..35b9f9c6017 100644 --- a/src/glsl/link_uniform_initializers.cpp +++ b/src/glsl/link_uniform_initializers.cpp @@ -49,7 +49,7 @@ get_uniform_block_index(const gl_shader_program *shProg, const 
char *uniformBlockName) { for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) { - if (!strcmp(shProg->UniformBlocks[i].Name, uniformBlockName)) + if (!strcmp(shProg->BufferInterfaceBlocks[i].Name, uniformBlockName)) return i; } @@ -107,51 +107,64 @@ copy_constant_to_storage(union gl_constant_value *storage, * they have no storage and should be handled elsewhere. */ void -set_opaque_binding(gl_shader_program *prog, const char *name, int binding) +set_opaque_binding(void *mem_ctx, gl_shader_program *prog, + const glsl_type *type, const char *name, int *binding) { - struct gl_uniform_storage *const storage = - get_storage(prog->UniformStorage, prog->NumUniformStorage, name); - if (storage == NULL) { - assert(storage != NULL); - return; - } + if (type->is_array() && type->fields.array->is_array()) { + const glsl_type *const element_type = type->fields.array; - const unsigned elements = MAX2(storage->array_elements, 1); + for (unsigned int i = 0; i < type->length; i++) { + const char *element_name = ralloc_asprintf(mem_ctx, "%s[%d]", name, i); - /* Section 4.4.4 (Opaque-Uniform Layout Qualifiers) of the GLSL 4.20 spec - * says: - * - * "If the binding identifier is used with an array, the first element - * of the array takes the specified unit and each subsequent element - * takes the next consecutive unit." 
- */ - for (unsigned int i = 0; i < elements; i++) { - storage->storage[i].i = binding + i; - } + set_opaque_binding(mem_ctx, prog, element_type, + element_name, binding); + } + } else { + struct gl_uniform_storage *const storage = + get_storage(prog->UniformStorage, prog->NumUniformStorage, name); - for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) { - gl_shader *shader = prog->_LinkedShaders[sh]; + if (storage == NULL) { + assert(storage != NULL); + return; + } - if (shader) { - if (storage->type->base_type == GLSL_TYPE_SAMPLER && - storage->opaque[sh].active) { - for (unsigned i = 0; i < elements; i++) { - const unsigned index = storage->opaque[sh].index + i; - shader->SamplerUnits[index] = storage->storage[i].i; - } + const unsigned elements = MAX2(storage->array_elements, 1); + + /* Section 4.4.4 (Opaque-Uniform Layout Qualifiers) of the GLSL 4.20 spec + * says: + * + * "If the binding identifier is used with an array, the first element + * of the array takes the specified unit and each subsequent element + * takes the next consecutive unit." 
+ */ + for (unsigned int i = 0; i < elements; i++) { + storage->storage[i].i = (*binding)++; + } + + for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) { + gl_shader *shader = prog->_LinkedShaders[sh]; - } else if (storage->type->base_type == GLSL_TYPE_IMAGE && + if (shader) { + if (storage->type->base_type == GLSL_TYPE_SAMPLER && + storage->opaque[sh].active) { + for (unsigned i = 0; i < elements; i++) { + const unsigned index = storage->opaque[sh].index + i; + shader->SamplerUnits[index] = storage->storage[i].i; + } + + } else if (storage->type->base_type == GLSL_TYPE_IMAGE && storage->opaque[sh].active) { - for (unsigned i = 0; i < elements; i++) { - const unsigned index = storage->opaque[sh].index + i; - shader->ImageUnits[index] = storage->storage[i].i; + for (unsigned i = 0; i < elements; i++) { + const unsigned index = storage->opaque[sh].index + i; + shader->ImageUnits[index] = storage->storage[i].i; + } } } } - } - storage->initialized = true; + storage->initialized = true; + } } void @@ -170,7 +183,7 @@ set_block_binding(gl_shader_program *prog, const char *block_name, int binding) if (stage_index != -1) { struct gl_shader *sh = prog->_LinkedShaders[i]; - sh->UniformBlocks[stage_index].Binding = binding; + sh->BufferInterfaceBlocks[stage_index].Binding = binding; } } } @@ -180,6 +193,7 @@ set_uniform_initializer(void *mem_ctx, gl_shader_program *prog, const char *name, const glsl_type *type, ir_constant *val, unsigned int boolean_true) { + const glsl_type *t_without_array = type->without_array(); if (type->is_record()) { ir_constant *field_constant; @@ -194,7 +208,8 @@ set_uniform_initializer(void *mem_ctx, gl_shader_program *prog, field_constant = (ir_constant *)field_constant->next; } return; - } else if (type->is_array() && type->fields.array->is_record()) { + } else if (t_without_array->is_record() || + (type->is_array() && type->fields.array->is_array())) { const glsl_type *const element_type = type->fields.array; for (unsigned int i = 0; i < 
type->length; i++) { @@ -284,7 +299,9 @@ link_set_uniform_initializers(struct gl_shader_program *prog, if (type->without_array()->is_sampler() || type->without_array()->is_image()) { - linker::set_opaque_binding(prog, var->name, var->data.binding); + int binding = var->data.binding; + linker::set_opaque_binding(mem_ctx, prog, var->type, + var->name, &binding); } else if (var->is_in_buffer_block()) { const glsl_type *const iface_type = var->get_interface_type(); @@ -327,9 +344,9 @@ link_set_uniform_initializers(struct gl_shader_program *prog, } else { assert(!"Explicit binding not on a sampler, UBO or atomic."); } - } else if (var->constant_value) { + } else if (var->constant_initializer) { linker::set_uniform_initializer(mem_ctx, prog, var->name, - var->type, var->constant_value, + var->type, var->constant_initializer, boolean_true); } } diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp index 0ccd9c8c865..fe00aa30d07 100644 --- a/src/glsl/link_uniforms.cpp +++ b/src/glsl/link_uniforms.cpp @@ -149,7 +149,8 @@ program_resource_visitor::process(ir_variable *var) recursion(var->type, &name, strlen(name), row_major, NULL, packing, false, record_array_count); ralloc_free(name); - } else if (t->without_array()->is_record()) { + } else if (t_without_array->is_record() || + (t->is_array() && t->fields.array->is_array())) { char *name = ralloc_strdup(NULL, var->name); recursion(var->type, &name, strlen(name), row_major, NULL, packing, false, record_array_count); @@ -160,6 +161,7 @@ program_resource_visitor::process(ir_variable *var) false, record_array_count); ralloc_free(name); } else { + this->set_record_array_count(record_array_count); this->visit_field(t, var->name, row_major, NULL, packing, false); } } @@ -231,7 +233,8 @@ program_resource_visitor::recursion(const glsl_type *t, char **name, this->leave_record(t, *name, row_major, packing); } } else if (t->without_array()->is_record() || - t->without_array()->is_interface()) { + 
t->without_array()->is_interface() || + (t->is_array() && t->fields.array->is_array())) { if (record_type == NULL && t->fields.array->is_record()) record_type = t->fields.array; @@ -387,6 +390,7 @@ private: { assert(!type->without_array()->is_record()); assert(!type->without_array()->is_interface()); + assert(!(type->is_array() && type->fields.array->is_array())); (void) row_major; @@ -502,9 +506,9 @@ public: for (unsigned i = 0; i < prog->NumBufferInterfaceBlocks; i++) { if (strncmp(var->get_interface_type()->name, - prog->UniformBlocks[i].Name, + prog->BufferInterfaceBlocks[i].Name, l) == 0 - && prog->UniformBlocks[i].Name[l] == '[') { + && prog->BufferInterfaceBlocks[i].Name[l] == '[') { ubo_block_index = i; break; } @@ -512,7 +516,7 @@ public: } else { for (unsigned i = 0; i < prog->NumBufferInterfaceBlocks; i++) { if (strcmp(var->get_interface_type()->name, - prog->UniformBlocks[i].Name) == 0) { + prog->BufferInterfaceBlocks[i].Name) == 0) { ubo_block_index = i; break; } @@ -530,7 +534,7 @@ public: ubo_byte_offset = 0; } else { const struct gl_uniform_block *const block = - &prog->UniformBlocks[ubo_block_index]; + &prog->BufferInterfaceBlocks[ubo_block_index]; assert(var->data.location != -1); @@ -712,6 +716,7 @@ private: { assert(!type->without_array()->is_record()); assert(!type->without_array()->is_interface()); + assert(!(type->is_array() && type->fields.array->is_array())); unsigned id; bool found = this->map->get(id, name); @@ -804,10 +809,11 @@ private: if (type->is_array()) { if (packing == GLSL_INTERFACE_PACKING_STD430) this->uniforms[id].array_stride = - type->fields.array->std430_array_stride(row_major); + type->without_array()->std430_array_stride(row_major); else this->uniforms[id].array_stride = - glsl_align(type->fields.array->std140_size(row_major), 16); + glsl_align(type->without_array()->std140_size(row_major), + 16); } else { this->uniforms[id].array_stride = 0; } @@ -966,15 +972,16 @@ link_update_uniform_buffer_variables(struct gl_shader 
*shader) if (var->type->is_record()) { sentinel = '.'; - } else if (var->type->without_array()->is_record()) { + } else if (var->type->is_array() && (var->type->fields.array->is_array() + || var->type->without_array()->is_record())) { sentinel = '['; } const unsigned l = strlen(var->name); - for (unsigned i = 0; i < shader->NumUniformBlocks; i++) { - for (unsigned j = 0; j < shader->UniformBlocks[i].NumUniforms; j++) { + for (unsigned i = 0; i < shader->NumBufferInterfaceBlocks; i++) { + for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i].NumUniforms; j++) { if (sentinel) { - const char *begin = shader->UniformBlocks[i].Uniforms[j].Name; + const char *begin = shader->BufferInterfaceBlocks[i].Uniforms[j].Name; const char *end = strchr(begin, sentinel); if (end == NULL) @@ -989,7 +996,7 @@ link_update_uniform_buffer_variables(struct gl_shader *shader) break; } } else if (!strcmp(var->name, - shader->UniformBlocks[i].Uniforms[j].Name)) { + shader->BufferInterfaceBlocks[i].Uniforms[j].Name)) { found = true; var->data.location = j; break; @@ -1115,10 +1122,10 @@ link_assign_uniform_locations(struct gl_shader_program *prog, sh->num_uniform_components = uniform_size.num_shader_uniform_components; sh->num_combined_uniform_components = sh->num_uniform_components; - for (unsigned i = 0; i < sh->NumUniformBlocks; i++) { - if (!sh->UniformBlocks[i].IsShaderStorage) { + for (unsigned i = 0; i < sh->NumBufferInterfaceBlocks; i++) { + if (!sh->BufferInterfaceBlocks[i].IsShaderStorage) { sh->num_combined_uniform_components += - sh->UniformBlocks[i].UniformBufferSize / 4; + sh->BufferInterfaceBlocks[i].UniformBufferSize / 4; } } } diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index a97b4ef0a32..25ca928aa43 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -65,6 +65,7 @@ */ #include <ctype.h> +#include "util/strndup.h" #include "main/core.h" #include "glsl_symbol_table.h" #include "glsl_parser_extras.h" @@ -1161,7 +1162,7 @@ cross_validate_uniforms(struct 
gl_shader_program *prog) } /** - * Accumulates the array of prog->UniformBlocks and checks that all + * Accumulates the array of prog->BufferInterfaceBlocks and checks that all * definitons of blocks agree on their contents. */ static bool @@ -1170,7 +1171,7 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog) unsigned max_num_uniform_blocks = 0; for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { if (prog->_LinkedShaders[i]) - max_num_uniform_blocks += prog->_LinkedShaders[i]->NumUniformBlocks; + max_num_uniform_blocks += prog->_LinkedShaders[i]->NumBufferInterfaceBlocks; } for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { @@ -1184,15 +1185,15 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog) if (sh == NULL) continue; - for (unsigned int j = 0; j < sh->NumUniformBlocks; j++) { + for (unsigned int j = 0; j < sh->NumBufferInterfaceBlocks; j++) { int index = link_cross_validate_uniform_block(prog, - &prog->UniformBlocks, + &prog->BufferInterfaceBlocks, &prog->NumBufferInterfaceBlocks, - &sh->UniformBlocks[j]); + &sh->BufferInterfaceBlocks[j]); if (index == -1) { linker_error(prog, "uniform block `%s' has mismatching definitions\n", - sh->UniformBlocks[j].Name); + sh->BufferInterfaceBlocks[j].Name); return false; } @@ -1386,8 +1387,10 @@ public: virtual ir_visitor_status visit(ir_variable *var) { + const glsl_type *type_without_array; fixup_type(&var->type, var->data.max_array_access, var->data.from_ssbo_unsized_array); + type_without_array = var->type->without_array(); if (var->type->is_interface()) { if (interface_contains_unsized_arrays(var->type)) { const glsl_type *new_type = @@ -1397,11 +1400,10 @@ public: var->type = new_type; var->change_interface_type(new_type); } - } else if (var->type->is_array() && - var->type->fields.array->is_interface()) { - if (interface_contains_unsized_arrays(var->type->fields.array)) { + } else if (type_without_array->is_interface()) { + if 
(interface_contains_unsized_arrays(type_without_array)) { const glsl_type *new_type = - resize_interface_members(var->type->fields.array, + resize_interface_members(type_without_array, var->get_max_ifc_array_access(), var->is_in_shader_storage_block()); var->change_interface_type(new_type); @@ -2064,9 +2066,9 @@ link_intrastage_shaders(void *mem_ctx, linked->ir = new(linked) exec_list; clone_ir_list(mem_ctx, linked->ir, main->ir); - linked->UniformBlocks = uniform_blocks; - linked->NumUniformBlocks = num_uniform_blocks; - ralloc_steal(linked, linked->UniformBlocks); + linked->BufferInterfaceBlocks = uniform_blocks; + linked->NumBufferInterfaceBlocks = num_uniform_blocks; + ralloc_steal(linked, linked->BufferInterfaceBlocks); link_fs_input_layout_qualifiers(prog, linked, shader_list, num_shaders); link_tcs_out_layout_qualifiers(prog, linked, shader_list, num_shaders); @@ -2804,19 +2806,19 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog) for (unsigned i = 0; i < prog->NumBufferInterfaceBlocks; i++) { /* Don't check SSBOs for Uniform Block Size */ - if (!prog->UniformBlocks[i].IsShaderStorage && - prog->UniformBlocks[i].UniformBufferSize > ctx->Const.MaxUniformBlockSize) { + if (!prog->BufferInterfaceBlocks[i].IsShaderStorage && + prog->BufferInterfaceBlocks[i].UniformBufferSize > ctx->Const.MaxUniformBlockSize) { linker_error(prog, "Uniform block %s too big (%d/%d)\n", - prog->UniformBlocks[i].Name, - prog->UniformBlocks[i].UniformBufferSize, + prog->BufferInterfaceBlocks[i].Name, + prog->BufferInterfaceBlocks[i].UniformBufferSize, ctx->Const.MaxUniformBlockSize); } - if (prog->UniformBlocks[i].IsShaderStorage && - prog->UniformBlocks[i].UniformBufferSize > ctx->Const.MaxShaderStorageBlockSize) { + if (prog->BufferInterfaceBlocks[i].IsShaderStorage && + prog->BufferInterfaceBlocks[i].UniformBufferSize > ctx->Const.MaxShaderStorageBlockSize) { linker_error(prog, "Shader storage block %s too big (%d/%d)\n", - prog->UniformBlocks[i].Name, - 
prog->UniformBlocks[i].UniformBufferSize, + prog->BufferInterfaceBlocks[i].Name, + prog->BufferInterfaceBlocks[i].UniformBufferSize, ctx->Const.MaxShaderStorageBlockSize); } @@ -2824,7 +2826,7 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog) if (prog->UniformBlockStageIndex[j][i] != -1) { struct gl_shader *sh = prog->_LinkedShaders[j]; int stage_index = prog->UniformBlockStageIndex[j][i]; - if (sh && sh->UniformBlocks[stage_index].IsShaderStorage) { + if (sh && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) { shader_blocks[j]++; total_shader_storage_blocks++; } else { @@ -2941,7 +2943,7 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog) for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) { int stage_index = prog->UniformBlockStageIndex[i][j]; - if (stage_index != -1 && sh->UniformBlocks[stage_index].IsShaderStorage) + if (stage_index != -1 && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) total_shader_storage_blocks++; } @@ -3147,7 +3149,7 @@ should_add_buffer_variable(struct gl_shader_program *shProg, return true; for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) { - block_name = shProg->UniformBlocks[i].Name; + block_name = shProg->BufferInterfaceBlocks[i].Name; if (strncmp(block_name, name, strlen(block_name)) == 0) { found_interface = true; break; @@ -3389,6 +3391,242 @@ add_packed_varyings(struct gl_shader_program *shProg, int stage) return true; } +static char* +get_top_level_name(const char *name) +{ + const char *first_dot = strchr(name, '.'); + const char *first_square_bracket = strchr(name, '['); + int name_size = 0; + /* From ARB_program_interface_query spec: + * + * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer identifying the + * number of active array elements of the top-level shader storage block + * member containing to the active variable is written to <params>. 
If the + * top-level block member is not declared as an array, the value one is + * written to <params>. If the top-level block member is an array with no + * declared size, the value zero is written to <params>. + */ + + /* The buffer variable is on top level.*/ + if (!first_square_bracket && !first_dot) + name_size = strlen(name); + else if ((!first_square_bracket || + (first_dot && first_dot < first_square_bracket))) + name_size = first_dot - name; + else + name_size = first_square_bracket - name; + + return strndup(name, name_size); +} + +static char* +get_var_name(const char *name) +{ + const char *first_dot = strchr(name, '.'); + + if (!first_dot) + return strdup(name); + + return strndup(first_dot+1, strlen(first_dot) - 1); +} + +static bool +is_top_level_shader_storage_block_member(const char* name, + const char* interface_name, + const char* field_name) +{ + bool result = false; + + /* If the given variable is already a top-level shader storage + * block member, then return array_size = 1. + * We could have two possibilities: if we have an instanced + * shader storage block or not instanced. + * + * For the first, we check create a name as it was in top level and + * compare it with the real name. If they are the same, then + * the variable is already at top-level. + * + * Full instanced name is: interface name + '.' + var name + + * NULL character + */ + int name_length = strlen(interface_name) + 1 + strlen(field_name) + 1; + char *full_instanced_name = (char *) calloc(name_length, sizeof(char)); + if (!full_instanced_name) { + fprintf(stderr, "%s: Cannot allocate space for name\n", __func__); + return false; + } + + snprintf(full_instanced_name, name_length, "%s.%s", + interface_name, field_name); + + /* Check if its top-level shader storage block member of an + * instanced interface block, or of a unnamed interface block. 
+ */ + if (strcmp(name, full_instanced_name) == 0 || + strcmp(name, field_name) == 0) + result = true; + + free(full_instanced_name); + return result; +} + +static void +calculate_array_size(struct gl_shader_program *shProg, + struct gl_uniform_storage *uni) +{ + int block_index = uni->block_index; + int array_size = -1; + char *var_name = get_top_level_name(uni->name); + char *interface_name = + get_top_level_name(shProg->BufferInterfaceBlocks[block_index].Name); + + if (strcmp(var_name, interface_name) == 0) { + /* Deal with instanced array of SSBOs */ + char *temp_name = get_var_name(uni->name); + free(var_name); + var_name = get_top_level_name(temp_name); + free(temp_name); + } + + for (unsigned i = 0; i < shProg->NumShaders; i++) { + if (shProg->Shaders[i] == NULL) + continue; + + const gl_shader *stage = shProg->Shaders[i]; + foreach_in_list(ir_instruction, node, stage->ir) { + ir_variable *var = node->as_variable(); + if (!var || !var->get_interface_type() || + var->data.mode != ir_var_shader_storage) + continue; + + const glsl_type *interface = var->get_interface_type(); + + if (strcmp(interface_name, interface->name) != 0) + continue; + + for (unsigned i = 0; i < interface->length; i++) { + const glsl_struct_field *field = &interface->fields.structure[i]; + if (strcmp(field->name, var_name) != 0) + continue; + /* From GL_ARB_program_interface_query spec: + * + * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer + * identifying the number of active array elements of the top-level + * shader storage block member containing to the active variable is + * written to <params>. If the top-level block member is not + * declared as an array, the value one is written to <params>. If + * the top-level block member is an array with no declared size, + * the value zero is written to <params>. 
+ */ + if (is_top_level_shader_storage_block_member(uni->name, + interface_name, + var_name)) + array_size = 1; + else if (field->type->is_unsized_array()) + array_size = 0; + else if (field->type->is_array()) + array_size = field->type->length; + else + array_size = 1; + + goto found_top_level_array_size; + } + } + } +found_top_level_array_size: + free(interface_name); + free(var_name); + uni->top_level_array_size = array_size; +} + +static void +calculate_array_stride(struct gl_shader_program *shProg, + struct gl_uniform_storage *uni) +{ + int block_index = uni->block_index; + int array_stride = -1; + char *var_name = get_top_level_name(uni->name); + char *interface_name = + get_top_level_name(shProg->BufferInterfaceBlocks[block_index].Name); + + if (strcmp(var_name, interface_name) == 0) { + /* Deal with instanced array of SSBOs */ + char *temp_name = get_var_name(uni->name); + free(var_name); + var_name = get_top_level_name(temp_name); + free(temp_name); + } + + for (unsigned i = 0; i < shProg->NumShaders; i++) { + if (shProg->Shaders[i] == NULL) + continue; + + const gl_shader *stage = shProg->Shaders[i]; + foreach_in_list(ir_instruction, node, stage->ir) { + ir_variable *var = node->as_variable(); + if (!var || !var->get_interface_type() || + var->data.mode != ir_var_shader_storage) + continue; + + const glsl_type *interface = var->get_interface_type(); + + if (strcmp(interface_name, interface->name) != 0) { + continue; + } + + for (unsigned i = 0; i < interface->length; i++) { + const glsl_struct_field *field = &interface->fields.structure[i]; + if (strcmp(field->name, var_name) != 0) + continue; + /* From GL_ARB_program_interface_query: + * + * "For the property TOP_LEVEL_ARRAY_STRIDE, a single integer + * identifying the stride between array elements of the top-level + * shader storage block member containing the active variable is + * written to <params>. 
For top-level block members declared as + * arrays, the value written is the difference, in basic machine + * units, between the offsets of the active variable for + * consecutive elements in the top-level array. For top-level + * block members not declared as an array, zero is written to + * <params>." + */ + if (field->type->is_array()) { + const enum glsl_matrix_layout matrix_layout = + glsl_matrix_layout(field->matrix_layout); + bool row_major = matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR; + const glsl_type *array_type = field->type->fields.array; + + if (is_top_level_shader_storage_block_member(uni->name, + interface_name, + var_name)) { + array_stride = 0; + goto found_top_level_array_stride; + } + if (interface->interface_packing != GLSL_INTERFACE_PACKING_STD430) { + if (array_type->is_record() || array_type->is_array()) { + array_stride = array_type->std140_size(row_major); + array_stride = glsl_align(array_stride, 16); + } else { + unsigned element_base_align = 0; + element_base_align = array_type->std140_base_alignment(row_major); + array_stride = MAX2(element_base_align, 16); + } + } else { + array_stride = array_type->std430_array_stride(row_major); + } + } else { + array_stride = 0; + } + goto found_top_level_array_stride; + } + } + } +found_top_level_array_stride: + free(interface_name); + free(var_name); + uni->top_level_array_stride = array_stride; +} + /** * Builds up a list of program resources that point to existing * resource data. @@ -3473,6 +3711,11 @@ build_program_resource_list(struct gl_shader_program *shProg) shProg->UniformStorage[i].name)) continue; + if (is_shader_storage) { + calculate_array_size(shProg, &shProg->UniformStorage[i]); + calculate_array_stride(shProg, &shProg->UniformStorage[i]); + } + if (!add_program_resource(shProg, type, &shProg->UniformStorage[i], stageref)) return; @@ -3480,10 +3723,10 @@ build_program_resource_list(struct gl_shader_program *shProg) /* Add program uniform blocks and shader storage blocks. 
*/ for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) { - bool is_shader_storage = shProg->UniformBlocks[i].IsShaderStorage; + bool is_shader_storage = shProg->BufferInterfaceBlocks[i].IsShaderStorage; GLenum type = is_shader_storage ? GL_SHADER_STORAGE_BLOCK : GL_UNIFORM_BLOCK; if (!add_program_resource(shProg, type, - &shProg->UniformBlocks[i], 0)) + &shProg->BufferInterfaceBlocks[i], 0)) return; } @@ -3599,6 +3842,42 @@ link_assign_subroutine_types(struct gl_shader_program *prog) } } +static void +split_ubos_and_ssbos(void *mem_ctx, + struct gl_uniform_block *blocks, + unsigned num_blocks, + struct gl_uniform_block ***ubos, + unsigned *num_ubos, + struct gl_uniform_block ***ssbos, + unsigned *num_ssbos) +{ + unsigned num_ubo_blocks = 0; + unsigned num_ssbo_blocks = 0; + + for (unsigned i = 0; i < num_blocks; i++) { + if (blocks[i].IsShaderStorage) + num_ssbo_blocks++; + else + num_ubo_blocks++; + } + + *ubos = ralloc_array(mem_ctx, gl_uniform_block *, num_ubo_blocks); + *num_ubos = 0; + + *ssbos = ralloc_array(mem_ctx, gl_uniform_block *, num_ssbo_blocks); + *num_ssbos = 0; + + for (unsigned i = 0; i < num_blocks; i++) { + if (blocks[i].IsShaderStorage) { + (*ssbos)[(*num_ssbos)++] = &blocks[i]; + } else { + (*ubos)[(*num_ubos)++] = &blocks[i]; + } + } + + assert(*num_ubos + *num_ssbos == num_blocks); +} + void link_shaders(struct gl_context *ctx, struct gl_shader_program *prog) { @@ -4110,6 +4389,31 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog) } } + /* Split BufferInterfaceBlocks into UniformBlocks and ShaderStorageBlocks + * for gl_shader_program and gl_shader, so that drivers that need separate + * index spaces for each set can have that. 
+ */ + for (unsigned i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++) { + if (prog->_LinkedShaders[i] != NULL) { + gl_shader *sh = prog->_LinkedShaders[i]; + split_ubos_and_ssbos(sh, + sh->BufferInterfaceBlocks, + sh->NumBufferInterfaceBlocks, + &sh->UniformBlocks, + &sh->NumUniformBlocks, + &sh->ShaderStorageBlocks, + &sh->NumShaderStorageBlocks); + } + } + + split_ubos_and_ssbos(prog, + prog->BufferInterfaceBlocks, + prog->NumBufferInterfaceBlocks, + &prog->UniformBlocks, + &prog->NumUniformBlocks, + &prog->ShaderStorageBlocks, + &prog->NumShaderStorageBlocks); + /* FINISHME: Assign fragment shader output locations. */ done: diff --git a/src/glsl/lower_named_interface_blocks.cpp b/src/glsl/lower_named_interface_blocks.cpp index 01bbdd0587e..276a2dedf47 100644 --- a/src/glsl/lower_named_interface_blocks.cpp +++ b/src/glsl/lower_named_interface_blocks.cpp @@ -65,6 +65,39 @@ #include "ir_rvalue_visitor.h" #include "program/hash_table.h" +static const glsl_type * +process_array_type(const glsl_type *type, unsigned idx) +{ + const glsl_type *element_type = type->fields.array; + if (element_type->is_array()) { + const glsl_type *new_array_type = process_array_type(element_type, idx); + return glsl_type::get_array_instance(new_array_type, type->length); + } else { + return glsl_type::get_array_instance( + element_type->fields.structure[idx].type, type->length); + } +} + +static ir_rvalue * +process_array_ir(void * const mem_ctx, + ir_dereference_array *deref_array_prev, + ir_rvalue *deref_var) +{ + ir_dereference_array *deref_array = + deref_array_prev->array->as_dereference_array(); + + if (deref_array == NULL) { + return new(mem_ctx) ir_dereference_array(deref_var, + deref_array_prev->array_index); + } else { + deref_array = (ir_dereference_array *) process_array_ir(mem_ctx, + deref_array, + deref_var); + return new(mem_ctx) ir_dereference_array(deref_array, + deref_array_prev->array_index); + } +} + namespace { class flatten_named_interface_blocks_declarations : 
public ir_rvalue_visitor @@ -112,15 +145,9 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions) var->data.mode == ir_var_shader_storage) continue; - const glsl_type * iface_t = var->type; - const glsl_type * array_t = NULL; + const glsl_type * iface_t = var->type->without_array(); exec_node *insert_pos = var; - if (iface_t->is_array()) { - array_t = iface_t; - iface_t = array_t->fields.array; - } - assert (iface_t->is_interface()); for (unsigned i = 0; i < iface_t->length; i++) { @@ -137,7 +164,7 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions) ir_variable *new_var; char *var_name = ralloc_strdup(mem_ctx, iface_t->fields.structure[i].name); - if (array_t == NULL) { + if (!var->type->is_array()) { new_var = new(mem_ctx) ir_variable(iface_t->fields.structure[i].type, var_name, @@ -145,9 +172,7 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions) new_var->data.from_named_ifc_block_nonarray = 1; } else { const glsl_type *new_array_type = - glsl_type::get_array_instance( - iface_t->fields.structure[i].type, - array_t->length); + process_array_type(var->type, i); new_var = new(mem_ctx) ir_variable(new_array_type, var_name, @@ -236,9 +261,8 @@ flatten_named_interface_blocks_declarations::handle_rvalue(ir_rvalue **rvalue) ir_dereference_array *deref_array = ir->record->as_dereference_array(); if (deref_array != NULL) { - *rvalue = - new(mem_ctx) ir_dereference_array(deref_var, - deref_array->array_index); + *rvalue = process_array_ir(mem_ctx, deref_array, + (ir_rvalue *)deref_var); } else { *rvalue = deref_var; } diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp index 247620e6148..e818c048461 100644 --- a/src/glsl/lower_ubo_reference.cpp +++ b/src/glsl/lower_ubo_reference.cpp @@ -203,55 +203,114 @@ static const char * interface_field_name(void *mem_ctx, char *base_name, ir_rvalue *d, ir_rvalue **nonconst_block_index) { - ir_rvalue *previous_index = NULL; 
*nonconst_block_index = NULL; + char *name_copy = NULL; + size_t base_length = 0; + + /* Loop back through the IR until we find the uniform block */ + ir_rvalue *ir = d; + while (ir != NULL) { + switch (ir->ir_type) { + case ir_type_dereference_variable: { + /* Exit loop */ + ir = NULL; + break; + } + + case ir_type_dereference_record: { + ir_dereference_record *r = (ir_dereference_record *) ir; + ir = r->record->as_dereference(); + + /* If we got here it means any previous array subscripts belong to + * block members and not the block itself so skip over them in the + * next pass. + */ + d = ir; + break; + } + + case ir_type_dereference_array: { + ir_dereference_array *a = (ir_dereference_array *) ir; + ir = a->array->as_dereference(); + break; + } + + case ir_type_swizzle: { + ir_swizzle *s = (ir_swizzle *) ir; + ir = s->val->as_dereference(); + break; + } + + default: + assert(!"Should not get here."); + break; + } + } while (d != NULL) { switch (d->ir_type) { case ir_type_dereference_variable: { ir_dereference_variable *v = (ir_dereference_variable *) d; - if (previous_index - && v->var->is_interface_instance() - && v->var->type->is_array()) { - - ir_constant *const_index = previous_index->as_constant(); - if (!const_index) { - *nonconst_block_index = previous_index; - return ralloc_asprintf(mem_ctx, "%s[0]", base_name); - } else { - return ralloc_asprintf(mem_ctx, - "%s[%d]", - base_name, - const_index->get_uint_component(0)); - } + if (name_copy != NULL && + v->var->is_interface_instance() && + v->var->type->is_array()) { + return name_copy; } else { + *nonconst_block_index = NULL; return base_name; } break; } - case ir_type_dereference_record: { - ir_dereference_record *r = (ir_dereference_record *) d; - - d = r->record->as_dereference(); - break; - } - case ir_type_dereference_array: { ir_dereference_array *a = (ir_dereference_array *) d; + size_t new_length; + + if (name_copy == NULL) { + name_copy = ralloc_strdup(mem_ctx, base_name); + base_length = 
strlen(name_copy); + } + + /* For arrays of arrays we start at the innermost array and work our + * way out so we need to insert the subscript at the base of the + * name string rather than just attaching it to the end. + */ + new_length = base_length; + ir_constant *const_index = a->array_index->as_constant(); + char *end = ralloc_strdup(NULL, &name_copy[new_length]); + if (!const_index) { + ir_rvalue *array_index = a->array_index; + if (array_index->type != glsl_type::uint_type) + array_index = i2u(array_index); + + if (a->array->type->is_array() && + a->array->type->fields.array->is_array()) { + ir_constant *base_size = new(mem_ctx) + ir_constant(a->array->type->fields.array->arrays_of_arrays_size()); + array_index = mul(array_index, base_size); + } + + if (*nonconst_block_index) { + *nonconst_block_index = add(*nonconst_block_index, array_index); + } else { + *nonconst_block_index = array_index; + } + + ralloc_asprintf_rewrite_tail(&name_copy, &new_length, "[0]%s", + end); + } else { + ralloc_asprintf_rewrite_tail(&name_copy, &new_length, "[%d]%s", + const_index->get_uint_component(0), + end); + } + ralloc_free(end); d = a->array->as_dereference(); - previous_index = a->array_index; break; } - case ir_type_swizzle: { - ir_swizzle *s = (ir_swizzle *) d; - d = s->val->as_dereference(); - break; - } default: assert(!"Should not get here."); break; @@ -277,27 +336,31 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var, interface_field_name(mem_ctx, (char *) var->get_interface_type()->name, deref, &nonconst_block_index); - /* Locate the ubo block by interface name */ + /* Locate the block by interface name */ + this->is_shader_storage = var->is_in_shader_storage_block(); + unsigned num_blocks; + struct gl_uniform_block **blocks; + if (this->is_shader_storage) { + num_blocks = shader->NumShaderStorageBlocks; + blocks = shader->ShaderStorageBlocks; + } else { + num_blocks = shader->NumUniformBlocks; + blocks = shader->UniformBlocks; + } 
this->uniform_block = NULL; - for (unsigned i = 0; i < shader->NumUniformBlocks; i++) { - if (strcmp(field_name, shader->UniformBlocks[i].Name) == 0) { + for (unsigned i = 0; i < num_blocks; i++) { + if (strcmp(field_name, blocks[i]->Name) == 0) { ir_constant *index = new(mem_ctx) ir_constant(i); if (nonconst_block_index) { - if (nonconst_block_index->type != glsl_type::uint_type) - nonconst_block_index = i2u(nonconst_block_index); this->uniform_block = add(nonconst_block_index, index); } else { this->uniform_block = index; } - this->is_shader_storage = shader->UniformBlocks[i].IsShaderStorage; - - struct gl_uniform_block *block = &shader->UniformBlocks[i]; - this->ubo_var = var->is_interface_instance() - ? &block->Uniforms[0] : &block->Uniforms[var->data.location]; + ? &blocks[i]->Uniforms[0] : &blocks[i]->Uniforms[var->data.location]; break; } @@ -335,7 +398,7 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var, if (deref_array->array->type->is_double()) array_stride *= 2; *matrix_columns = deref_array->array->type->matrix_columns; - } else if (deref_array->type->is_interface()) { + } else if (deref_array->type->without_array()->is_interface()) { /* We're processing an array dereference of an interface instance * array. The thing being dereferenced *must* be a variable * dereference because interfaces cannot be embedded in other @@ -344,7 +407,6 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var, * interface instance array will have the same offsets relative to * the base of the block that backs them. */ - assert(deref_array->array->as_dereference_variable()); deref = deref_array->array->as_dereference(); break; } else { @@ -744,7 +806,31 @@ lower_ubo_reference_visitor::emit_access(bool is_write, * or 32 depending on the number of columns. 
*/ assert(matrix_columns <= 4); - unsigned matrix_stride = glsl_align(matrix_columns * N, 16); + unsigned matrix_stride = 0; + /* Matrix stride for std430 mat2xY matrices are not rounded up to + * vec4 size. From OpenGL 4.3 spec, section 7.6.2.2 "Standard Uniform + * Block Layout": + * + * "2. If the member is a two- or four-component vector with components + * consuming N basic machine units, the base alignment is 2N or 4N, + * respectively." [...] + * "4. If the member is an array of scalars or vectors, the base alignment + * and array stride are set to match the base alignment of a single array + * element, according to rules (1), (2), and (3), and rounded up to the + * base alignment of a vec4." [...] + * "7. If the member is a row-major matrix with C columns and R rows, the + * matrix is stored identically to an array of R row vectors with C + * components each, according to rule (4)." [...] + * "When using the std430 storage layout, shader storage blocks will be + * laid out in buffer storage identically to uniform and shader storage + * blocks using the std140 layout, except that the base alignment and + * stride of arrays of scalars and vectors in rule 4 and of structures in + * rule 9 are not rounded up a multiple of the base alignment of a vec4." + */ + if (packing == GLSL_INTERFACE_PACKING_STD430 && matrix_columns == 2) + matrix_stride = 2 * N; + else + matrix_stride = glsl_align(matrix_columns * N, 16); const glsl_type *deref_type = deref->type->base_type == GLSL_TYPE_FLOAT ? glsl_type::float_type : glsl_type::double_type; diff --git a/src/glsl/lower_vec_index_to_cond_assign.cpp b/src/glsl/lower_vec_index_to_cond_assign.cpp index 0c3394a504b..b6238825f8a 100644 --- a/src/glsl/lower_vec_index_to_cond_assign.cpp +++ b/src/glsl/lower_vec_index_to_cond_assign.cpp @@ -88,7 +88,9 @@ ir_vec_index_to_cond_assign_visitor::convert_vec_index_to_cond_assign(void *mem_ exec_list list; /* Store the index to a temporary to avoid reusing its tree. 
*/ - index = new(base_ir) ir_variable(glsl_type::int_type, + assert(orig_index->type == glsl_type::int_type || + orig_index->type == glsl_type::uint_type); + index = new(base_ir) ir_variable(orig_index->type, "vec_index_tmp_i", ir_var_temporary); list.push_tail(index); diff --git a/src/glsl/lower_vector_insert.cpp b/src/glsl/lower_vector_insert.cpp index 6d7cfa94262..26d31b03c12 100644 --- a/src/glsl/lower_vector_insert.cpp +++ b/src/glsl/lower_vector_insert.cpp @@ -108,9 +108,13 @@ vector_insert_visitor::handle_rvalue(ir_rvalue **rv) factory.emit(assign(temp, expr->operands[0])); factory.emit(assign(src_temp, expr->operands[1])); + assert(expr->operands[2]->type == glsl_type::int_type || + expr->operands[2]->type == glsl_type::uint_type); + for (unsigned i = 0; i < expr->type->vector_elements; i++) { ir_constant *const cmp_index = - new(factory.mem_ctx) ir_constant(int(i)); + ir_constant::zero(factory.mem_ctx, expr->operands[2]->type); + cmp_index->value.u[0] = i; ir_variable *const cmp_result = factory.make_temp(glsl_type::bool_type, "index_condition"); diff --git a/src/glsl/builtin_type_macros.h b/src/glsl/nir/builtin_type_macros.h index 8e16ae45489..8e16ae45489 100644 --- a/src/glsl/builtin_type_macros.h +++ b/src/glsl/nir/builtin_type_macros.h diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp index 6bedb4eb8e6..e57e834d948 100644 --- a/src/glsl/nir/glsl_to_nir.cpp +++ b/src/glsl/nir/glsl_to_nir.cpp @@ -152,11 +152,13 @@ glsl_to_nir(const struct gl_shader_program *shader_prog, if (sh->Program->SamplersUsed & (1 << i)) num_textures = i; - shader->info.name = ralloc_asprintf(shader, "GLSL%d", sh->Name); + shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name); + if (shader_prog->Label) + shader->info.label = ralloc_strdup(shader, shader_prog->Label); shader->info.num_textures = num_textures; shader->info.num_ubos = sh->NumUniformBlocks; shader->info.num_abos = shader_prog->NumAtomicBuffers; - shader->info.num_ssbos = 
shader_prog->NumBufferInterfaceBlocks; + shader->info.num_ssbos = sh->NumShaderStorageBlocks; shader->info.num_images = sh->NumImages; shader->info.inputs_read = sh->Program->InputsRead; shader->info.outputs_written = sh->Program->OutputsWritten; @@ -164,11 +166,37 @@ glsl_to_nir(const struct gl_shader_program *shader_prog, shader->info.uses_texture_gather = sh->Program->UsesGather; shader->info.uses_clip_distance_out = sh->Program->UsesClipDistanceOut; shader->info.separate_shader = shader_prog->SeparateShader; - shader->info.gs.vertices_out = sh->Geom.VerticesOut; - shader->info.gs.invocations = sh->Geom.Invocations; shader->info.has_transform_feedback_varyings = shader_prog->TransformFeedback.NumVarying > 0; + switch (stage) { + case MESA_SHADER_GEOMETRY: + shader->info.gs.vertices_out = sh->Geom.VerticesOut; + shader->info.gs.invocations = sh->Geom.Invocations; + break; + + case MESA_SHADER_FRAGMENT: { + struct gl_fragment_program *fp = + (struct gl_fragment_program *)sh->Program; + + shader->info.fs.uses_discard = fp->UsesKill; + shader->info.fs.early_fragment_tests = sh->EarlyFragmentTests; + shader->info.fs.depth_layout = fp->FragDepthLayout; + break; + } + + case MESA_SHADER_COMPUTE: { + struct gl_compute_program *cp = (struct gl_compute_program *)sh->Program; + shader->info.cs.local_size[0] = cp->LocalSize[0]; + shader->info.cs.local_size[1] = cp->LocalSize[1]; + shader->info.cs.local_size[2] = cp->LocalSize[2]; + break; + } + + default: + break; /* No stage-specific info */ + } + return shader; } @@ -393,35 +421,10 @@ nir_visitor::visit(ir_variable *ir) var->interface_type = ir->get_interface_type(); - switch (var->data.mode) { - case nir_var_local: - exec_list_push_tail(&impl->locals, &var->node); - break; - - case nir_var_global: - exec_list_push_tail(&shader->globals, &var->node); - break; - - case nir_var_shader_in: - exec_list_push_tail(&shader->inputs, &var->node); - break; - - case nir_var_shader_out: - exec_list_push_tail(&shader->outputs, 
&var->node); - break; - - case nir_var_uniform: - case nir_var_shader_storage: - exec_list_push_tail(&shader->uniforms, &var->node); - break; - - case nir_var_system_value: - exec_list_push_tail(&shader->system_values, &var->node); - break; - - default: - unreachable("not reached"); - } + if (var->data.mode == nir_var_local) + nir_function_impl_add_variable(impl, var); + else + nir_shader_add_variable(shader, var); _mesa_hash_table_insert(var_table, ir, var); this->var = var; @@ -695,9 +698,21 @@ nir_visitor::visit(ir_call *ir) } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_xor_internal") == 0) { op = nir_intrinsic_ssbo_atomic_xor; } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_min_internal") == 0) { - op = nir_intrinsic_ssbo_atomic_min; + assert(ir->return_deref); + if (ir->return_deref->type == glsl_type::int_type) + op = nir_intrinsic_ssbo_atomic_imin; + else if (ir->return_deref->type == glsl_type::uint_type) + op = nir_intrinsic_ssbo_atomic_umin; + else + unreachable("Invalid type"); } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_max_internal") == 0) { - op = nir_intrinsic_ssbo_atomic_max; + assert(ir->return_deref); + if (ir->return_deref->type == glsl_type::int_type) + op = nir_intrinsic_ssbo_atomic_imax; + else if (ir->return_deref->type == glsl_type::uint_type) + op = nir_intrinsic_ssbo_atomic_umax; + else + unreachable("Invalid type"); } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_exchange_internal") == 0) { op = nir_intrinsic_ssbo_atomic_exchange; } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_comp_swap_internal") == 0) { @@ -906,8 +921,10 @@ nir_visitor::visit(ir_call *ir) break; } case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_min: - case nir_intrinsic_ssbo_atomic_max: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: case 
nir_intrinsic_ssbo_atomic_and: case nir_intrinsic_ssbo_atomic_or: case nir_intrinsic_ssbo_atomic_xor: @@ -2065,13 +2082,10 @@ nir_visitor::visit(ir_constant *ir) * constant initializer and return a dereference. */ - nir_variable *var = ralloc(this->shader, nir_variable); - var->name = ralloc_strdup(var, "const_temp"); - var->type = ir->type; - var->data.mode = nir_var_local; + nir_variable *var = + nir_local_variable_create(this->impl, ir->type, "const_temp"); var->data.read_only = true; var->constant_initializer = constant_copy(ir, var); - exec_list_push_tail(&this->impl->locals, &var->node); this->deref_head = nir_deref_var_create(this->shader, var); this->deref_tail = &this->deref_head->deref; diff --git a/src/glsl/glsl_types.cpp b/src/glsl/nir/glsl_types.cpp index 9ef2fbf2525..309f9dca61e 100644 --- a/src/glsl/glsl_types.cpp +++ b/src/glsl/nir/glsl_types.cpp @@ -1175,7 +1175,22 @@ glsl_type::record_location_offset(unsigned length) const const glsl_type *wa = st->without_array(); if (wa->is_record()) { unsigned r_offset = wa->record_location_offset(wa->length); - offset += st->is_array() ? st->length * r_offset : r_offset; + offset += st->is_array() ? + st->arrays_of_arrays_size() * r_offset : r_offset; + } else if (st->is_array() && st->fields.array->is_array()) { + unsigned outer_array_size = st->length; + const glsl_type *base_type = st->fields.array; + + /* For arrays of arrays the outer arrays take up a uniform + * slot for each element. The innermost array elements share a + * single slot so we ignore the innermost array when calculating + * the offset. 
+ */ + while (base_type->fields.array->is_array()) { + outer_array_size = outer_array_size * base_type->length; + base_type = base_type->fields.array; + } + offset += outer_array_size; } else { /* We dont worry about arrays here because unless the array * contains a structure or another array it only takes up a single @@ -1419,8 +1434,8 @@ glsl_type::std140_size(bool row_major) const unsigned int array_len; if (this->is_array()) { - element_type = this->fields.array; - array_len = this->length; + element_type = this->without_array(); + array_len = this->arrays_of_arrays_size(); } else { element_type = this; array_len = 1; @@ -1453,12 +1468,13 @@ glsl_type::std140_size(bool row_major) const * the array are laid out in order, according to rule (9). */ if (this->is_array()) { - if (this->fields.array->is_record()) { - return this->length * this->fields.array->std140_size(row_major); + if (this->without_array()->is_record()) { + return this->arrays_of_arrays_size() * + this->without_array()->std140_size(row_major); } else { - unsigned element_base_align = - this->fields.array->std140_base_alignment(row_major); - return this->length * MAX2(element_base_align, 16); + unsigned element_base_align = + this->without_array()->std140_base_alignment(row_major); + return this->arrays_of_arrays_size() * MAX2(element_base_align, 16); } } @@ -1818,3 +1834,17 @@ glsl_type::coordinate_components() const return size; } + +/** + * Declarations of type flyweights (glsl_type::_foo_type) and + * convenience pointers (glsl_type::foo_type). + * @{ + */ +#define DECL_TYPE(NAME, ...) 
\ + const glsl_type glsl_type::_##NAME##_type = glsl_type(__VA_ARGS__, #NAME); \ + const glsl_type *const glsl_type::NAME##_type = &glsl_type::_##NAME##_type; + +#define STRUCT_TYPE(NAME) + +#include "builtin_type_macros.h" +/** @} */ diff --git a/src/glsl/glsl_types.h b/src/glsl/nir/glsl_types.h index b83e1ca3d2c..b83e1ca3d2c 100644 --- a/src/glsl/glsl_types.h +++ b/src/glsl/nir/glsl_types.h diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c index e12da805281..793bdafb54b 100644 --- a/src/glsl/nir/nir.c +++ b/src/glsl/nir/nir.c @@ -103,6 +103,72 @@ nir_reg_remove(nir_register *reg) exec_node_remove(&reg->node); } +void +nir_shader_add_variable(nir_shader *shader, nir_variable *var) +{ + switch (var->data.mode) { + case nir_var_local: + assert(!"nir_shader_add_variable cannot be used for local variables"); + break; + + case nir_var_global: + exec_list_push_tail(&shader->globals, &var->node); + break; + + case nir_var_shader_in: + exec_list_push_tail(&shader->inputs, &var->node); + break; + + case nir_var_shader_out: + exec_list_push_tail(&shader->outputs, &var->node); + break; + + case nir_var_uniform: + case nir_var_shader_storage: + exec_list_push_tail(&shader->uniforms, &var->node); + break; + + case nir_var_system_value: + exec_list_push_tail(&shader->system_values, &var->node); + break; + } +} + +nir_variable * +nir_variable_create(nir_shader *shader, nir_variable_mode mode, + const struct glsl_type *type, const char *name) +{ + nir_variable *var = rzalloc(shader, nir_variable); + var->name = ralloc_strdup(var, name); + var->type = type; + var->data.mode = mode; + + if ((mode == nir_var_shader_in && shader->stage != MESA_SHADER_VERTEX) || + (mode == nir_var_shader_out && shader->stage != MESA_SHADER_FRAGMENT)) + var->data.interpolation = INTERP_QUALIFIER_SMOOTH; + + if (mode == nir_var_shader_in || mode == nir_var_uniform) + var->data.read_only = true; + + nir_shader_add_variable(shader, var); + + return var; +} + +nir_variable * 
+nir_local_variable_create(nir_function_impl *impl, + const struct glsl_type *type, const char *name) +{ + nir_variable *var = rzalloc(impl->overload->function->shader, nir_variable); + var->name = ralloc_strdup(var, name); + var->type = type; + var->data.mode = nir_var_local; + + nir_function_impl_add_variable(impl, var); + + return var; +} + nir_function * nir_function_create(nir_shader *shader, const char *name) { @@ -1080,31 +1146,33 @@ nir_src_as_const_value(nir_src src) return &load->value; } +/** + * Returns true if the source is known to be dynamically uniform. Otherwise it + * returns false which means it may or may not be dynamically uniform but it + * can't be determined. + */ bool -nir_srcs_equal(nir_src src1, nir_src src2) +nir_src_is_dynamically_uniform(nir_src src) { - if (src1.is_ssa) { - if (src2.is_ssa) { - return src1.ssa == src2.ssa; - } else { - return false; - } - } else { - if (src2.is_ssa) { - return false; - } else { - if ((src1.reg.indirect == NULL) != (src2.reg.indirect == NULL)) - return false; + if (!src.is_ssa) + return false; - if (src1.reg.indirect) { - if (!nir_srcs_equal(*src1.reg.indirect, *src2.reg.indirect)) - return false; - } + /* Constants are trivially dynamically uniform */ + if (src.ssa->parent_instr->type == nir_instr_type_load_const) + return true; - return src1.reg.reg == src2.reg.reg && - src1.reg.base_offset == src2.reg.base_offset; - } + /* As are uniform variables */ + if (src.ssa->parent_instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src.ssa->parent_instr); + + if (intr->intrinsic == nir_intrinsic_load_uniform) + return true; } + + /* XXX: this could have many more tests, such as when a sampler function is + * called with dynamically uniform arguments. 
+ */ + return false; } static void diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h index f7b9483d74a..825c34805c4 100644 --- a/src/glsl/nir/nir.h +++ b/src/glsl/nir/nir.h @@ -35,7 +35,7 @@ #include "util/set.h" #include "util/bitset.h" #include "nir_types.h" -#include "glsl/shader_enums.h" +#include "shader_enums.h" #include <stdio.h> #include "nir_opcodes.h" @@ -738,7 +738,7 @@ nir_alu_instr_channel_used(nir_alu_instr *instr, unsigned src, unsigned channel) * used for a source */ static inline unsigned -nir_ssa_alu_instr_src_components(nir_alu_instr *instr, unsigned src) +nir_ssa_alu_instr_src_components(const nir_alu_instr *instr, unsigned src) { assert(instr->dest.dest.is_ssa); @@ -1486,6 +1486,9 @@ typedef struct nir_shader_compiler_options { typedef struct nir_shader_info { const char *name; + /* Descriptive name provided by the client; may be NULL */ + const char *label; + /* Number of textures used by this shader */ unsigned num_textures; /* Number of uniform buffers used by this shader */ @@ -1516,13 +1519,32 @@ typedef struct nir_shader_info { /** Was this shader linked with any transform feedback varyings? */ bool has_transform_feedback_varyings; - struct { - /** The maximum number of vertices the geometry shader might write. */ - unsigned vertices_out; + union { + struct { + /** The maximum number of vertices the geometry shader might write. */ + unsigned vertices_out; + + /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */ + unsigned invocations; + } gs; + + struct { + bool uses_discard; + + /** + * Whether early fragment tests are enabled as defined by + * ARB_shader_image_load_store. + */ + bool early_fragment_tests; + + /** gl_FragDepth layout for ARB_conservative_depth. */ + enum gl_frag_depth_layout depth_layout; + } fs; - /** 1 .. 
MAX_GEOMETRY_SHADER_INVOCATIONS */ - unsigned invocations; - } gs; + struct { + unsigned local_size[3]; + } cs; + }; } nir_shader_info; typedef struct nir_shader { @@ -1585,6 +1607,26 @@ nir_register *nir_local_reg_create(nir_function_impl *impl); void nir_reg_remove(nir_register *reg); +/** Adds a variable to the appropreate list in nir_shader */ +void nir_shader_add_variable(nir_shader *shader, nir_variable *var); + +static inline void +nir_function_impl_add_variable(nir_function_impl *impl, nir_variable *var) +{ + assert(var->data.mode == nir_var_local); + exec_list_push_tail(&impl->locals, &var->node); +} + +/** creates a variable, sets a few defaults, and adds it to the list */ +nir_variable *nir_variable_create(nir_shader *shader, + nir_variable_mode mode, + const struct glsl_type *type, + const char *name); +/** creates a local variable and adds it to the list */ +nir_variable *nir_local_variable_create(nir_function_impl *impl, + const struct glsl_type *type, + const char *name); + /** creates a function and adds it to the shader's list of functions */ nir_function *nir_function_create(nir_shader *shader, const char *name); @@ -1821,6 +1863,7 @@ bool nir_foreach_dest(nir_instr *instr, nir_foreach_dest_cb cb, void *state); bool nir_foreach_src(nir_instr *instr, nir_foreach_src_cb cb, void *state); nir_const_value *nir_src_as_const_value(nir_src src); +bool nir_src_is_dynamically_uniform(nir_src src); bool nir_srcs_equal(nir_src src1, nir_src src2); void nir_instr_rewrite_src(nir_instr *instr, nir_src *src, nir_src new_src); void nir_instr_move_src(nir_instr *dest_instr, nir_src *dest, nir_src *src); diff --git a/src/glsl/nir/nir_constant_expressions.py b/src/glsl/nir/nir_constant_expressions.py index 8fd9b1039a7..2ba8554645d 100644 --- a/src/glsl/nir/nir_constant_expressions.py +++ b/src/glsl/nir/nir_constant_expressions.py @@ -29,6 +29,7 @@ template = """\ #include <math.h> #include "main/core.h" #include "util/rounding.h" /* for _mesa_roundeven */ +#include 
"util/half_float.h" #include "nir_constant_expressions.h" #if defined(__SUNPRO_CC) diff --git a/src/glsl/nir/nir_instr_set.c b/src/glsl/nir/nir_instr_set.c new file mode 100644 index 00000000000..d3f939fe805 --- /dev/null +++ b/src/glsl/nir/nir_instr_set.c @@ -0,0 +1,519 @@ +/* + * Copyright © 2014 Connor Abbott + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "nir_instr_set.h" +#include "nir_vla.h" + +#define HASH(hash, data) _mesa_fnv32_1a_accumulate((hash), (data)) + +static uint32_t +hash_src(uint32_t hash, const nir_src *src) +{ + assert(src->is_ssa); + hash = HASH(hash, src->ssa); + return hash; +} + +static uint32_t +hash_alu_src(uint32_t hash, const nir_alu_src *src, unsigned num_components) +{ + hash = HASH(hash, src->abs); + hash = HASH(hash, src->negate); + + for (unsigned i = 0; i < num_components; i++) + hash = HASH(hash, src->swizzle[i]); + + hash = hash_src(hash, &src->src); + return hash; +} + +static uint32_t +hash_alu(uint32_t hash, const nir_alu_instr *instr) +{ + hash = HASH(hash, instr->op); + hash = HASH(hash, instr->dest.dest.ssa.num_components); + + if (nir_op_infos[instr->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) { + assert(nir_op_infos[instr->op].num_inputs == 2); + uint32_t hash0 = hash_alu_src(hash, &instr->src[0], + nir_ssa_alu_instr_src_components(instr, 0)); + uint32_t hash1 = hash_alu_src(hash, &instr->src[1], + nir_ssa_alu_instr_src_components(instr, 1)); + /* For commutative operations, we need some commutative way of + * combining the hashes. One option would be to XOR them but that + * means that anything with two identical sources will hash to 0 and + * that's common enough we probably don't want the guaranteed + * collision. Either addition or multiplication will also work. 
+ */ + hash = hash0 * hash1; + } else { + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + hash = hash_alu_src(hash, &instr->src[i], + nir_ssa_alu_instr_src_components(instr, i)); + } + } + + return hash; +} + +static uint32_t +hash_load_const(uint32_t hash, const nir_load_const_instr *instr) +{ + hash = HASH(hash, instr->def.num_components); + + hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f, + instr->def.num_components + * sizeof(instr->value.f[0])); + + return hash; +} + +static int +cmp_phi_src(const void *data1, const void *data2) +{ + nir_phi_src *src1 = *(nir_phi_src **)data1; + nir_phi_src *src2 = *(nir_phi_src **)data2; + return src1->pred - src2->pred; +} + +static uint32_t +hash_phi(uint32_t hash, const nir_phi_instr *instr) +{ + hash = HASH(hash, instr->instr.block); + + /* sort sources by predecessor, since the order shouldn't matter */ + unsigned num_preds = instr->instr.block->predecessors->entries; + NIR_VLA(nir_phi_src *, srcs, num_preds); + unsigned i = 0; + nir_foreach_phi_src(instr, src) { + srcs[i++] = src; + } + + qsort(srcs, num_preds, sizeof(nir_phi_src *), cmp_phi_src); + + for (i = 0; i < num_preds; i++) { + hash = hash_src(hash, &srcs[i]->src); + hash = HASH(hash, srcs[i]->pred); + } + + return hash; +} + +static uint32_t +hash_intrinsic(uint32_t hash, const nir_intrinsic_instr *instr) +{ + const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; + hash = HASH(hash, instr->intrinsic); + + if (info->has_dest) + hash = HASH(hash, instr->dest.ssa.num_components); + + assert(info->num_variables == 0); + + hash = _mesa_fnv32_1a_accumulate_block(hash, instr->const_index, + info->num_indices + * sizeof(instr->const_index[0])); + return hash; +} + +static uint32_t +hash_tex(uint32_t hash, const nir_tex_instr *instr) +{ + hash = HASH(hash, instr->op); + hash = HASH(hash, instr->num_srcs); + + for (unsigned i = 0; i < instr->num_srcs; i++) { + hash = HASH(hash, instr->src[i].src_type); + hash = 
hash_src(hash, &instr->src[i].src); + } + + hash = HASH(hash, instr->coord_components); + hash = HASH(hash, instr->sampler_dim); + hash = HASH(hash, instr->is_array); + hash = HASH(hash, instr->is_shadow); + hash = HASH(hash, instr->is_new_style_shadow); + hash = HASH(hash, instr->const_offset); + unsigned component = instr->component; + hash = HASH(hash, component); + hash = HASH(hash, instr->sampler_index); + hash = HASH(hash, instr->sampler_array_size); + + assert(!instr->sampler); + + return hash; +} + +/* Computes a hash of an instruction for use in a hash table. Note that this + * will only work for instructions where instr_can_rewrite() returns true, and + * it should return identical hashes for two instructions that are the same + * according nir_instrs_equal(). + */ + +static uint32_t +hash_instr(const void *data) +{ + const nir_instr *instr = data; + uint32_t hash = _mesa_fnv32_1a_offset_bias; + + switch (instr->type) { + case nir_instr_type_alu: + hash = hash_alu(hash, nir_instr_as_alu(instr)); + break; + case nir_instr_type_load_const: + hash = hash_load_const(hash, nir_instr_as_load_const(instr)); + break; + case nir_instr_type_phi: + hash = hash_phi(hash, nir_instr_as_phi(instr)); + break; + case nir_instr_type_intrinsic: + hash = hash_intrinsic(hash, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_tex: + hash = hash_tex(hash, nir_instr_as_tex(instr)); + break; + default: + unreachable("Invalid instruction type"); + } + + return hash; +} + +bool +nir_srcs_equal(nir_src src1, nir_src src2) +{ + if (src1.is_ssa) { + if (src2.is_ssa) { + return src1.ssa == src2.ssa; + } else { + return false; + } + } else { + if (src2.is_ssa) { + return false; + } else { + if ((src1.reg.indirect == NULL) != (src2.reg.indirect == NULL)) + return false; + + if (src1.reg.indirect) { + if (!nir_srcs_equal(*src1.reg.indirect, *src2.reg.indirect)) + return false; + } + + return src1.reg.reg == src2.reg.reg && + src1.reg.base_offset == src2.reg.base_offset; + } + 
} +} + +static bool +nir_alu_srcs_equal(const nir_alu_instr *alu1, const nir_alu_instr *alu2, + unsigned src1, unsigned src2) +{ + if (alu1->src[src1].abs != alu2->src[src2].abs || + alu1->src[src1].negate != alu2->src[src2].negate) + return false; + + for (unsigned i = 0; i < nir_ssa_alu_instr_src_components(alu1, src1); i++) { + if (alu1->src[src1].swizzle[i] != alu2->src[src2].swizzle[i]) + return false; + } + + return nir_srcs_equal(alu1->src[src1].src, alu2->src[src2].src); +} + +/* Returns "true" if two instructions are equal. Note that this will only + * work for the subset of instructions defined by instr_can_rewrite(). Also, + * it should only return "true" for instructions that hash_instr() will return + * the same hash for (ignoring collisions, of course). + */ + +static bool +nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2) +{ + if (instr1->type != instr2->type) + return false; + + switch (instr1->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu1 = nir_instr_as_alu(instr1); + nir_alu_instr *alu2 = nir_instr_as_alu(instr2); + + if (alu1->op != alu2->op) + return false; + + /* TODO: We can probably acutally do something more inteligent such + * as allowing different numbers and taking a maximum or something + * here */ + if (alu1->dest.dest.ssa.num_components != alu2->dest.dest.ssa.num_components) + return false; + + if (nir_op_infos[alu1->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) { + assert(nir_op_infos[alu1->op].num_inputs == 2); + return (nir_alu_srcs_equal(alu1, alu2, 0, 0) && + nir_alu_srcs_equal(alu1, alu2, 1, 1)) || + (nir_alu_srcs_equal(alu1, alu2, 0, 1) && + nir_alu_srcs_equal(alu1, alu2, 1, 0)); + } else { + for (unsigned i = 0; i < nir_op_infos[alu1->op].num_inputs; i++) { + if (!nir_alu_srcs_equal(alu1, alu2, i, i)) + return false; + } + } + return true; + } + case nir_instr_type_tex: { + nir_tex_instr *tex1 = nir_instr_as_tex(instr1); + nir_tex_instr *tex2 = nir_instr_as_tex(instr2); + + if (tex1->op != 
tex2->op) + return false; + + if (tex1->num_srcs != tex2->num_srcs) + return false; + for (unsigned i = 0; i < tex1->num_srcs; i++) { + if (tex1->src[i].src_type != tex2->src[i].src_type || + !nir_srcs_equal(tex1->src[i].src, tex2->src[i].src)) { + return false; + } + } + + if (tex1->coord_components != tex2->coord_components || + tex1->sampler_dim != tex2->sampler_dim || + tex1->is_array != tex2->is_array || + tex1->is_shadow != tex2->is_shadow || + tex1->is_new_style_shadow != tex2->is_new_style_shadow || + memcmp(tex1->const_offset, tex2->const_offset, + sizeof(tex1->const_offset)) != 0 || + tex1->component != tex2->component || + tex1->sampler_index != tex2->sampler_index || + tex1->sampler_array_size != tex2->sampler_array_size) { + return false; + } + + /* Don't support un-lowered sampler derefs currently. */ + assert(!tex1->sampler && !tex2->sampler); + + return true; + } + case nir_instr_type_load_const: { + nir_load_const_instr *load1 = nir_instr_as_load_const(instr1); + nir_load_const_instr *load2 = nir_instr_as_load_const(instr2); + + if (load1->def.num_components != load2->def.num_components) + return false; + + return memcmp(load1->value.f, load2->value.f, + load1->def.num_components * sizeof(*load2->value.f)) == 0; + } + case nir_instr_type_phi: { + nir_phi_instr *phi1 = nir_instr_as_phi(instr1); + nir_phi_instr *phi2 = nir_instr_as_phi(instr2); + + if (phi1->instr.block != phi2->instr.block) + return false; + + nir_foreach_phi_src(phi1, src1) { + nir_foreach_phi_src(phi2, src2) { + if (src1->pred == src2->pred) { + if (!nir_srcs_equal(src1->src, src2->src)) + return false; + + break; + } + } + } + + return true; + } + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrinsic1 = nir_instr_as_intrinsic(instr1); + nir_intrinsic_instr *intrinsic2 = nir_instr_as_intrinsic(instr2); + const nir_intrinsic_info *info = + &nir_intrinsic_infos[intrinsic1->intrinsic]; + + if (intrinsic1->intrinsic != intrinsic2->intrinsic || + intrinsic1->num_components 
!= intrinsic2->num_components) + return false; + + if (info->has_dest && intrinsic1->dest.ssa.num_components != + intrinsic2->dest.ssa.num_components) + return false; + + for (unsigned i = 0; i < info->num_srcs; i++) { + if (!nir_srcs_equal(intrinsic1->src[i], intrinsic2->src[i])) + return false; + } + + assert(info->num_variables == 0); + + for (unsigned i = 0; i < info->num_indices; i++) { + if (intrinsic1->const_index[i] != intrinsic2->const_index[i]) + return false; + } + + return true; + } + case nir_instr_type_call: + case nir_instr_type_jump: + case nir_instr_type_ssa_undef: + case nir_instr_type_parallel_copy: + default: + unreachable("Invalid instruction type"); + } + + return false; +} + +static bool +src_is_ssa(nir_src *src, void *data) +{ + (void) data; + return src->is_ssa; +} + +static bool +dest_is_ssa(nir_dest *dest, void *data) +{ + (void) data; + return dest->is_ssa; +} + +/* This function determines if uses of an instruction can safely be rewritten + * to use another identical instruction instead. Note that this function must + * be kept in sync with hash_instr() and nir_instrs_equal() -- only + * instructions that pass this test will be handed on to those functions, and + * conversely they must handle everything that this function returns true for. + */ + +static bool +instr_can_rewrite(nir_instr *instr) +{ + /* We only handle SSA. */ + if (!nir_foreach_dest(instr, dest_is_ssa, NULL) || + !nir_foreach_src(instr, src_is_ssa, NULL)) + return false; + + switch (instr->type) { + case nir_instr_type_alu: + case nir_instr_type_load_const: + case nir_instr_type_phi: + return true; + case nir_instr_type_tex: { + nir_tex_instr *tex = nir_instr_as_tex(instr); + + /* Don't support un-lowered sampler derefs currently. 
*/ + if (tex->sampler) + return false; + + return true; + } + case nir_instr_type_intrinsic: { + const nir_intrinsic_info *info = + &nir_intrinsic_infos[nir_instr_as_intrinsic(instr)->intrinsic]; + return (info->flags & NIR_INTRINSIC_CAN_ELIMINATE) && + (info->flags & NIR_INTRINSIC_CAN_REORDER) && + info->num_variables == 0; /* not implemented yet */ + } + case nir_instr_type_call: + case nir_instr_type_jump: + case nir_instr_type_ssa_undef: + return false; + case nir_instr_type_parallel_copy: + default: + unreachable("Invalid instruction type"); + } + + return false; +} + +static nir_ssa_def * +nir_instr_get_dest_ssa_def(nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_alu: + assert(nir_instr_as_alu(instr)->dest.dest.is_ssa); + return &nir_instr_as_alu(instr)->dest.dest.ssa; + case nir_instr_type_load_const: + return &nir_instr_as_load_const(instr)->def; + case nir_instr_type_phi: + assert(nir_instr_as_phi(instr)->dest.is_ssa); + return &nir_instr_as_phi(instr)->dest.ssa; + case nir_instr_type_intrinsic: + assert(nir_instr_as_intrinsic(instr)->dest.is_ssa); + return &nir_instr_as_intrinsic(instr)->dest.ssa; + case nir_instr_type_tex: + assert(nir_instr_as_tex(instr)->dest.is_ssa); + return &nir_instr_as_tex(instr)->dest.ssa; + default: + unreachable("We never ask for any of these"); + } +} + +static bool +cmp_func(const void *data1, const void *data2) +{ + return nir_instrs_equal(data1, data2); +} + +struct set * +nir_instr_set_create(void *mem_ctx) +{ + return _mesa_set_create(mem_ctx, hash_instr, cmp_func); +} + +void +nir_instr_set_destroy(struct set *instr_set) +{ + _mesa_set_destroy(instr_set, NULL); +} + +bool +nir_instr_set_add_or_rewrite(struct set *instr_set, nir_instr *instr) +{ + if (!instr_can_rewrite(instr)) + return false; + + struct set_entry *entry = _mesa_set_search(instr_set, instr); + if (entry) { + nir_ssa_def *def = nir_instr_get_dest_ssa_def(instr); + nir_ssa_def *new_def = + nir_instr_get_dest_ssa_def((nir_instr *) 
entry->key); + nir_ssa_def_rewrite_uses(def, nir_src_for_ssa(new_def)); + return true; + } + + _mesa_set_add(instr_set, instr); + return false; +} + +void +nir_instr_set_remove(struct set *instr_set, nir_instr *instr) +{ + if (!instr_can_rewrite(instr)) + return; + + struct set_entry *entry = _mesa_set_search(instr_set, instr); + if (entry) + _mesa_set_remove(instr_set, entry); +} + diff --git a/src/glsl/nir/nir_instr_set.h b/src/glsl/nir/nir_instr_set.h new file mode 100644 index 00000000000..939e8ddbf58 --- /dev/null +++ b/src/glsl/nir/nir_instr_set.h @@ -0,0 +1,62 @@ +/* + * Copyright © 2014 Connor Abbott + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include "nir.h" + +/** + * This file defines functions for creating, destroying, and manipulating an + * "instruction set," which is an abstraction for finding duplicate + * instructions using a hash set. 
Note that the question of whether an + * instruction is actually a duplicate (e.g. whether it has any side effects) + * is handled transparently. The user can pass any instruction to + * nir_instr_set_add_or_rewrite() and nir_instr_set_remove(), and if the + * instruction isn't safe to rewrite or isn't supported, it's silently + * removed. + */ + +/*@{*/ + +/** Creates an instruction set, using a given ralloc mem_ctx */ +struct set *nir_instr_set_create(void *mem_ctx); + +/** Destroys an instruction set. */ +void nir_instr_set_destroy(struct set *instr_set); + +/** + * Adds an instruction to an instruction set if it doesn't exist, or if it + * does already exist, rewrites all uses of it to point to the other + * already-inserted instruction. Returns 'true' if the uses of the instruction + * were rewritten. + */ +bool nir_instr_set_add_or_rewrite(struct set *instr_set, nir_instr *instr); + +/** + * Removes an instruction from an instruction set, so that other instructions + * won't be merged with it. + */ +void nir_instr_set_remove(struct set *instr_set, nir_instr *instr); + +/*@}*/ + diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h index b5a0d715aa3..68a18b9c11a 100644 --- a/src/glsl/nir/nir_intrinsics.h +++ b/src/glsl/nir/nir_intrinsics.h @@ -174,8 +174,10 @@ INTRINSIC(image_samples, 0, ARR(), true, 1, 1, 0, * 3: For CompSwap only: the second data parameter. 
*/ INTRINSIC(ssbo_atomic_add, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) -INTRINSIC(ssbo_atomic_min, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) -INTRINSIC(ssbo_atomic_max, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) +INTRINSIC(ssbo_atomic_imin, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) +INTRINSIC(ssbo_atomic_umin, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) +INTRINSIC(ssbo_atomic_imax, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) +INTRINSIC(ssbo_atomic_umax, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) INTRINSIC(ssbo_atomic_and, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) INTRINSIC(ssbo_atomic_or, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) INTRINSIC(ssbo_atomic_xor, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) diff --git a/src/glsl/nir/nir_lower_atomics.c b/src/glsl/nir/nir_lower_atomics.c index 6f9ecc019ec..46e137652a1 100644 --- a/src/glsl/nir/nir_lower_atomics.c +++ b/src/glsl/nir/nir_lower_atomics.c @@ -72,20 +72,22 @@ lower_instr(nir_intrinsic_instr *instr, nir_function_impl *impl) nir_ssa_def *offset_def = &offset_const->def; - if (instr->variables[0]->deref.child != NULL) { - assert(instr->variables[0]->deref.child->deref_type == - nir_deref_type_array); - nir_deref_array *deref_array = - nir_deref_as_array(instr->variables[0]->deref.child); - assert(deref_array->deref.child == NULL); + nir_deref *tail = &instr->variables[0]->deref; + while (tail->child != NULL) { + assert(tail->child->deref_type == nir_deref_type_array); + nir_deref_array *deref_array = nir_deref_as_array(tail->child); + tail = tail->child; - offset_const->value.u[0] += - deref_array->base_offset * ATOMIC_COUNTER_SIZE; + unsigned child_array_elements = tail->child != NULL ? 
+ glsl_get_aoa_size(tail->type) : 1; + + offset_const->value.u[0] += deref_array->base_offset * + child_array_elements * ATOMIC_COUNTER_SIZE; if (deref_array->deref_array_type == nir_deref_array_type_indirect) { nir_load_const_instr *atomic_counter_size = nir_load_const_instr_create(mem_ctx, 1); - atomic_counter_size->value.u[0] = ATOMIC_COUNTER_SIZE; + atomic_counter_size->value.u[0] = child_array_elements * ATOMIC_COUNTER_SIZE; nir_instr_insert_before(&instr->instr, &atomic_counter_size->instr); nir_alu_instr *mul = nir_alu_instr_create(mem_ctx, nir_op_imul); @@ -102,7 +104,7 @@ lower_instr(nir_intrinsic_instr *instr, nir_function_impl *impl) add->src[0].src.is_ssa = true; add->src[0].src.ssa = &mul->dest.dest.ssa; add->src[1].src.is_ssa = true; - add->src[1].src.ssa = &offset_const->def; + add->src[1].src.ssa = offset_def; nir_instr_insert_before(&instr->instr, &add->instr); offset_def = &add->dest.dest.ssa; diff --git a/src/glsl/nir/nir_opt_cse.c b/src/glsl/nir/nir_opt_cse.c index 64c94afd480..93a6635337a 100644 --- a/src/glsl/nir/nir_opt_cse.c +++ b/src/glsl/nir/nir_opt_cse.c @@ -22,306 +22,60 @@ * * Authors: * Jason Ekstrand ([email protected]) + * Connor Abbott ([email protected]) * */ -#include "nir.h" +#include "nir_instr_set.h" /* * Implements common subexpression elimination */ -struct cse_state { - void *mem_ctx; - bool progress; -}; - -static bool -nir_alu_srcs_equal(nir_alu_instr *alu1, nir_alu_instr *alu2, unsigned src1, - unsigned src2) -{ - if (alu1->src[src1].abs != alu2->src[src2].abs || - alu1->src[src1].negate != alu2->src[src2].negate) - return false; - - for (unsigned i = 0; i < nir_ssa_alu_instr_src_components(alu1, src1); i++) { - if (alu1->src[src1].swizzle[i] != alu2->src[src2].swizzle[i]) - return false; - } - - return nir_srcs_equal(alu1->src[src1].src, alu2->src[src2].src); -} - -static bool -nir_instrs_equal(nir_instr *instr1, nir_instr *instr2) -{ - if (instr1->type != instr2->type) - return false; - - switch (instr1->type) { - case 
nir_instr_type_alu: { - nir_alu_instr *alu1 = nir_instr_as_alu(instr1); - nir_alu_instr *alu2 = nir_instr_as_alu(instr2); - - if (alu1->op != alu2->op) - return false; - - /* TODO: We can probably acutally do something more inteligent such - * as allowing different numbers and taking a maximum or something - * here */ - if (alu1->dest.dest.ssa.num_components != alu2->dest.dest.ssa.num_components) - return false; - - if (nir_op_infos[alu1->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) { - assert(nir_op_infos[alu1->op].num_inputs == 2); - return (nir_alu_srcs_equal(alu1, alu2, 0, 0) && - nir_alu_srcs_equal(alu1, alu2, 1, 1)) || - (nir_alu_srcs_equal(alu1, alu2, 0, 1) && - nir_alu_srcs_equal(alu1, alu2, 1, 0)); - } else { - for (unsigned i = 0; i < nir_op_infos[alu1->op].num_inputs; i++) { - if (!nir_alu_srcs_equal(alu1, alu2, i, i)) - return false; - } - } - return true; - } - case nir_instr_type_tex: { - nir_tex_instr *tex1 = nir_instr_as_tex(instr1); - nir_tex_instr *tex2 = nir_instr_as_tex(instr2); - - if (tex1->op != tex2->op) - return false; - - if (tex1->num_srcs != tex2->num_srcs) - return false; - for (unsigned i = 0; i < tex1->num_srcs; i++) { - if (tex1->src[i].src_type != tex2->src[i].src_type || - !nir_srcs_equal(tex1->src[i].src, tex2->src[i].src)) { - return false; - } - } - - if (tex1->coord_components != tex2->coord_components || - tex1->sampler_dim != tex2->sampler_dim || - tex1->is_array != tex2->is_array || - tex1->is_shadow != tex2->is_shadow || - tex1->is_new_style_shadow != tex2->is_new_style_shadow || - memcmp(tex1->const_offset, tex2->const_offset, - sizeof(tex1->const_offset)) != 0 || - tex1->component != tex2->component || - tex1->sampler_index != tex2->sampler_index || - tex1->sampler_array_size != tex2->sampler_array_size) { - return false; - } - - /* Don't support un-lowered sampler derefs currently. 
*/ - if (tex1->sampler || tex2->sampler) - return false; - - return true; - } - case nir_instr_type_load_const: { - nir_load_const_instr *load1 = nir_instr_as_load_const(instr1); - nir_load_const_instr *load2 = nir_instr_as_load_const(instr2); - - if (load1->def.num_components != load2->def.num_components) - return false; - - return memcmp(load1->value.f, load2->value.f, - load1->def.num_components * sizeof(*load2->value.f)) == 0; - } - case nir_instr_type_phi: { - nir_phi_instr *phi1 = nir_instr_as_phi(instr1); - nir_phi_instr *phi2 = nir_instr_as_phi(instr2); - - if (phi1->instr.block != phi2->instr.block) - return false; - - nir_foreach_phi_src(phi1, src1) { - nir_foreach_phi_src(phi2, src2) { - if (src1->pred == src2->pred) { - if (!nir_srcs_equal(src1->src, src2->src)) - return false; - - break; - } - } - } - - return true; - } - case nir_instr_type_intrinsic: { - nir_intrinsic_instr *intrinsic1 = nir_instr_as_intrinsic(instr1); - nir_intrinsic_instr *intrinsic2 = nir_instr_as_intrinsic(instr2); - const nir_intrinsic_info *info = - &nir_intrinsic_infos[intrinsic1->intrinsic]; - - if (intrinsic1->intrinsic != intrinsic2->intrinsic || - intrinsic1->num_components != intrinsic2->num_components) - return false; - - if (info->has_dest && intrinsic1->dest.ssa.num_components != - intrinsic2->dest.ssa.num_components) - return false; - - for (unsigned i = 0; i < info->num_srcs; i++) { - if (!nir_srcs_equal(intrinsic1->src[i], intrinsic2->src[i])) - return false; - } - - assert(info->num_variables == 0); - - for (unsigned i = 0; i < info->num_indices; i++) { - if (intrinsic1->const_index[i] != intrinsic2->const_index[i]) - return false; - } - - return true; - } - case nir_instr_type_call: - case nir_instr_type_jump: - case nir_instr_type_ssa_undef: - case nir_instr_type_parallel_copy: - default: - unreachable("Invalid instruction type"); - } - - return false; -} - -static bool -src_is_ssa(nir_src *src, void *data) -{ - (void) data; - return src->is_ssa; -} - -static 
bool -dest_is_ssa(nir_dest *dest, void *data) -{ - (void) data; - return dest->is_ssa; -} +/* + * Visits and CSE's the given block and all its descendants in the dominance + * tree recursively. Note that the instr_set is guaranteed to only ever + * contain instructions that dominate the current block. + */ static bool -nir_instr_can_cse(nir_instr *instr) -{ - /* We only handle SSA. */ - if (!nir_foreach_dest(instr, dest_is_ssa, NULL) || - !nir_foreach_src(instr, src_is_ssa, NULL)) - return false; - - switch (instr->type) { - case nir_instr_type_alu: - case nir_instr_type_tex: - case nir_instr_type_load_const: - case nir_instr_type_phi: - return true; - case nir_instr_type_intrinsic: { - const nir_intrinsic_info *info = - &nir_intrinsic_infos[nir_instr_as_intrinsic(instr)->intrinsic]; - return (info->flags & NIR_INTRINSIC_CAN_ELIMINATE) && - (info->flags & NIR_INTRINSIC_CAN_REORDER) && - info->num_variables == 0; /* not implemented yet */ - } - case nir_instr_type_call: - case nir_instr_type_jump: - case nir_instr_type_ssa_undef: - return false; - case nir_instr_type_parallel_copy: - default: - unreachable("Invalid instruction type"); - } - - return false; -} - -static nir_ssa_def * -nir_instr_get_dest_ssa_def(nir_instr *instr) +cse_block(nir_block *block, struct set *instr_set) { - switch (instr->type) { - case nir_instr_type_alu: - assert(nir_instr_as_alu(instr)->dest.dest.is_ssa); - return &nir_instr_as_alu(instr)->dest.dest.ssa; - case nir_instr_type_tex: - assert(nir_instr_as_tex(instr)->dest.is_ssa); - return &nir_instr_as_tex(instr)->dest.ssa; - case nir_instr_type_load_const: - return &nir_instr_as_load_const(instr)->def; - case nir_instr_type_phi: - assert(nir_instr_as_phi(instr)->dest.is_ssa); - return &nir_instr_as_phi(instr)->dest.ssa; - case nir_instr_type_intrinsic: - assert(nir_instr_as_intrinsic(instr)->dest.is_ssa); - return &nir_instr_as_intrinsic(instr)->dest.ssa; - default: - unreachable("We never ask for any of these"); - } -} - -static void 
-nir_opt_cse_instr(nir_instr *instr, struct cse_state *state) -{ - if (!nir_instr_can_cse(instr)) - return; + bool progress = false; - for (struct exec_node *node = instr->node.prev; - !exec_node_is_head_sentinel(node); node = node->prev) { - nir_instr *other = exec_node_data(nir_instr, node, node); - if (nir_instrs_equal(instr, other)) { - nir_ssa_def *other_def = nir_instr_get_dest_ssa_def(other); - nir_ssa_def_rewrite_uses(nir_instr_get_dest_ssa_def(instr), - nir_src_for_ssa(other_def)); + nir_foreach_instr_safe(block, instr) { + if (nir_instr_set_add_or_rewrite(instr_set, instr)) { + progress = true; nir_instr_remove(instr); - state->progress = true; - return; } } - for (nir_block *block = instr->block->imm_dom; - block != NULL; block = block->imm_dom) { - nir_foreach_instr_reverse(block, other) { - if (nir_instrs_equal(instr, other)) { - nir_ssa_def *other_def = nir_instr_get_dest_ssa_def(other); - nir_ssa_def_rewrite_uses(nir_instr_get_dest_ssa_def(instr), - nir_src_for_ssa(other_def)); - nir_instr_remove(instr); - state->progress = true; - return; - } - } + for (unsigned i = 0; i < block->num_dom_children; i++) { + nir_block *child = block->dom_children[i]; + progress |= cse_block(child, instr_set); } -} - -static bool -nir_opt_cse_block(nir_block *block, void *void_state) -{ - struct cse_state *state = void_state; - nir_foreach_instr_safe(block, instr) - nir_opt_cse_instr(instr, state); + nir_foreach_instr(block, instr) + nir_instr_set_remove(instr_set, instr); - return true; + return progress; } static bool nir_opt_cse_impl(nir_function_impl *impl) { - struct cse_state state; - - state.mem_ctx = ralloc_parent(impl); - state.progress = false; + struct set *instr_set = nir_instr_set_create(NULL); nir_metadata_require(impl, nir_metadata_dominance); - nir_foreach_block(impl, nir_opt_cse_block, &state); + bool progress = cse_block(nir_start_block(impl), instr_set); - if (state.progress) + if (progress) nir_metadata_preserve(impl, nir_metadata_block_index | 
nir_metadata_dominance); - return state.progress; + nir_instr_set_destroy(instr_set); + return progress; } bool @@ -336,3 +90,4 @@ nir_opt_cse(nir_shader *shader) return progress; } + diff --git a/src/glsl/nir/nir_sweep.c b/src/glsl/nir/nir_sweep.c index b6ce43b5224..5a22f509f50 100644 --- a/src/glsl/nir/nir_sweep.c +++ b/src/glsl/nir/nir_sweep.c @@ -155,6 +155,8 @@ nir_sweep(nir_shader *nir) ralloc_adopt(rubbish, nir); ralloc_steal(nir, (char *)nir->info.name); + if (nir->info.label) + ralloc_steal(nir, (char *)nir->info.label); /* Variables and registers are not dead. Steal them back. */ steal_list(nir, nir_variable, &nir->uniforms); diff --git a/src/glsl/nir/nir_types.cpp b/src/glsl/nir/nir_types.cpp index 01f0e9b5abc..4a1250e546c 100644 --- a/src/glsl/nir/nir_types.cpp +++ b/src/glsl/nir/nir_types.cpp @@ -118,6 +118,12 @@ glsl_get_length(const struct glsl_type *type) return type->is_matrix() ? type->matrix_columns : type->length; } +unsigned +glsl_get_aoa_size(const struct glsl_type *type) +{ + return type->arrays_of_arrays_size(); +} + const char * glsl_get_struct_elem_name(const struct glsl_type *type, unsigned index) { diff --git a/src/glsl/nir/nir_types.h b/src/glsl/nir/nir_types.h index 1a0cb1fb774..a61af6cba75 100644 --- a/src/glsl/nir/nir_types.h +++ b/src/glsl/nir/nir_types.h @@ -31,7 +31,7 @@ /* C wrapper around glsl_types.h */ -#include "../glsl_types.h" +#include "glsl_types.h" #ifdef __cplusplus extern "C" { @@ -65,6 +65,8 @@ unsigned glsl_get_matrix_columns(const struct glsl_type *type); unsigned glsl_get_length(const struct glsl_type *type); +unsigned glsl_get_aoa_size(const struct glsl_type *type); + const char *glsl_get_struct_elem_name(const struct glsl_type *type, unsigned index); diff --git a/src/glsl/shader_enums.c b/src/glsl/nir/shader_enums.c index c196b791d4f..66a25e72344 100644 --- a/src/glsl/shader_enums.c +++ b/src/glsl/nir/shader_enums.c @@ -26,8 +26,9 @@ * Rob Clark <[email protected]> */ -#include "glsl/shader_enums.h" +#include 
"shader_enums.h" #include "util/macros.h" +#include "mesa/main/config.h" #define ENUM(x) [x] = #x #define NAME(val) ((((val) < ARRAY_SIZE(names)) && names[(val)]) ? names[(val)] : "UNKNOWN") @@ -42,6 +43,7 @@ const char * gl_shader_stage_name(gl_shader_stage stage) ENUM(MESA_SHADER_FRAGMENT), ENUM(MESA_SHADER_COMPUTE), }; + STATIC_ASSERT(ARRAY_SIZE(names) == MESA_SHADER_STAGES); return NAME(stage); } @@ -82,6 +84,7 @@ const char * gl_vert_attrib_name(gl_vert_attrib attrib) ENUM(VERT_ATTRIB_GENERIC14), ENUM(VERT_ATTRIB_GENERIC15), }; + STATIC_ASSERT(ARRAY_SIZE(names) == VERT_ATTRIB_MAX); return NAME(attrib); } @@ -147,6 +150,7 @@ const char * gl_varying_slot_name(gl_varying_slot slot) ENUM(VARYING_SLOT_VAR30), ENUM(VARYING_SLOT_VAR31), }; + STATIC_ASSERT(ARRAY_SIZE(names) == VARYING_SLOT_MAX); return NAME(slot); } @@ -169,8 +173,10 @@ const char * gl_system_value_name(gl_system_value sysval) ENUM(SYSTEM_VALUE_TESS_LEVEL_INNER), ENUM(SYSTEM_VALUE_LOCAL_INVOCATION_ID), ENUM(SYSTEM_VALUE_WORK_GROUP_ID), + ENUM(SYSTEM_VALUE_NUM_WORK_GROUPS), ENUM(SYSTEM_VALUE_VERTEX_CNT), }; + STATIC_ASSERT(ARRAY_SIZE(names) == SYSTEM_VALUE_MAX); return NAME(sysval); } @@ -182,6 +188,7 @@ const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual) ENUM(INTERP_QUALIFIER_FLAT), ENUM(INTERP_QUALIFIER_NOPERSPECTIVE), }; + STATIC_ASSERT(ARRAY_SIZE(names) == INTERP_QUALIFIER_COUNT); return NAME(qual); } @@ -201,5 +208,6 @@ const char * gl_frag_result_name(gl_frag_result result) ENUM(FRAG_RESULT_DATA6), ENUM(FRAG_RESULT_DATA7), }; + STATIC_ASSERT(ARRAY_SIZE(names) == FRAG_RESULT_MAX); return NAME(result); } diff --git a/src/glsl/shader_enums.h b/src/glsl/nir/shader_enums.h index 2a5d2c5bfa7..d1cf7ca04cc 100644 --- a/src/glsl/shader_enums.h +++ b/src/glsl/nir/shader_enums.h @@ -233,6 +233,11 @@ typedef enum VARYING_SLOT_VAR31, } gl_varying_slot; + +#define VARYING_SLOT_MAX (VARYING_SLOT_VAR0 + MAX_VARYING) +#define VARYING_SLOT_PATCH0 (VARYING_SLOT_MAX) +#define 
VARYING_SLOT_TESS_MAX (VARYING_SLOT_PATCH0 + MAX_VARYING) + const char * gl_varying_slot_name(gl_varying_slot slot); /** @@ -473,4 +478,23 @@ typedef enum const char * gl_frag_result_name(gl_frag_result result); +#define FRAG_RESULT_MAX (FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS) + +/** + * \brief Layout qualifiers for gl_FragDepth. + * + * Extension AMD_conservative_depth allows gl_FragDepth to be redeclared with + * a layout qualifier. + * + * \see enum ir_depth_layout + */ +enum gl_frag_depth_layout +{ + FRAG_DEPTH_LAYOUT_NONE, /**< No layout is specified. */ + FRAG_DEPTH_LAYOUT_ANY, + FRAG_DEPTH_LAYOUT_GREATER, + FRAG_DEPTH_LAYOUT_LESS, + FRAG_DEPTH_LAYOUT_UNCHANGED +}; + #endif /* SHADER_ENUMS_H */ diff --git a/src/glsl/opt_dead_code.cpp b/src/glsl/opt_dead_code.cpp index 2cb7f41adef..c5be166e75a 100644 --- a/src/glsl/opt_dead_code.cpp +++ b/src/glsl/opt_dead_code.cpp @@ -75,24 +75,35 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned) || !entry->declaration) continue; - if (entry->assign) { - /* Remove a single dead assignment to the variable we found. - * Don't do so if it's a shader or function output or a shader - * storage variable though. + if (!entry->assign_list.is_empty()) { + /* Remove all the dead assignments to the variable we found. + * Don't do so if it's a shader or function output, though. 
*/ if (entry->var->data.mode != ir_var_function_out && entry->var->data.mode != ir_var_function_inout && entry->var->data.mode != ir_var_shader_out && entry->var->data.mode != ir_var_shader_storage) { - entry->assign->remove(); - progress = true; - if (debug) { - printf("Removed assignment to %s@%p\n", - entry->var->name, (void *) entry->var); - } + while (!entry->assign_list.is_empty()) { + struct assignment_entry *assignment_entry = + exec_node_data(struct assignment_entry, + entry->assign_list.head, link); + + assignment_entry->assign->remove(); + + if (debug) { + printf("Removed assignment to %s@%p\n", + entry->var->name, (void *) entry->var); + } + + assignment_entry->link.remove(); + free(assignment_entry); + } + progress = true; } - } else { + } + + if (entry->assign_list.is_empty()) { /* If there are no assignments or references to the variable left, * then we can remove its declaration. */ @@ -103,7 +114,7 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned) */ if (entry->var->data.mode == ir_var_uniform || entry->var->data.mode == ir_var_shader_storage) { - if (uniform_locations_assigned || entry->var->constant_value) + if (uniform_locations_assigned || entry->var->constant_initializer) continue; /* Section 2.11.6 (Uniform Variables) of the OpenGL ES 3.0.3 spec diff --git a/src/glsl/opt_tree_grafting.cpp b/src/glsl/opt_tree_grafting.cpp index a7a219c55ca..e38a0e93058 100644 --- a/src/glsl/opt_tree_grafting.cpp +++ b/src/glsl/opt_tree_grafting.cpp @@ -373,8 +373,6 @@ tree_grafting_basic_block(ir_instruction *bb_first, entry->referenced_count != 2) continue; - assert(assign == entry->assign); - /* Found a possibly graftable assignment. Now, walk through the * rest of the BB seeing if the deref is here, and if nothing interfered with * pasting its expression's values in between. 
diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp index 05140192893..3a95360eda6 100644 --- a/src/glsl/standalone_scaffolding.cpp +++ b/src/glsl/standalone_scaffolding.cpp @@ -113,9 +113,18 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg) ralloc_free(shProg->InfoLog); shProg->InfoLog = ralloc_strdup(shProg, ""); + ralloc_free(shProg->BufferInterfaceBlocks); + shProg->BufferInterfaceBlocks = NULL; + shProg->NumBufferInterfaceBlocks = 0; + ralloc_free(shProg->UniformBlocks); shProg->UniformBlocks = NULL; - shProg->NumBufferInterfaceBlocks = 0; + shProg->NumUniformBlocks = 0; + + ralloc_free(shProg->ShaderStorageBlocks); + shProg->ShaderStorageBlocks = NULL; + shProg->NumShaderStorageBlocks = 0; + for (i = 0; i < MESA_SHADER_STAGES; i++) { ralloc_free(shProg->UniformBlockStageIndex[i]); shProg->UniformBlockStageIndex[i] = NULL; diff --git a/src/mesa/Android.libmesa_dricore.mk b/src/mesa/Android.libmesa_dricore.mk index 2e308b83733..cd31e148222 100644 --- a/src/mesa/Android.libmesa_dricore.mk +++ b/src/mesa/Android.libmesa_dricore.mk @@ -50,7 +50,7 @@ endif # MESA_ENABLE_ASM ifeq ($(ARCH_X86_HAVE_SSE4_1),true) LOCAL_SRC_FILES += \ main/streaming-load-memcpy.c \ - mesa/main/sse_minmax.c + main/sse_minmax.c LOCAL_CFLAGS := \ -msse4.1 \ -DUSE_SSE41 @@ -60,6 +60,7 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/mapi \ $(MESA_TOP)/src/mesa/main \ $(MESA_TOP)/src/glsl \ + $(MESA_TOP)/src/glsl/nir \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary diff --git a/src/mesa/Android.libmesa_glsl_utils.mk b/src/mesa/Android.libmesa_glsl_utils.mk index ed620ac648c..9e150eaa3c0 100644 --- a/src/mesa/Android.libmesa_glsl_utils.mk +++ b/src/mesa/Android.libmesa_glsl_utils.mk @@ -37,6 +37,7 @@ LOCAL_MODULE := libmesa_glsl_utils LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/glsl \ + $(MESA_TOP)/src/glsl/nir \ $(MESA_TOP)/src/mapi \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary @@ -62,6 +63,7 @@ LOCAL_CFLAGS 
:= -D_POSIX_C_SOURCE=199309L LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/glsl \ + $(MESA_TOP)/src/glsl/nir \ $(MESA_TOP)/src/mapi \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary diff --git a/src/mesa/Android.libmesa_st_mesa.mk b/src/mesa/Android.libmesa_st_mesa.mk index b4b7fd97722..427a35f4f6e 100644 --- a/src/mesa/Android.libmesa_st_mesa.mk +++ b/src/mesa/Android.libmesa_st_mesa.mk @@ -55,6 +55,7 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/mapi \ $(MESA_TOP)/src/mesa/main \ $(MESA_TOP)/src/glsl \ + $(MESA_TOP)/src/glsl/nir \ $(MESA_TOP)/src/gallium/auxiliary \ $(MESA_TOP)/src/gallium/include diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources index 0915594cea6..34fb4461985 100644 --- a/src/mesa/Makefile.sources +++ b/src/mesa/Makefile.sources @@ -415,6 +415,7 @@ STATETRACKER_FILES = \ state_tracker/st_cache.h \ state_tracker/st_cb_bitmap.c \ state_tracker/st_cb_bitmap.h \ + state_tracker/st_cb_bitmap_shader.c \ state_tracker/st_cb_blit.c \ state_tracker/st_cb_blit.h \ state_tracker/st_cb_bufferobjects.c \ @@ -425,6 +426,7 @@ STATETRACKER_FILES = \ state_tracker/st_cb_condrender.h \ state_tracker/st_cb_drawpixels.c \ state_tracker/st_cb_drawpixels.h \ + state_tracker/st_cb_drawpixels_shader.c \ state_tracker/st_cb_drawtex.c \ state_tracker/st_cb_drawtex.h \ state_tracker/st_cb_eglimage.c \ @@ -525,9 +527,7 @@ PROGRAM_FILES = \ program/sampler.h \ program/string_to_uint_map.cpp \ program/symbol_table.c \ - program/symbol_table.h \ - ../glsl/shader_enums.c \ - ../glsl/shader_enums.h + program/symbol_table.h PROGRAM_NIR_FILES = \ program/prog_to_nir.c \ @@ -620,6 +620,7 @@ INCLUDE_DIRS = \ -I$(top_srcdir)/include \ -I$(top_srcdir)/src \ -I$(top_srcdir)/src/glsl \ + -I$(top_srcdir)/src/glsl/nir \ -I$(top_builddir)/src/glsl \ -I$(top_builddir)/src/glsl/nir \ -I$(top_srcdir)/src/glsl/glcpp \ diff --git a/src/mesa/SConscript b/src/mesa/SConscript index 5b80a216fef..c986326d2bf 100644 --- a/src/mesa/SConscript +++ b/src/mesa/SConscript @@ -16,6 
+16,7 @@ env.Append(CPPPATH = [ '#/src', '#/src/mapi', '#/src/glsl', + '#/src/glsl/nir', '#/src/mesa', '#/src/gallium/include', '#/src/gallium/auxiliary', diff --git a/src/mesa/drivers/common/meta_copy_image.c b/src/mesa/drivers/common/meta_copy_image.c index 33490ee6615..04b9cafe308 100644 --- a/src/mesa/drivers/common/meta_copy_image.c +++ b/src/mesa/drivers/common/meta_copy_image.c @@ -108,7 +108,11 @@ make_view(struct gl_context *ctx, struct gl_texture_image *tex_image, return false; } + assert(tex_obj->Target != 0); + assert(tex_obj->TargetIndex < NUM_TEXTURE_TARGETS); + view_tex_obj->Target = tex_obj->Target; + view_tex_obj->TargetIndex = tex_obj->TargetIndex; *view_tex_image = _mesa_get_tex_image(ctx, view_tex_obj, tex_obj->Target, 0); @@ -129,7 +133,6 @@ make_view(struct gl_context *ctx, struct gl_texture_image *tex_image, view_tex_obj->NumLayers = tex_obj->NumLayers; view_tex_obj->Immutable = tex_obj->Immutable; view_tex_obj->ImmutableLevels = tex_obj->ImmutableLevels; - view_tex_obj->Target = tex_obj->Target; if (ctx->Driver.TextureView != NULL && !ctx->Driver.TextureView(ctx, view_tex_obj, tex_obj)) { diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c index 1a5943c87fb..59d795998c6 100644 --- a/src/mesa/drivers/dri/i915/i915_fragprog.c +++ b/src/mesa/drivers/dri/i915/i915_fragprog.c @@ -1315,9 +1315,10 @@ static struct gl_program * i915NewProgram(struct gl_context * ctx, GLenum target, GLuint id) { switch (target) { - case GL_VERTEX_PROGRAM_ARB: - return _mesa_init_vertex_program(ctx, CALLOC_STRUCT(gl_vertex_program), - target, id); + case GL_VERTEX_PROGRAM_ARB: { + struct gl_vertex_program *prog = CALLOC_STRUCT(gl_vertex_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } case GL_FRAGMENT_PROGRAM_ARB:{ struct i915_fragment_program *prog = @@ -1325,8 +1326,7 @@ i915NewProgram(struct gl_context * ctx, GLenum target, GLuint id) if (prog) { i915_init_program(I915_CONTEXT(ctx), prog); - 
return _mesa_init_fragment_program(ctx, &prog->FragProg, - target, id); + return _mesa_init_gl_program(&prog->FragProg.Base, target, id); } else return NULL; diff --git a/src/mesa/drivers/dri/i965/Android.mk b/src/mesa/drivers/dri/i965/Android.mk index a9b963a9eca..d30a053e10f 100644 --- a/src/mesa/drivers/dri/i965/Android.mk +++ b/src/mesa/drivers/dri/i965/Android.mk @@ -48,6 +48,7 @@ LOCAL_C_INCLUDES := \ $(MESA_DRI_C_INCLUDES) LOCAL_SRC_FILES := \ + $(i965_compiler_FILES) \ $(i965_FILES) LOCAL_WHOLE_STATIC_LIBRARIES := \ diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am index 2e241511049..04b3f9cc8ce 100644 --- a/src/mesa/drivers/dri/i965/Makefile.am +++ b/src/mesa/drivers/dri/i965/Makefile.am @@ -33,6 +33,7 @@ AM_CFLAGS = \ -I$(top_srcdir)/src/mesa/drivers/dri/common \ -I$(top_srcdir)/src/mesa/drivers/dri/intel/server \ -I$(top_srcdir)/src/gtest/include \ + -I$(top_srcdir)/src/glsl/nir \ -I$(top_builddir)/src/glsl/nir \ -I$(top_builddir)/src/mesa/drivers/dri/common \ $(DEFINES) \ diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index b242ab55aae..ccd540dabca 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -1,6 +1,7 @@ i965_compiler_FILES = \ brw_cfg.cpp \ brw_cfg.h \ + brw_compiler.h \ brw_cubemap_normalize.cpp \ brw_dead_control_flow.cpp \ brw_dead_control_flow.h \ diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp index d458ad846bf..5308d175416 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp @@ -32,7 +32,7 @@ brw_blorp_eu_emitter::brw_blorp_eu_emitter(struct brw_context *brw, generator(brw->intelScreen->compiler, brw, mem_ctx, (void *) rzalloc(mem_ctx, struct brw_wm_prog_key), (struct brw_stage_prog_data *) rzalloc(mem_ctx, struct brw_wm_prog_data), - NULL, 0, false, "BLORP") + 0, 
false, "BLORP") { if (debug_flag) generator.enable_debug("blorp"); diff --git a/src/mesa/drivers/dri/i965/brw_cfg.cpp b/src/mesa/drivers/dri/i965/brw_cfg.cpp index 91d53eff5a7..10bcd4bafd4 100644 --- a/src/mesa/drivers/dri/i965/brw_cfg.cpp +++ b/src/mesa/drivers/dri/i965/brw_cfg.cpp @@ -305,6 +305,10 @@ cfg_t::cfg_t(exec_list *instructions) assert(cur_do != NULL && cur_while != NULL); cur->add_successor(mem_ctx, cur_do); + + if (inst->predicate) + cur->add_successor(mem_ctx, cur_while); + set_next_block(&cur, cur_while, ip); /* Pop the stack so we're in the previous loop */ @@ -422,7 +426,11 @@ cfg_t::dump(backend_shader *s) calculate_idom(); foreach_block (block, this) { - fprintf(stderr, "START B%d IDOM(B%d)", block->num, block->idom->num); + if (block->idom) + fprintf(stderr, "START B%d IDOM(B%d)", block->num, block->idom->num); + else + fprintf(stderr, "START B%d IDOM(none)", block->num); + foreach_list_typed(bblock_link, link, link, &block->parents) { fprintf(stderr, " <-B%d", link->block->num); diff --git a/src/mesa/drivers/dri/i965/brw_clear.c b/src/mesa/drivers/dri/i965/brw_clear.c index 17a745d0373..b0119558c3a 100644 --- a/src/mesa/drivers/dri/i965/brw_clear.c +++ b/src/mesa/drivers/dri/i965/brw_clear.c @@ -241,7 +241,7 @@ brw_clear(struct gl_context *ctx, GLbitfield mask) } /* Clear color buffers with fast clear or at least rep16 writes. 
*/ - if (brw->gen >= 6 && brw->gen < 9 && (mask & BUFFER_BITS_COLOR)) { + if (brw->gen >= 6 && (mask & BUFFER_BITS_COLOR)) { if (brw_meta_fast_clear(brw, fb, mask, partial_clear)) { debug_mask("blorp color", mask & BUFFER_BITS_COLOR); mask &= ~BUFFER_BITS_COLOR; diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h new file mode 100644 index 00000000000..11c485d2f08 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -0,0 +1,661 @@ +/* + * Copyright © 2010 - 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#include "brw_device_info.h" +#include "main/mtypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ra_regs; +struct nir_shader; +struct brw_geometry_program; +union gl_constant_value; + +struct brw_compiler { + const struct brw_device_info *devinfo; + + struct { + struct ra_regs *regs; + + /** + * Array of the ra classes for the unaligned contiguous register + * block sizes used. + */ + int *classes; + + /** + * Mapping for register-allocated objects in *regs to the first + * GRF for that object. + */ + uint8_t *ra_reg_to_grf; + } vec4_reg_set; + + struct { + struct ra_regs *regs; + + /** + * Array of the ra classes for the unaligned contiguous register + * block sizes used, indexed by register size. + */ + int classes[16]; + + /** + * Mapping from classes to ra_reg ranges. Each of the per-size + * classes corresponds to a range of ra_reg nodes. This array stores + * those ranges in the form of first ra_reg in each class and the + * total number of ra_reg elements in the last array element. This + * way the range of the i'th class is given by: + * [ class_to_ra_reg_range[i], class_to_ra_reg_range[i+1] ) + */ + int class_to_ra_reg_range[17]; + + /** + * Mapping for register-allocated objects in *regs to the first + * GRF for that object. + */ + uint8_t *ra_reg_to_grf; + + /** + * ra class for the aligned pairs we use for PLN, which doesn't + * appear in *classes. + */ + int aligned_pairs_class; + } fs_reg_sets[2]; + + void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3); + void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3); + + bool scalar_vs; + struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES]; +}; + + +/** + * Program key structures. + * + * When drawing, we look for the currently bound shaders in the program + * cache. This is essentially a hash table lookup, and these are the keys. 
+ * + * Sometimes OpenGL features specified as state need to be simulated via + * shader code, due to a mismatch between the API and the hardware. This + * is often referred to as "non-orthogonal state" or "NOS". We store NOS + * in the program key so it's considered when searching for a program. If + * we haven't seen a particular combination before, we have to recompile a + * new specialized version. + * + * Shader compilation should not look up state in gl_context directly, but + * instead use the copy in the program key. This guarantees recompiles will + * happen correctly. + * + * @{ + */ + +enum PACKED gen6_gather_sampler_wa { + WA_SIGN = 1, /* whether we need to sign extend */ + WA_8BIT = 2, /* if we have an 8bit format needing wa */ + WA_16BIT = 4, /* if we have a 16bit format needing wa */ +}; + +/** + * Sampler information needed by VS, WM, and GS program cache keys. + */ +struct brw_sampler_prog_key_data { + /** + * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles. + */ + uint16_t swizzles[MAX_SAMPLERS]; + + uint32_t gl_clamp_mask[3]; + + /** + * For RG32F, gather4's channel select is broken. + */ + uint32_t gather_channel_quirk_mask; + + /** + * Whether this sampler uses the compressed multisample surface layout. + */ + uint32_t compressed_multisample_layout_mask; + + /** + * For Sandybridge, which shader w/a we need for gather quirks. + */ + enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS]; +}; + + +/** The program key for Vertex Shaders. */ +struct brw_vs_prog_key { + unsigned program_string_id; + + /* + * Per-attribute workaround flags + */ + uint8_t gl_attrib_wa_flags[VERT_ATTRIB_MAX]; + + bool copy_edgeflag:1; + + bool clamp_vertex_color:1; + + /** + * How many user clipping planes are being uploaded to the vertex shader as + * push constants. + * + * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to + * clip distances. 
+ */ + unsigned nr_userclip_plane_consts:4; + + /** + * For pre-Gen6 hardware, a bitfield indicating which texture coordinates + * are going to be replaced with point coordinates (as a consequence of a + * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)). Because + * our SF thread requires exact matching between VS outputs and FS inputs, + * these texture coordinates will need to be unconditionally included in + * the VUE, even if they aren't written by the vertex shader. + */ + uint8_t point_coord_replace; + + struct brw_sampler_prog_key_data tex; +}; + +/** The program key for Geometry Shaders. */ +struct brw_gs_prog_key +{ + unsigned program_string_id; + + struct brw_sampler_prog_key_data tex; +}; + +/** The program key for Fragment/Pixel Shaders. */ +struct brw_wm_prog_key { + uint8_t iz_lookup; + bool stats_wm:1; + bool flat_shade:1; + bool persample_shading:1; + bool persample_2x:1; + unsigned nr_color_regions:5; + bool replicate_alpha:1; + bool render_to_fbo:1; + bool clamp_fragment_color:1; + bool compute_pos_offset:1; + bool compute_sample_id:1; + unsigned line_aa:2; + bool high_quality_derivatives:1; + + uint16_t drawable_height; + uint64_t input_slots_valid; + unsigned program_string_id; + GLenum alpha_test_func; /* < For Gen4/5 MRT alpha test */ + float alpha_test_ref; + + struct brw_sampler_prog_key_data tex; +}; + +struct brw_cs_prog_key { + uint32_t program_string_id; + struct brw_sampler_prog_key_data tex; +}; + +/* + * Image metadata structure as laid out in the shader parameter + * buffer. Entries have to be 16B-aligned for the vec4 back-end to be + * able to use them. That's okay because the padding and any unused + * entries [most of them except when we're doing untyped surface + * access] will be removed by the uniform packing pass. 
+ */ +#define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET 0 +#define BRW_IMAGE_PARAM_OFFSET_OFFSET 4 +#define BRW_IMAGE_PARAM_SIZE_OFFSET 8 +#define BRW_IMAGE_PARAM_STRIDE_OFFSET 12 +#define BRW_IMAGE_PARAM_TILING_OFFSET 16 +#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET 20 +#define BRW_IMAGE_PARAM_SIZE 24 + +struct brw_image_param { + /** Surface binding table index. */ + uint32_t surface_idx; + + /** Offset applied to the X and Y surface coordinates. */ + uint32_t offset[2]; + + /** Surface X, Y and Z dimensions. */ + uint32_t size[3]; + + /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in + * pixels, vertical slice stride in pixels. + */ + uint32_t stride[4]; + + /** Log2 of the tiling modulus in the X, Y and Z dimension. */ + uint32_t tiling[3]; + + /** + * Right shift to apply for bit 6 address swizzling. Two different + * swizzles can be specified and will be applied one after the other. The + * resulting address will be: + * + * addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^ + * (addr >> swizzling[1]))) + * + * Use \c 0xff if any of the swizzles is not required. + */ + uint32_t swizzling[2]; +}; + +struct brw_stage_prog_data { + struct { + /** size of our binding table. */ + uint32_t size_bytes; + + /** @{ + * surface indices for the various groups of surfaces + */ + uint32_t pull_constants_start; + uint32_t texture_start; + uint32_t gather_texture_start; + uint32_t ubo_start; + uint32_t ssbo_start; + uint32_t abo_start; + uint32_t image_start; + uint32_t shader_time_start; + /** @} */ + } binding_table; + + GLuint nr_params; /**< number of float params/constants */ + GLuint nr_pull_params; + unsigned nr_image_params; + + unsigned curb_read_length; + unsigned total_scratch; + + /** + * Register where the thread expects to find input data from the URB + * (typically uniforms, followed by vertex or fragment attributes). + */ + unsigned dispatch_grf_start_reg; + + bool use_alt_mode; /**< Use ALT floating point mode? Otherwise, IEEE. 
*/ + + /* Pointers to tracked values (only valid once + * _mesa_load_state_parameters has been called at runtime). + */ + const union gl_constant_value **param; + const union gl_constant_value **pull_param; + + /** Image metadata passed to the shader as uniforms. */ + struct brw_image_param *image_param; +}; + +/* Data about a particular attempt to compile a program. Note that + * there can be many of these, each in a different GL state + * corresponding to a different brw_wm_prog_key struct, with different + * compiled programs. + */ +struct brw_wm_prog_data { + struct brw_stage_prog_data base; + + GLuint num_varying_inputs; + + GLuint dispatch_grf_start_reg_16; + GLuint reg_blocks; + GLuint reg_blocks_16; + + struct { + /** @{ + * surface indices for the WM-specific surfaces + */ + uint32_t render_target_start; + /** @} */ + } binding_table; + + uint8_t computed_depth_mode; + + bool early_fragment_tests; + bool no_8; + bool dual_src_blend; + bool uses_pos_offset; + bool uses_omask; + bool uses_kill; + bool pulls_bary; + uint32_t prog_offset_16; + + /** + * Mask of which interpolation modes are required by the fragment shader. + * Used in hardware setup on gen6+. + */ + uint32_t barycentric_interp_modes; + + /** + * Map from gl_varying_slot to the position within the FS setup data + * payload where the varying's attribute vertex deltas should be delivered. + * For varying slots that are not used by the FS, the value is -1. + */ + int urb_setup[VARYING_SLOT_MAX]; +}; + +struct brw_cs_prog_data { + struct brw_stage_prog_data base; + + GLuint dispatch_grf_start_reg_16; + unsigned local_size[3]; + unsigned simd_size; + bool uses_barrier; + bool uses_num_work_groups; + unsigned local_invocation_id_regs; + + struct { + /** @{ + * surface indices for the CS-specific surfaces + */ + uint32_t work_groups_start; + /** @} */ + } binding_table; +}; + +/** + * Enum representing the i965-specific vertex results that don't correspond + * exactly to any element of gl_varying_slot. 
The values of this enum are + * assigned such that they don't conflict with gl_varying_slot. + */ +typedef enum +{ + BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX, + BRW_VARYING_SLOT_PAD, + /** + * Technically this is not a varying but just a placeholder that + * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord + * builtin variable to be compiled correctly. see compile_sf_prog() for + * more info. + */ + BRW_VARYING_SLOT_PNTC, + BRW_VARYING_SLOT_COUNT +} brw_varying_slot; + +/** + * Data structure recording the relationship between the gl_varying_slot enum + * and "slots" within the vertex URB entry (VUE). A "slot" is defined as a + * single octaword within the VUE (128 bits). + * + * Note that each BRW register contains 256 bits (2 octawords), so when + * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two + * consecutive VUE slots. When accessing the VUE in URB_INTERLEAVED mode (as + * in a vertex shader), each register corresponds to a single VUE slot, since + * it contains data for two separate vertices. + */ +struct brw_vue_map { + /** + * Bitfield representing all varying slots that are (a) stored in this VUE + * map, and (b) actually written by the shader. Does not include any of + * the additional varying slots defined in brw_varying_slot. + */ + GLbitfield64 slots_valid; + + /** + * Is this VUE map for a separate shader pipeline? + * + * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched + * without the linker having a chance to dead code eliminate unused varyings. + * + * This means that we have to use a fixed slot layout, based on the output's + * location field, rather than assigning slots in a compact contiguous block. + */ + bool separate; + + /** + * Map from gl_varying_slot value to VUE slot. For gl_varying_slots that are + * not stored in a slot (because they are not written, or because + * additional processing is applied before storing them in the VUE), the + * value is -1. 
+ */ + signed char varying_to_slot[BRW_VARYING_SLOT_COUNT]; + + /** + * Map from VUE slot to gl_varying_slot value. For slots that do not + * directly correspond to a gl_varying_slot, the value comes from + * brw_varying_slot. + * + * For slots that are not in use, the value is BRW_VARYING_SLOT_COUNT (this + * simplifies code that uses the value stored in slot_to_varying to + * create a bit mask). + */ + signed char slot_to_varying[BRW_VARYING_SLOT_COUNT]; + + /** + * Total number of VUE slots in use + */ + int num_slots; +}; + +/** + * Convert a VUE slot number into a byte offset within the VUE. + */ +static inline GLuint brw_vue_slot_to_offset(GLuint slot) +{ + return 16*slot; +} + +/** + * Convert a vertex output (brw_varying_slot) into a byte offset within the + * VUE. + */ +static inline GLuint brw_varying_to_offset(struct brw_vue_map *vue_map, + GLuint varying) +{ + return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]); +} + +void brw_compute_vue_map(const struct brw_device_info *devinfo, + struct brw_vue_map *vue_map, + GLbitfield64 slots_valid, + bool separate_shader); + +enum shader_dispatch_mode { + DISPATCH_MODE_4X1_SINGLE = 0, + DISPATCH_MODE_4X2_DUAL_INSTANCE = 1, + DISPATCH_MODE_4X2_DUAL_OBJECT = 2, + DISPATCH_MODE_SIMD8 = 3, +}; + +struct brw_vue_prog_data { + struct brw_stage_prog_data base; + struct brw_vue_map vue_map; + + GLuint urb_read_length; + GLuint total_grf; + + /* Used for calculating urb partitions. In the VS, this is the size of the + * URB entry used for both input and output to the thread. In the GS, this + * is the size of the URB entry used for output. 
+ */ + GLuint urb_entry_size; + + enum shader_dispatch_mode dispatch_mode; +}; + +struct brw_vs_prog_data { + struct brw_vue_prog_data base; + + GLbitfield64 inputs_read; + + unsigned nr_attributes; + + bool uses_vertexid; + bool uses_instanceid; +}; + +struct brw_gs_prog_data +{ + struct brw_vue_prog_data base; + + /** + * Size of an output vertex, measured in HWORDS (32 bytes). + */ + unsigned output_vertex_size_hwords; + + unsigned output_topology; + + /** + * Size of the control data (cut bits or StreamID bits), in hwords (32 + * bytes). 0 if there is no control data. + */ + unsigned control_data_header_size_hwords; + + /** + * Format of the control data (either GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID + * if the control data is StreamID bits, or + * GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits). + * Ignored if control_data_header_size is 0. + */ + unsigned control_data_format; + + bool include_primitive_id; + + /** + * The number of vertices emitted, if constant - otherwise -1. + */ + int static_vertex_count; + + int invocations; + + /** + * Gen6 transform feedback enabled flag. + */ + bool gen6_xfb_enabled; + + /** + * Gen6: Provoking vertex convention for odd-numbered triangles + * in tristrips. + */ + GLuint pv_first:1; + + /** + * Gen6: Number of varyings that are output to transform feedback. + */ + GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */ + + /** + * Gen6: Map from the index of a transform feedback binding table entry to the + * gl_varying_slot that should be streamed out through that binding table + * entry. + */ + unsigned char transform_feedback_bindings[64 /* BRW_MAX_SOL_BINDINGS */]; + + /** + * Gen6: Map from the index of a transform feedback binding table entry to the + * swizzles that should be used when streaming out data through that + * binding table entry. + */ + unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */]; +}; + + +/** @} */ + +/** + * Compile a vertex shader. 
+ * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_vs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_vs_prog_key *key, + struct brw_vs_prog_data *prog_data, + const struct nir_shader *shader, + gl_clip_plane *clip_planes, + bool use_legacy_snorm_formula, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +/** + * Scratch data used when compiling a GLSL geometry shader. + */ +struct brw_gs_compile +{ + struct brw_gs_prog_key key; + struct brw_gs_prog_data prog_data; + struct brw_vue_map input_vue_map; + + struct brw_geometry_program *gp; + + unsigned control_data_bits_per_vertex; + unsigned control_data_header_size_bits; +}; + +/** + * Compile a geometry shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_gs(const struct brw_compiler *compiler, void *log_data, + struct brw_gs_compile *c, + const struct nir_shader *shader, + struct gl_shader_program *shader_prog, + void *mem_ctx, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +/** + * Compile a fragment shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_fs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_wm_prog_key *key, + struct brw_wm_prog_data *prog_data, + const struct nir_shader *shader, + struct gl_program *prog, + int shader_time_index8, + int shader_time_index16, + bool use_rep_send, + unsigned *final_assembly_size, + char **error_str); + +/** + * Compile a compute shader. + * + * Returns the final assembly and the program's size. 
+ */ +const unsigned * +brw_compile_cs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_cs_prog_key *key, + struct brw_cs_prog_data *prog_data, + const struct nir_shader *shader, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +#ifdef __cplusplus +} /* extern "C" */ +#endif diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 6b2bbd21703..3b125448e14 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -51,7 +51,7 @@ #include "brw_context.h" #include "brw_defines.h" -#include "brw_shader.h" +#include "brw_compiler.h" #include "brw_draw.h" #include "brw_state.h" diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index aa1284db3ce..4f503ae4869 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -40,6 +40,7 @@ #include "main/mm.h" #include "main/mtypes.h" #include "brw_structs.h" +#include "brw_compiler.h" #include "intel_aub.h" #include "program/prog_parameter.h" @@ -340,260 +341,6 @@ struct brw_shader { bool compiled_once; }; -struct brw_stage_prog_data { - struct { - /** size of our binding table. */ - uint32_t size_bytes; - - /** @{ - * surface indices for the various groups of surfaces - */ - uint32_t pull_constants_start; - uint32_t texture_start; - uint32_t gather_texture_start; - uint32_t ubo_start; - uint32_t abo_start; - uint32_t image_start; - uint32_t shader_time_start; - /** @} */ - } binding_table; - - GLuint nr_params; /**< number of float params/constants */ - GLuint nr_pull_params; - unsigned nr_image_params; - - unsigned curb_read_length; - unsigned total_scratch; - - /** - * Register where the thread expects to find input data from the URB - * (typically uniforms, followed by vertex or fragment attributes). 
- */ - unsigned dispatch_grf_start_reg; - - bool use_alt_mode; /**< Use ALT floating point mode? Otherwise, IEEE. */ - - /* Pointers to tracked values (only valid once - * _mesa_load_state_parameters has been called at runtime). - */ - const gl_constant_value **param; - const gl_constant_value **pull_param; - - /** Image metadata passed to the shader as uniforms. */ - struct brw_image_param *image_param; -}; - -/* - * Image metadata structure as laid out in the shader parameter - * buffer. Entries have to be 16B-aligned for the vec4 back-end to be - * able to use them. That's okay because the padding and any unused - * entries [most of them except when we're doing untyped surface - * access] will be removed by the uniform packing pass. - */ -#define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET 0 -#define BRW_IMAGE_PARAM_OFFSET_OFFSET 4 -#define BRW_IMAGE_PARAM_SIZE_OFFSET 8 -#define BRW_IMAGE_PARAM_STRIDE_OFFSET 12 -#define BRW_IMAGE_PARAM_TILING_OFFSET 16 -#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET 20 -#define BRW_IMAGE_PARAM_SIZE 24 - -struct brw_image_param { - /** Surface binding table index. */ - uint32_t surface_idx; - - /** Offset applied to the X and Y surface coordinates. */ - uint32_t offset[2]; - - /** Surface X, Y and Z dimensions. */ - uint32_t size[3]; - - /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in - * pixels, vertical slice stride in pixels. - */ - uint32_t stride[4]; - - /** Log2 of the tiling modulus in the X, Y and Z dimension. */ - uint32_t tiling[3]; - - /** - * Right shift to apply for bit 6 address swizzling. Two different - * swizzles can be specified and will be applied one after the other. The - * resulting address will be: - * - * addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^ - * (addr >> swizzling[1]))) - * - * Use \c 0xff if any of the swizzles is not required. - */ - uint32_t swizzling[2]; -}; - -/* Data about a particular attempt to compile a program. 
Note that - * there can be many of these, each in a different GL state - * corresponding to a different brw_wm_prog_key struct, with different - * compiled programs. - */ -struct brw_wm_prog_data { - struct brw_stage_prog_data base; - - GLuint num_varying_inputs; - - GLuint dispatch_grf_start_reg_16; - GLuint reg_blocks; - GLuint reg_blocks_16; - - struct { - /** @{ - * surface indices the WM-specific surfaces - */ - uint32_t render_target_start; - /** @} */ - } binding_table; - - uint8_t computed_depth_mode; - - bool early_fragment_tests; - bool no_8; - bool dual_src_blend; - bool uses_pos_offset; - bool uses_omask; - bool uses_kill; - bool pulls_bary; - uint32_t prog_offset_16; - - /** - * Mask of which interpolation modes are required by the fragment shader. - * Used in hardware setup on gen6+. - */ - uint32_t barycentric_interp_modes; - - /** - * Map from gl_varying_slot to the position within the FS setup data - * payload where the varying's attribute vertex deltas should be delivered. - * For varying slots that are not used by the FS, the value is -1. - */ - int urb_setup[VARYING_SLOT_MAX]; -}; - -struct brw_cs_prog_data { - struct brw_stage_prog_data base; - - GLuint dispatch_grf_start_reg_16; - unsigned local_size[3]; - unsigned simd_size; - bool uses_barrier; - bool uses_num_work_groups; - unsigned local_invocation_id_regs; - - struct { - /** @{ - * surface indices the CS-specific surfaces - */ - uint32_t work_groups_start; - /** @} */ - } binding_table; -}; - -/** - * Enum representing the i965-specific vertex results that don't correspond - * exactly to any element of gl_varying_slot. The values of this enum are - * assigned such that they don't conflict with gl_varying_slot. - */ -typedef enum -{ - BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX, - BRW_VARYING_SLOT_PAD, - /** - * Technically this is not a varying but just a placeholder that - * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord - * builtin variable to be compiled correctly. 
see compile_sf_prog() for - * more info. - */ - BRW_VARYING_SLOT_PNTC, - BRW_VARYING_SLOT_COUNT -} brw_varying_slot; - - -/** - * Data structure recording the relationship between the gl_varying_slot enum - * and "slots" within the vertex URB entry (VUE). A "slot" is defined as a - * single octaword within the VUE (128 bits). - * - * Note that each BRW register contains 256 bits (2 octawords), so when - * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two - * consecutive VUE slots. When accessing the VUE in URB_INTERLEAVED mode (as - * in a vertex shader), each register corresponds to a single VUE slot, since - * it contains data for two separate vertices. - */ -struct brw_vue_map { - /** - * Bitfield representing all varying slots that are (a) stored in this VUE - * map, and (b) actually written by the shader. Does not include any of - * the additional varying slots defined in brw_varying_slot. - */ - GLbitfield64 slots_valid; - - /** - * Is this VUE map for a separate shader pipeline? - * - * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched - * without the linker having a chance to dead code eliminate unused varyings. - * - * This means that we have to use a fixed slot layout, based on the output's - * location field, rather than assigning slots in a compact contiguous block. - */ - bool separate; - - /** - * Map from gl_varying_slot value to VUE slot. For gl_varying_slots that are - * not stored in a slot (because they are not written, or because - * additional processing is applied before storing them in the VUE), the - * value is -1. - */ - signed char varying_to_slot[BRW_VARYING_SLOT_COUNT]; - - /** - * Map from VUE slot to gl_varying_slot value. For slots that do not - * directly correspond to a gl_varying_slot, the value comes from - * brw_varying_slot. 
- * - * For slots that are not in use, the value is BRW_VARYING_SLOT_COUNT (this - * simplifies code that uses the value stored in slot_to_varying to - * create a bit mask). - */ - signed char slot_to_varying[BRW_VARYING_SLOT_COUNT]; - - /** - * Total number of VUE slots in use - */ - int num_slots; -}; - -/** - * Convert a VUE slot number into a byte offset within the VUE. - */ -static inline GLuint brw_vue_slot_to_offset(GLuint slot) -{ - return 16*slot; -} - -/** - * Convert a vertex output (brw_varying_slot) into a byte offset within the - * VUE. - */ -static inline GLuint brw_varying_to_offset(struct brw_vue_map *vue_map, - GLuint varying) -{ - return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]); -} - -void brw_compute_vue_map(const struct brw_device_info *devinfo, - struct brw_vue_map *vue_map, - GLbitfield64 slots_valid, - bool separate_shader); - - /** * Bitmask indicating which fragment shader inputs represent varyings (and * hence have to be delivered to the fragment shader by the SF/SBE stage). @@ -670,39 +417,6 @@ struct brw_ff_gs_prog_data { unsigned svbi_postincrement_value; }; -enum shader_dispatch_mode { - DISPATCH_MODE_4X1_SINGLE = 0, - DISPATCH_MODE_4X2_DUAL_INSTANCE = 1, - DISPATCH_MODE_4X2_DUAL_OBJECT = 2, - DISPATCH_MODE_SIMD8 = 3, -}; - -struct brw_vue_prog_data { - struct brw_stage_prog_data base; - struct brw_vue_map vue_map; - - GLuint urb_read_length; - GLuint total_grf; - - /* Used for calculating urb partitions. In the VS, this is the size of the - * URB entry used for both input and output to the thread. In the GS, this - * is the size of the URB entry used for output. 
- */ - GLuint urb_entry_size; - - enum shader_dispatch_mode dispatch_mode; -}; - - -struct brw_vs_prog_data { - struct brw_vue_prog_data base; - - GLbitfield64 inputs_read; - - bool uses_vertexid; - bool uses_instanceid; -}; - /** Number of texture sampler units */ #define BRW_MAX_TEX_UNIT 32 @@ -715,9 +429,6 @@ struct brw_vs_prog_data { /** Max number of SSBOs in a shader */ #define BRW_MAX_SSBO 12 -/** Max number of combined UBOs and SSBOs in a shader */ -#define BRW_MAX_COMBINED_UBO_SSBO (BRW_MAX_UBO + BRW_MAX_SSBO) - /** Max number of atomic counter buffer objects in a shader */ #define BRW_MAX_ABO 16 @@ -763,71 +474,6 @@ struct brw_vs_prog_data { #define SURF_INDEX_GEN6_SOL_BINDING(t) (t) -struct brw_gs_prog_data -{ - struct brw_vue_prog_data base; - - /** - * Size of an output vertex, measured in HWORDS (32 bytes). - */ - unsigned output_vertex_size_hwords; - - unsigned output_topology; - - /** - * Size of the control data (cut bits or StreamID bits), in hwords (32 - * bytes). 0 if there is no control data. - */ - unsigned control_data_header_size_hwords; - - /** - * Format of the control data (either GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID - * if the control data is StreamID bits, or - * GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits). - * Ignored if control_data_header_size is 0. - */ - unsigned control_data_format; - - bool include_primitive_id; - - /** - * The number of vertices emitted, if constant - otherwise -1. - */ - int static_vertex_count; - - int invocations; - - /** - * Gen6 transform feedback enabled flag. - */ - bool gen6_xfb_enabled; - - /** - * Gen6: Provoking vertex convention for odd-numbered triangles - * in tristrips. - */ - GLuint pv_first:1; - - /** - * Gen6: Number of varyings that are output to transform feedback. 
- */ - GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */ - - /** - * Gen6: Map from the index of a transform feedback binding table entry to the - * gl_varying_slot that should be streamed out through that binding table - * entry. - */ - unsigned char transform_feedback_bindings[BRW_MAX_SOL_BINDINGS]; - - /** - * Gen6: Map from the index of a transform feedback binding table entry to the - * swizzles that should be used when streaming out data through that - * binding table entry. - */ - unsigned char transform_feedback_swizzles[BRW_MAX_SOL_BINDINGS]; -}; - /** * Stride in bytes between shader_time entries. * @@ -953,6 +599,8 @@ struct intel_batchbuffer { } saved; }; +#define MAX_GS_INPUT_VERTICES 6 + #define BRW_MAX_XFB_STREAMS 4 struct brw_transform_feedback_object { diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c index 45fb816c160..263d224e882 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.c +++ b/src/mesa/drivers/dri/i965/brw_cs.c @@ -105,9 +105,15 @@ brw_codegen_cs_prog(struct brw_context *brw, if (INTEL_DEBUG & DEBUG_SHADER_TIME) st_index = brw_get_shader_time_index(brw, prog, &cp->program.Base, ST_CS); - program = brw_cs_emit(brw, mem_ctx, key, &prog_data, - &cp->program, prog, st_index, &program_size); + char *error_str; + program = brw_compile_cs(brw->intelScreen->compiler, brw, mem_ctx, + key, &prog_data, cp->program.Base.nir, + st_index, &program_size, &error_str); if (program == NULL) { + prog->LinkStatus = false; + ralloc_strcat(&prog->InfoLog, error_str); + _mesa_problem(NULL, "Failed to compile compute shader: %s\n", error_str); + ralloc_free(mem_ctx); return false; } diff --git a/src/mesa/drivers/dri/i965/brw_cs.h b/src/mesa/drivers/dri/i965/brw_cs.h index 17c2ff9871a..899e340f14e 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.h +++ b/src/mesa/drivers/dri/i965/brw_cs.h @@ -27,11 +27,6 @@ #include "brw_program.h" -struct brw_cs_prog_key { - uint32_t program_string_id; - struct 
brw_sampler_prog_key_data tex; -}; - #ifdef __cplusplus extern "C" { #endif @@ -39,16 +34,6 @@ extern "C" { void brw_upload_cs_prog(struct brw_context *brw); -const unsigned * -brw_cs_emit(struct brw_context *brw, - void *mem_ctx, - const struct brw_cs_prog_key *key, - struct brw_cs_prog_data *prog_data, - struct gl_compute_program *cp, - struct gl_shader_program *prog, - int shader_time_index, - unsigned *final_assembly_size); - void brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data, void *buffer, uint32_t threads, uint32_t stride); diff --git a/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp b/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp index 33571292007..33d2048e657 100644 --- a/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp +++ b/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp @@ -30,7 +30,7 @@ * \author Eric Anholt <[email protected]> */ -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir.h" #include "program/prog_instruction.h" /* For WRITEMASK_* */ diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h index 761aa0ec5fa..0ac1ad9378b 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.h +++ b/src/mesa/drivers/dri/i965/brw_eu.h @@ -461,7 +461,7 @@ brw_pixel_interpolator_query(struct brw_codegen *p, struct brw_reg mrf, bool noperspective, unsigned mode, - unsigned data, + struct brw_reg data, unsigned msg_length, unsigned response_length); diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index dc699bb6321..bf2fee9ed48 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -3212,26 +3212,29 @@ brw_pixel_interpolator_query(struct brw_codegen *p, struct brw_reg mrf, bool noperspective, unsigned mode, - unsigned data, + struct brw_reg data, unsigned msg_length, unsigned response_length) { const struct brw_device_info *devinfo = p->devinfo; - struct brw_inst *insn = next_insn(p, 
BRW_OPCODE_SEND); - - brw_set_dest(p, insn, dest); - brw_set_src0(p, insn, mrf); - brw_set_message_descriptor(p, insn, GEN7_SFID_PIXEL_INTERPOLATOR, - msg_length, response_length, - false /* header is never present for PI */, - false); + struct brw_inst *insn; + const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current); - brw_inst_set_pi_simd_mode( - devinfo, insn, brw_inst_exec_size(devinfo, insn) == BRW_EXECUTE_16); + /* brw_send_indirect_message will automatically use a direct send message + * if data is actually immediate. + */ + insn = brw_send_indirect_message(p, + GEN7_SFID_PIXEL_INTERPOLATOR, + dest, + mrf, + vec1(data)); + brw_inst_set_mlen(devinfo, insn, msg_length); + brw_inst_set_rlen(devinfo, insn, response_length); + + brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16); brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */ brw_inst_set_pi_nopersp(devinfo, insn, noperspective); brw_inst_set_pi_message_type(devinfo, insn, mode); - brw_inst_set_pi_message_data(devinfo, insn, data); } void diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 5049851c617..0562c5a9981 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -47,7 +47,7 @@ #include "brw_dead_control_flow.h" #include "main/uniforms.h" #include "brw_fs_live_variables.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "program/sampler.h" using namespace brw; @@ -338,6 +338,18 @@ fs_inst::can_do_source_mods(const struct brw_device_info *devinfo) } bool +fs_inst::can_change_types() const +{ + return dst.type == src[0].type && + !src[0].abs && !src[0].negate && !saturate && + (opcode == BRW_OPCODE_MOV || + (opcode == BRW_OPCODE_SEL && + dst.type == src[1].type && + predicate != BRW_PREDICATE_NONE && + !src[1].abs && !src[1].negate)); +} + +bool fs_inst::has_side_effects() const { return this->eot || backend_instruction::has_side_effects(); 
@@ -1049,11 +1061,11 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name, unsigned int array_elements; if (type->is_array()) { - array_elements = type->length; + array_elements = type->arrays_of_arrays_size(); if (array_elements == 0) { fail("dereferenced array '%s' has length 0\n", name); } - type = type->fields.array; + type = type->without_array(); } else { array_elements = 1; } @@ -1509,25 +1521,14 @@ void fs_visitor::assign_vs_urb_setup() { brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data; - int grf, count, slot, channel, attr; assert(stage == MESA_SHADER_VERTEX); - count = _mesa_bitcount_64(vs_prog_data->inputs_read); + int count = _mesa_bitcount_64(vs_prog_data->inputs_read); if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) count++; /* Each attribute is 4 regs. */ - this->first_non_payload_grf += count * 4; - - unsigned vue_entries = - MAX2(count, vs_prog_data->base.vue_map.num_slots); - - /* URB entry size is counted in units of 64 bytes (for the 3DSTATE_URB_VS - * command). Each attribute is 16 bytes (4 floats/dwords), so each unit - * fits four attributes. - */ - vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4; - vs_prog_data->base.urb_read_length = (count + 1) / 2; + this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes; assert(vs_prog_data->base.urb_read_length <= 15); @@ -1535,25 +1536,10 @@ fs_visitor::assign_vs_urb_setup() foreach_block_and_inst(block, fs_inst, inst, cfg) { for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file == ATTR) { - - if (inst->src[i].reg == VERT_ATTRIB_MAX) { - slot = count - 1; - } else { - /* Attributes come in in a contiguous block, ordered by their - * gl_vert_attrib value. That means we can compute the slot - * number for an attribute by masking out the enabled - * attributes before it and counting the bits. 
- */ - attr = inst->src[i].reg + inst->src[i].reg_offset / 4; - slot = _mesa_bitcount_64(vs_prog_data->inputs_read & - BITFIELD64_MASK(attr)); - } - - channel = inst->src[i].reg_offset & 3; - - grf = payload.num_regs + - prog_data->curb_read_length + - slot * 4 + channel; + int grf = payload.num_regs + + prog_data->curb_read_length + + inst->src[i].reg + + inst->src[i].reg_offset; inst->src[i].file = HW_REG; inst->src[i].fixed_hw_reg = @@ -5134,41 +5120,140 @@ fs_visitor::run_cs() return !failed; } +/** + * Return a bitfield where bit n is set if barycentric interpolation mode n + * (see enum brw_wm_barycentric_interp_mode) is needed by the fragment shader. + */ +static unsigned +brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo, + bool shade_model_flat, + bool persample_shading, + const nir_shader *shader) +{ + unsigned barycentric_interp_modes = 0; + + nir_foreach_variable(var, &shader->inputs) { + enum glsl_interp_qualifier interp_qualifier = + (enum glsl_interp_qualifier)var->data.interpolation; + bool is_centroid = var->data.centroid && !persample_shading; + bool is_sample = var->data.sample || persample_shading; + bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) || + (var->data.location == VARYING_SLOT_COL1); + + /* Ignore WPOS and FACE, because they don't require interpolation. */ + if (var->data.location == VARYING_SLOT_POS || + var->data.location == VARYING_SLOT_FACE) + continue; + + /* Determine the set (or sets) of barycentric coordinates needed to + * interpolate this variable. Note that when + * brw->needs_unlit_centroid_workaround is set, centroid interpolation + * uses PIXEL interpolation for unlit pixels and CENTROID interpolation + * for lit pixels, so we need both sets of barycentric coordinates. 
+ */ + if (interp_qualifier == INTERP_QUALIFIER_NOPERSPECTIVE) { + if (is_centroid) { + barycentric_interp_modes |= + 1 << BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC; + } else if (is_sample) { + barycentric_interp_modes |= + 1 << BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC; + } + if ((!is_centroid && !is_sample) || + devinfo->needs_unlit_centroid_workaround) { + barycentric_interp_modes |= + 1 << BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC; + } + } else if (interp_qualifier == INTERP_QUALIFIER_SMOOTH || + (!(shade_model_flat && is_gl_Color) && + interp_qualifier == INTERP_QUALIFIER_NONE)) { + if (is_centroid) { + barycentric_interp_modes |= + 1 << BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC; + } else if (is_sample) { + barycentric_interp_modes |= + 1 << BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC; + } + if ((!is_centroid && !is_sample) || + devinfo->needs_unlit_centroid_workaround) { + barycentric_interp_modes |= + 1 << BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; + } + } + } + + return barycentric_interp_modes; +} + +static uint8_t +computed_depth_mode(const nir_shader *shader) +{ + if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { + switch (shader->info.fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_NONE: + case FRAG_DEPTH_LAYOUT_ANY: + return BRW_PSCDEPTH_ON; + case FRAG_DEPTH_LAYOUT_GREATER: + return BRW_PSCDEPTH_ON_GE; + case FRAG_DEPTH_LAYOUT_LESS: + return BRW_PSCDEPTH_ON_LE; + case FRAG_DEPTH_LAYOUT_UNCHANGED: + return BRW_PSCDEPTH_OFF; + } + } + return BRW_PSCDEPTH_OFF; +} + const unsigned * -brw_wm_fs_emit(struct brw_context *brw, +brw_compile_fs(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data, - struct gl_fragment_program *fp, - struct gl_shader_program *prog, + const nir_shader *shader, + struct gl_program *prog, int shader_time_index8, int shader_time_index16, - unsigned *final_assembly_size) + bool use_rep_send, + unsigned *final_assembly_size, + char **error_str) { - 
/* Now the main event: Visit the shader IR and generate our FS IR for it. + /* key->alpha_test_func means simulating alpha testing via discards, + * so the shader definitely kills pixels. */ - fs_visitor v(brw->intelScreen->compiler, brw, mem_ctx, key, - &prog_data->base, &fp->Base, fp->Base.nir, 8, shader_time_index8); + prog_data->uses_kill = shader->info.fs.uses_discard || key->alpha_test_func; + prog_data->uses_omask = + shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK); + prog_data->computed_depth_mode = computed_depth_mode(shader); + + prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; + + prog_data->barycentric_interp_modes = + brw_compute_barycentric_interp_modes(compiler->devinfo, + key->flat_shade, + key->persample_shading, + shader); + + fs_visitor v(compiler, log_data, mem_ctx, key, + &prog_data->base, prog, shader, 8, + shader_time_index8); if (!v.run_fs(false /* do_rep_send */)) { - if (prog) { - prog->LinkStatus = false; - ralloc_strcat(&prog->InfoLog, v.fail_msg); - } - - _mesa_problem(NULL, "Failed to compile fragment shader: %s\n", - v.fail_msg); + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); return NULL; } cfg_t *simd16_cfg = NULL; - fs_visitor v2(brw->intelScreen->compiler, brw, mem_ctx, key, - &prog_data->base, &fp->Base, fp->Base.nir, 16, shader_time_index16); - if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) { + fs_visitor v2(compiler, log_data, mem_ctx, key, + &prog_data->base, prog, shader, 16, + shader_time_index16); + if (likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) { if (!v.simd16_unsupported) { /* Try a SIMD16 compile */ v2.import_uniforms(&v); - if (!v2.run_fs(brw->use_rep_send)) { - perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg); + if (!v2.run_fs(use_rep_send)) { + compiler->shader_perf_log(log_data, + "SIMD16 shader failed to compile: %s", + v2.fail_msg); } else { simd16_cfg = v2.cfg; } @@ -5176,8 +5261,8 @@ brw_wm_fs_emit(struct 
brw_context *brw, } cfg_t *simd8_cfg; - int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8; - if ((no_simd8 || brw->gen < 5) && simd16_cfg) { + int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send; + if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) { simd8_cfg = NULL; prog_data->no_8 = true; } else { @@ -5185,20 +5270,14 @@ brw_wm_fs_emit(struct brw_context *brw, prog_data->no_8 = false; } - fs_generator g(brw->intelScreen->compiler, brw, - mem_ctx, (void *) key, &prog_data->base, - &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS"); + fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base, + v.promoted_constants, v.runtime_check_aads_emit, "FS"); if (unlikely(INTEL_DEBUG & DEBUG_WM)) { - char *name; - if (prog) - name = ralloc_asprintf(mem_ctx, "%s fragment shader %d", - prog->Label ? prog->Label : "unnamed", - prog->Name); - else - name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id); - - g.enable_debug(name); + g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s", + shader->info.label ? 
shader->info.label : + "unnamed", + shader->info.name)); } if (simd8_cfg) @@ -5283,29 +5362,32 @@ fs_visitor::emit_cs_work_group_id_setup() } const unsigned * -brw_cs_emit(struct brw_context *brw, - void *mem_ctx, - const struct brw_cs_prog_key *key, - struct brw_cs_prog_data *prog_data, - struct gl_compute_program *cp, - struct gl_shader_program *prog, - int shader_time_index, - unsigned *final_assembly_size) +brw_compile_cs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_cs_prog_key *key, + struct brw_cs_prog_data *prog_data, + const nir_shader *shader, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) { - prog_data->local_size[0] = cp->LocalSize[0]; - prog_data->local_size[1] = cp->LocalSize[1]; - prog_data->local_size[2] = cp->LocalSize[2]; + prog_data->local_size[0] = shader->info.cs.local_size[0]; + prog_data->local_size[1] = shader->info.cs.local_size[1]; + prog_data->local_size[2] = shader->info.cs.local_size[2]; unsigned local_workgroup_size = - cp->LocalSize[0] * cp->LocalSize[1] * cp->LocalSize[2]; - unsigned max_cs_threads = brw->intelScreen->compiler->devinfo->max_cs_threads; + shader->info.cs.local_size[0] * shader->info.cs.local_size[1] * + shader->info.cs.local_size[2]; + + unsigned max_cs_threads = compiler->devinfo->max_cs_threads; cfg_t *cfg = NULL; const char *fail_msg = NULL; /* Now the main event: Visit the shader IR and generate our CS IR for it. 
*/ - fs_visitor v8(brw->intelScreen->compiler, brw, mem_ctx, key, - &prog_data->base, &cp->Base, cp->Base.nir, 8, shader_time_index); + fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base, + NULL, /* Never used in core profile */ + shader, 8, shader_time_index); if (!v8.run_cs()) { fail_msg = v8.fail_msg; } else if (local_workgroup_size <= 8 * max_cs_threads) { @@ -5313,15 +5395,18 @@ brw_cs_emit(struct brw_context *brw, prog_data->simd_size = 8; } - fs_visitor v16(brw->intelScreen->compiler, brw, mem_ctx, key, - &prog_data->base, &cp->Base, cp->Base.nir, 16, shader_time_index); + fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base, + NULL, /* Never used in core profile */ + shader, 16, shader_time_index); if (likely(!(INTEL_DEBUG & DEBUG_NO16)) && !fail_msg && !v8.simd16_unsupported && local_workgroup_size <= 16 * max_cs_threads) { /* Try a SIMD16 compile */ v16.import_uniforms(&v8); if (!v16.run_cs()) { - perf_debug("SIMD16 shader failed to compile: %s", v16.fail_msg); + compiler->shader_perf_log(log_data, + "SIMD16 shader failed to compile: %s", + v16.fail_msg); if (!cfg) { fail_msg = "Couldn't generate SIMD16 program and not " @@ -5335,20 +5420,19 @@ brw_cs_emit(struct brw_context *brw, if (unlikely(cfg == NULL)) { assert(fail_msg); - prog->LinkStatus = false; - ralloc_strcat(&prog->InfoLog, fail_msg); - _mesa_problem(NULL, "Failed to compile compute shader: %s\n", - fail_msg); + if (error_str) + *error_str = ralloc_strdup(mem_ctx, fail_msg); + return NULL; } - fs_generator g(brw->intelScreen->compiler, brw, - mem_ctx, (void*) key, &prog_data->base, &cp->Base, + fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base, v8.promoted_constants, v8.runtime_check_aads_emit, "CS"); if (INTEL_DEBUG & DEBUG_CS) { - char *name = ralloc_asprintf(mem_ctx, "%s compute shader %d", - prog->Label ? prog->Label : "unnamed", - prog->Name); + char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s", + shader->info.label ? 
shader->info.label : + "unnamed", + shader->info.name); g.enable_debug(name); } diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index e8b511f9ce6..171338dcc0b 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -48,7 +48,7 @@ extern "C" { #include "brw_wm.h" #include "intel_asm_annotation.h" } -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir.h" #include "glsl/nir/nir.h" #include "program/sampler.h" @@ -96,7 +96,7 @@ public: const void *key, struct brw_stage_prog_data *prog_data, struct gl_program *prog, - nir_shader *shader, + const nir_shader *shader, unsigned dispatch_width, int shader_time_index); @@ -400,7 +400,6 @@ public: void *mem_ctx, const void *key, struct brw_stage_prog_data *prog_data, - struct gl_program *fp, unsigned promoted_constants, bool runtime_check_aads_emit, const char *stage_abbrev); @@ -499,8 +498,6 @@ private: const void * const key; struct brw_stage_prog_data * const prog_data; - const struct gl_program *prog; - unsigned dispatch_width; /**< 8 or 16 */ exec_list discard_halt_patches; diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp index 277b6cc3a60..a13d001291c 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp @@ -45,7 +45,7 @@ #include "brw_wm.h" #include "glsl/ir.h" #include "glsl/ir_expression_flattening.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" class ir_channel_expressions_visitor : public ir_hierarchical_visitor { public: diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index 230b0caec47..5589716239a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -275,17 +275,6 @@ is_logic_op(enum opcode 
opcode) opcode == BRW_OPCODE_NOT); } -static bool -can_change_source_types(fs_inst *inst) -{ - return !inst->src[0].abs && !inst->src[0].negate && - inst->dst.type == inst->src[0].type && - (inst->opcode == BRW_OPCODE_MOV || - (inst->opcode == BRW_OPCODE_SEL && - inst->predicate != BRW_PREDICATE_NONE && - !inst->src[1].abs && !inst->src[1].negate)); -} - bool fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) { @@ -368,7 +357,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) if (has_source_modifiers && entry->dst.type != inst->src[arg].type && - !can_change_source_types(inst)) + !inst->can_change_types()) return false; if (devinfo->gen >= 8 && (entry->src.negate || entry->src.abs) && @@ -438,7 +427,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) * type. If we got here, then we can just change the source and * destination types of the instruction and keep going. */ - assert(can_change_source_types(inst)); + assert(inst->can_change_types()); for (int i = 0; i < inst->sources; i++) { inst->src[i].type = entry->dst.type; } diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 6f8b75e339f..13c495cd395 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -131,7 +131,6 @@ fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, const void *key, struct brw_stage_prog_data *prog_data, - struct gl_program *prog, unsigned promoted_constants, bool runtime_check_aads_emit, const char *stage_abbrev) @@ -139,7 +138,7 @@ fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data, : compiler(compiler), log_data(log_data), devinfo(compiler->devinfo), key(key), prog_data(prog_data), - prog(prog), promoted_constants(promoted_constants), + promoted_constants(promoted_constants), runtime_check_aads_emit(runtime_check_aads_emit), 
debug_flag(false), stage_abbrev(stage_abbrev), mem_ctx(mem_ctx) { @@ -1377,15 +1376,14 @@ fs_generator::generate_pixel_interpolator_query(fs_inst *inst, struct brw_reg msg_data, unsigned msg_type) { - assert(msg_data.file == BRW_IMMEDIATE_VALUE && - msg_data.type == BRW_REGISTER_TYPE_UD); + assert(msg_data.type == BRW_REGISTER_TYPE_UD); brw_pixel_interpolator_query(p, retype(dst, BRW_REGISTER_TYPE_UW), src, inst->pi_noperspective, msg_type, - msg_data.dw1.ud, + msg_data, inst->mlen, inst->regs_written); } @@ -2188,7 +2186,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) 100.0f * (before_size - after_size) / before_size); dump_assembly(p->store, annotation.ann_count, annotation.ann, - p->devinfo, prog); + p->devinfo); ralloc_free(annotation.ann); } diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp index 19aec92fad1..ce066a9778e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp @@ -259,16 +259,15 @@ fs_live_variables::compute_start_end() struct block_data *bd = &block_data[block->num]; for (int i = 0; i < num_vars; i++) { - if (BITSET_TEST(bd->livein, i)) { - start[i] = MIN2(start[i], block->start_ip); - end[i] = MAX2(end[i], block->start_ip); - } - - if (BITSET_TEST(bd->liveout, i)) { - start[i] = MIN2(start[i], block->end_ip); - end[i] = MAX2(end[i], block->end_ip); - } + if (BITSET_TEST(bd->livein, i)) { + start[i] = MIN2(start[i], block->start_ip); + end[i] = MAX2(end[i], block->start_ip); + } + if (BITSET_TEST(bd->liveout, i)) { + start[i] = MIN2(start[i], block->end_ip); + end[i] = MAX2(end[i], block->end_ip); + } } } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 45c3f4ef3b4..feedbfbb2e3 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -56,61 +56,25 @@ fs_visitor::emit_nir_code() void 
fs_visitor::nir_setup_inputs() { + if (stage != MESA_SHADER_FRAGMENT) + return; + nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs); nir_foreach_variable(var, &nir->inputs) { - enum brw_reg_type type = brw_type_for_base_type(var->type); fs_reg input = offset(nir_inputs, bld, var->data.driver_location); fs_reg reg; - switch (stage) { - case MESA_SHADER_VERTEX: { - /* Our ATTR file is indexed by VERT_ATTRIB_*, which is the value - * stored in nir_variable::location. - * - * However, NIR's load_input intrinsics use a different index - an - * offset into a single contiguous array containing all inputs. - * This index corresponds to the nir_variable::driver_location field. - * - * So, we need to copy from fs_reg(ATTR, var->location) to - * offset(nir_inputs, var->data.driver_location). - */ - const glsl_type *const t = var->type->without_array(); - const unsigned components = t->components(); - const unsigned cols = t->matrix_columns; - const unsigned elts = t->vector_elements; - unsigned array_length = var->type->is_array() ? 
var->type->length : 1; - for (unsigned i = 0; i < array_length; i++) { - for (unsigned j = 0; j < cols; j++) { - for (unsigned k = 0; k < elts; k++) { - bld.MOV(offset(retype(input, type), bld, - components * i + elts * j + k), - offset(fs_reg(ATTR, var->data.location + i, type), - bld, 4 * j + k)); - } - } - } - break; - } - case MESA_SHADER_GEOMETRY: - case MESA_SHADER_COMPUTE: - case MESA_SHADER_TESS_CTRL: - case MESA_SHADER_TESS_EVAL: - unreachable("fs_visitor not used for these stages yet."); - break; - case MESA_SHADER_FRAGMENT: - if (var->data.location == VARYING_SLOT_POS) { - reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer, - var->data.origin_upper_left); - emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(), - input, reg), 0xF); - } else { - emit_general_interpolation(input, var->name, var->type, - (glsl_interp_qualifier) var->data.interpolation, - var->data.location, var->data.centroid, - var->data.sample); - } - break; + if (var->data.location == VARYING_SLOT_POS) { + reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer, + var->data.origin_upper_left); + emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(), + input, reg), 0xF); + } else { + emit_general_interpolation(input, var->name, var->type, + (glsl_interp_qualifier) var->data.interpolation, + var->data.location, var->data.centroid, + var->data.sample); } } } @@ -125,9 +89,7 @@ fs_visitor::nir_setup_outputs() nir_foreach_variable(var, &nir->outputs) { fs_reg reg = offset(nir_outputs, bld, var->data.driver_location); - int vector_elements = - var->type->is_array() ? 
var->type->fields.array->vector_elements - : var->type->vector_elements; + int vector_elements = var->type->without_array()->vector_elements; switch (stage) { case MESA_SHADER_VERTEX: @@ -1180,6 +1142,36 @@ get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type) } } +static fs_inst * +emit_pixel_interpolater_send(const fs_builder &bld, + enum opcode opcode, + const fs_reg &dst, + const fs_reg &src, + const fs_reg &desc, + glsl_interp_qualifier interpolation) +{ + fs_inst *inst; + fs_reg payload; + int mlen; + + if (src.file == BAD_FILE) { + /* Dummy payload */ + payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1); + mlen = 1; + } else { + payload = src; + mlen = 2 * bld.dispatch_width() / 8; + } + + inst = bld.emit(opcode, dst, payload, desc); + inst->mlen = mlen; + /* 2 floats per slot returned */ + inst->regs_written = 2 * bld.dispatch_width() / 8; + inst->pi_noperspective = interpolation == INTERP_QUALIFIER_NOPERSPECTIVE; + + return inst; +} + void fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) { @@ -1440,7 +1432,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr */ brw_mark_surface_used(prog_data, stage_prog_data->binding_table.ubo_start + - nir->info.num_ssbos - 1); + nir->info.num_ubos - 1); } if (has_indirect) { @@ -1488,21 +1480,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg surf_index; if (const_uniform_block) { - unsigned index = stage_prog_data->binding_table.ubo_start + + unsigned index = stage_prog_data->binding_table.ssbo_start + const_uniform_block->u[0]; surf_index = fs_reg(index); brw_mark_surface_used(prog_data, index); } else { surf_index = vgrf(glsl_type::uint_type); bld.ADD(surf_index, get_nir_src(instr->src[0]), - fs_reg(stage_prog_data->binding_table.ubo_start)); + fs_reg(stage_prog_data->binding_table.ssbo_start)); surf_index = bld.emit_uniformize(surf_index); /* Assume this may touch any UBO. 
It would be nice to provide * a tighter bound, but the array information is already lowered away. */ brw_mark_surface_used(prog_data, - stage_prog_data->binding_table.ubo_start + + stage_prog_data->binding_table.ssbo_start + nir->info.num_ssbos - 1); } @@ -1545,8 +1537,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_load_input: { unsigned index = 0; for (unsigned j = 0; j < instr->num_components; j++) { - fs_reg src = offset(retype(nir_inputs, dest.type), bld, - instr->const_index[0] + index); + fs_reg src; + if (stage == MESA_SHADER_VERTEX) { + src = offset(fs_reg(ATTR, instr->const_index[0], dest.type), bld, index); + } else { + src = offset(retype(nir_inputs, dest.type), bld, + instr->const_index[0] + index); + } if (has_indirect) src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0])); index++; @@ -1583,28 +1580,81 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr ((struct brw_wm_prog_data *) prog_data)->pulls_bary = true; fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2); - - /* For most messages, we need one reg of ignored data; the hardware - * requires mlen==1 even when there is no payload. in the per-slot - * offset case, we'll replace this with the proper source data. 
- */ - fs_reg src = vgrf(glsl_type::float_type); - int mlen = 1; /* one reg unless overriden */ - fs_inst *inst; + const glsl_interp_qualifier interpolation = + (glsl_interp_qualifier) instr->variables[0]->var->data.interpolation; switch (instr->intrinsic) { case nir_intrinsic_interp_var_at_centroid: - inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, - dst_xy, src, fs_reg(0u)); + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_CENTROID, + dst_xy, + fs_reg(), /* src */ + fs_reg(0u), + interpolation); break; case nir_intrinsic_interp_var_at_sample: { - /* XXX: We should probably handle non-constant sample id's */ nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]); - assert(const_sample); - unsigned msg_data = const_sample ? const_sample->i[0] << 4 : 0; - inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src, - fs_reg(msg_data)); + + if (const_sample) { + unsigned msg_data = const_sample->i[0] << 4; + + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dst_xy, + fs_reg(), /* src */ + fs_reg(msg_data), + interpolation); + } else { + const fs_reg sample_src = retype(get_nir_src(instr->src[0]), + BRW_REGISTER_TYPE_UD); + + if (nir_src_is_dynamically_uniform(instr->src[0])) { + const fs_reg sample_id = bld.emit_uniformize(sample_src); + const fs_reg msg_data = vgrf(glsl_type::uint_type); + bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u)); + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dst_xy, + fs_reg(), /* src */ + msg_data, + interpolation); + } else { + /* Make a loop that sends a message to the pixel interpolater + * for the sample number in each live channel. If there are + * multiple channels with the same sample number then these + * will be handled simultaneously with a single interation of + * the loop. 
+ */ + bld.emit(BRW_OPCODE_DO); + + /* Get the next live sample number into sample_id_reg */ + const fs_reg sample_id = bld.emit_uniformize(sample_src); + + /* Set the flag register so that we can perform the send + * message on all channels that have the same sample number + */ + bld.CMP(bld.null_reg_ud(), + sample_src, sample_id, + BRW_CONDITIONAL_EQ); + const fs_reg msg_data = vgrf(glsl_type::uint_type); + bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u)); + fs_inst *inst = + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dst_xy, + fs_reg(), /* src */ + msg_data, + interpolation); + set_predicate(BRW_PREDICATE_NORMAL, inst); + + /* Continue the loop if there are any live channels left */ + set_predicate_inv(BRW_PREDICATE_NORMAL, + true, /* inverse */ + bld.emit(BRW_OPCODE_WHILE)); + } + } + break; } @@ -1615,10 +1665,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf; unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf; - inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src, - fs_reg(off_x | (off_y << 4))); + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, + dst_xy, + fs_reg(), /* src */ + fs_reg(off_x | (off_y << 4)), + interpolation); } else { - src = vgrf(glsl_type::ivec2_type); + fs_reg src = vgrf(glsl_type::ivec2_type); fs_reg offset_src = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); for (int i = 0; i < 2; i++) { @@ -1646,9 +1700,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr bld.SEL(offset(src, bld, i), itemp, fs_reg(7))); } - mlen = 2 * dispatch_width / 8; - inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src, - fs_reg(0u)); + const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; + emit_pixel_interpolater_send(bld, + opcode, + dst_xy, + src, + fs_reg(0u), + interpolation); } break; } 
@@ -1657,12 +1715,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr unreachable("Invalid intrinsic"); } - inst->mlen = mlen; - /* 2 floats per slot returned */ - inst->regs_written = 2 * dispatch_width / 8; - inst->pi_noperspective = instr->variables[0]->var->data.interpolation == - INTERP_QUALIFIER_NOPERSPECTIVE; - for (unsigned j = 0; j < instr->num_components; j++) { fs_reg src = interp_reg(instr->variables[0]->var->data.location, j); src.type = dest.type; @@ -1684,18 +1736,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[1]); if (const_uniform_block) { - unsigned index = stage_prog_data->binding_table.ubo_start + + unsigned index = stage_prog_data->binding_table.ssbo_start + const_uniform_block->u[0]; surf_index = fs_reg(index); brw_mark_surface_used(prog_data, index); } else { surf_index = vgrf(glsl_type::uint_type); bld.ADD(surf_index, get_nir_src(instr->src[1]), - fs_reg(stage_prog_data->binding_table.ubo_start)); + fs_reg(stage_prog_data->binding_table.ssbo_start)); surf_index = bld.emit_uniformize(surf_index); brw_mark_surface_used(prog_data, - stage_prog_data->binding_table.ubo_start + + stage_prog_data->binding_table.ssbo_start + nir->info.num_ssbos - 1); } @@ -1780,17 +1832,17 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_ssbo_atomic_add: nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr); break; - case nir_intrinsic_ssbo_atomic_min: - if (dest.type == BRW_REGISTER_TYPE_D) - nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr); - else - nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr); + case nir_intrinsic_ssbo_atomic_imin: + nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr); break; - case nir_intrinsic_ssbo_atomic_max: - if (dest.type == BRW_REGISTER_TYPE_D) - nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr); - else - nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr); + 
case nir_intrinsic_ssbo_atomic_umin: + nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr); + break; + case nir_intrinsic_ssbo_atomic_imax: + nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr); + break; + case nir_intrinsic_ssbo_atomic_umax: + nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr); break; case nir_intrinsic_ssbo_atomic_and: nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr); @@ -1810,7 +1862,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_get_buffer_size: { nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]); - unsigned ubo_index = const_uniform_block ? const_uniform_block->u[0] : 0; + unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0; int reg_width = dispatch_width / 8; /* Set LOD = 0 */ @@ -1821,7 +1873,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr BRW_REGISTER_TYPE_UD); bld.LOAD_PAYLOAD(src_payload, &source, 1, 0); - fs_reg surf_index = fs_reg(prog_data->binding_table.ubo_start + ubo_index); + fs_reg surf_index = fs_reg(prog_data->binding_table.ssbo_start + ssbo_index); fs_inst *inst = bld.emit(FS_OPCODE_GET_BUFFER_SIZE, dest, src_payload, surf_index); inst->header_size = 0; @@ -1874,20 +1926,20 @@ fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, fs_reg surface; nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]); if (const_surface) { - unsigned surf_index = stage_prog_data->binding_table.ubo_start + + unsigned surf_index = stage_prog_data->binding_table.ssbo_start + const_surface->u[0]; surface = fs_reg(surf_index); brw_mark_surface_used(prog_data, surf_index); } else { surface = vgrf(glsl_type::uint_type); bld.ADD(surface, get_nir_src(instr->src[0]), - fs_reg(stage_prog_data->binding_table.ubo_start)); + fs_reg(stage_prog_data->binding_table.ssbo_start)); - /* Assume this may touch any UBO. This is the same we do for other + /* Assume this may touch any SSBO. 
This is the same we do for other * UBO/SSBO accesses with non-constant surface. */ brw_mark_surface_used(prog_data, - stage_prog_data->binding_table.ubo_start + + stage_prog_data->binding_table.ssbo_start + nir->info.num_ssbos - 1); } diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index c3a037be4b1..36388fad98d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -27,7 +27,7 @@ #include "brw_fs.h" #include "brw_cfg.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir_optimization.h" using namespace brw; diff --git a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp index e406c2899e8..8792a8c7b1d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp @@ -52,11 +52,12 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block) ip--; if (inst->opcode != BRW_OPCODE_MOV || + !inst->saturate || inst->dst.file != GRF || + inst->dst.type != inst->src[0].type || inst->src[0].file != GRF || inst->src[0].abs || - inst->src[0].negate || - !inst->saturate) + inst->src[0].negate) continue; int src_var = v->live_intervals->var_from_reg(inst->src[0]); @@ -65,7 +66,9 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block) bool interfered = false; foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) { if (scan_inst->overwrites_reg(inst->src[0])) { - if (scan_inst->is_partial_write()) + if (scan_inst->is_partial_write() || + (scan_inst->dst.type != inst->dst.type && + !scan_inst->can_change_types())) break; if (scan_inst->saturate) { @@ -73,6 +76,12 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block) progress = true; } else if (src_end_ip <= ip || inst->dst.equals(inst->src[0])) { if (scan_inst->can_do_saturate()) { + 
if (scan_inst->dst.type != inst->dst.type) { + scan_inst->dst.type = inst->dst.type; + for (int i = 0; i < scan_inst->sources; i++) { + scan_inst->src[i].type = inst->dst.type; + } + } scan_inst->saturate = true; inst->saturate = false; progress = true; diff --git a/src/mesa/drivers/dri/i965/brw_fs_validate.cpp b/src/mesa/drivers/dri/i965/brw_fs_validate.cpp index d0e04f3bf47..814c551f1be 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_validate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_validate.cpp @@ -32,7 +32,7 @@ #define fsv_assert(cond) \ if (!(cond)) { \ - fprintf(stderr, "ASSERT: FS validation failed!\n"); \ + fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", stage_abbrev); \ dump_instruction(inst, stderr); \ fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, #cond); \ abort(); \ diff --git a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp index 6000e35b9b9..cab5af318a2 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp @@ -42,7 +42,7 @@ #include "glsl/ir.h" #include "glsl/ir_visitor.h" #include "glsl/ir_rvalue_visitor.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "util/hash_table.h" static bool debug = false; diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index df1a7ed9b59..f825fed4daf 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -43,7 +43,7 @@ #include "brw_vec4.h" #include "brw_fs.h" #include "main/uniforms.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir_optimization.h" #include "program/sampler.h" @@ -53,7 +53,8 @@ fs_reg * fs_visitor::emit_vs_system_value(int location) { fs_reg *reg = new(this->mem_ctx) - fs_reg(ATTR, VERT_ATTRIB_MAX, BRW_REGISTER_TYPE_D); + fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info.inputs_read), + 
BRW_REGISTER_TYPE_D); brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data; switch (location) { @@ -903,12 +904,9 @@ fs_visitor::emit_urb_writes() urb_offset = 0; flush = false; for (slot = 0; slot < vue_map->num_slots; slot++) { - fs_reg reg, src, zero; - int varying = vue_map->slot_to_varying[slot]; switch (varying) { - case VARYING_SLOT_PSIZ: - + case VARYING_SLOT_PSIZ: { /* The point size varying slot is the vue header and is always in the * vue map. But often none of the special varyings that live there * are written and in that case we can skip writing to the vue @@ -920,7 +918,7 @@ fs_visitor::emit_urb_writes() break; } - zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg zero(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); bld.MOV(zero, fs_reg(0u)); sources[length++] = zero; @@ -939,7 +937,7 @@ fs_visitor::emit_urb_writes() else sources[length++] = zero; break; - + } case BRW_VARYING_SLOT_NDC: case VARYING_SLOT_EDGE: unreachable("unexpected scalar vs output"); @@ -972,8 +970,8 @@ fs_visitor::emit_urb_writes() * temp register and use that for the payload. 
*/ for (int i = 0; i < 4; i++) { - reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type); - src = offset(this->outputs[varying], bld, i); + fs_reg reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type); + fs_reg src = offset(this->outputs[varying], bld, i); set_saturate(true, bld.MOV(reg, src)); sources[length++] = reg; } @@ -1069,7 +1067,7 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data, const void *key, struct brw_stage_prog_data *prog_data, struct gl_program *prog, - nir_shader *shader, + const nir_shader *shader, unsigned dispatch_width, int shader_time_index) : backend_shader(compiler, log_data, mem_ctx, shader, prog_data), diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c index e0165fb4a23..10a7f28fdab 100644 --- a/src/mesa/drivers/dri/i965/brw_gs.c +++ b/src/mesa/drivers/dri/i965/brw_gs.c @@ -57,6 +57,7 @@ brw_codegen_gs_prog(struct brw_context *brw, struct brw_geometry_program *gp, struct brw_gs_prog_key *key) { + struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY]; struct brw_stage_state *stage_state = &brw->gs.base; struct brw_gs_compile c; memset(&c, 0, sizeof(c)); @@ -300,8 +301,11 @@ brw_codegen_gs_prog(struct brw_context *brw, void *mem_ctx = ralloc_context(NULL); unsigned program_size; + char *error_str; const unsigned *program = - brw_gs_emit(brw, prog, &c, mem_ctx, st_index, &program_size); + brw_compile_gs(brw->intelScreen->compiler, brw, &c, + shader->Program->nir, prog, + mem_ctx, st_index, &program_size, &error_str); if (program == NULL) { ralloc_free(mem_ctx); return false; diff --git a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c index 0bb307432d0..00125c0f405 100644 --- a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c @@ -129,7 +129,7 @@ brw_upload_gs_image_surfaces(struct brw_context *brw) 
ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY]; if (prog) { - /* BRW_NEW_GS_PROG_DATA, BRW_NEW_IMAGE_UNITS */ + /* BRW_NEW_GS_PROG_DATA, BRW_NEW_IMAGE_UNITS, _NEW_TEXTURE */ brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_GEOMETRY], &brw->gs.base, &brw->gs.prog_data->base.base); } @@ -137,6 +137,7 @@ brw_upload_gs_image_surfaces(struct brw_context *brw) const struct brw_tracked_state brw_gs_image_surfaces = { .dirty = { + .mesa = _NEW_TEXTURE, .brw = BRW_NEW_BATCH | BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_GS_PROG_DATA | diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h index 97c6f8b2500..7726e4b78a0 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_fs.h +++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h @@ -204,6 +204,7 @@ public: unsigned components_read(unsigned i) const; int regs_read(int arg) const; bool can_do_source_mods(const struct brw_device_info *devinfo); + bool can_change_types() const; bool has_side_effects() const; bool reads_flag() const; diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h index 96dd633e117..1b57b65db27 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h @@ -179,6 +179,7 @@ public: int swizzle, int swizzle_mask); void reswizzle(int dst_writemask, int swizzle); bool can_do_source_mods(const struct brw_device_info *devinfo); + bool can_change_types() const; bool reads_flag() { diff --git a/src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp b/src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp index 8c59b9e415b..4219d471def 100644 --- a/src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp +++ b/src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp @@ -31,7 +31,7 @@ * \author Chris Forbes <[email protected]> */ -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir.h" #include "glsl/ir_builder.h" diff --git 
a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c index eb201736c6e..fbde3f04204 100644 --- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c +++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c @@ -451,6 +451,11 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb, if (irb->mt->fast_clear_state == INTEL_FAST_CLEAR_STATE_NO_MCS) clear_type = REP_CLEAR; + if (brw->gen >= 9 && clear_type == FAST_CLEAR) { + perf_debug("fast MCS clears are disabled on gen9"); + clear_type = REP_CLEAR; + } + /* We can't do scissored fast clears because of the restrictions on the * fast clear rectangle size. */ diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index 0a9c09f1075..dc497770914 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -27,30 +27,112 @@ #include "glsl/nir/glsl_to_nir.h" #include "program/prog_to_nir.h" +static bool +remap_vs_attrs(nir_block *block, void *closure) +{ + GLbitfield64 inputs_read = *((GLbitfield64 *) closure); + + nir_foreach_instr(block, instr) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + /* We set EmitNoIndirect for VS inputs, so there are no indirects. */ + assert(intrin->intrinsic != nir_intrinsic_load_input_indirect); + + if (intrin->intrinsic == nir_intrinsic_load_input) { + /* Attributes come in a contiguous block, ordered by their + * gl_vert_attrib value. That means we can compute the slot + * number for an attribute by masking out the enabled attributes + * before it and counting the bits. 
+ */ + int attr = intrin->const_index[0]; + int slot = _mesa_bitcount_64(inputs_read & BITFIELD64_MASK(attr)); + intrin->const_index[0] = 4 * slot; + } + } + return true; +} + static void brw_nir_lower_inputs(nir_shader *nir, bool is_scalar) { switch (nir->stage) { + case MESA_SHADER_VERTEX: + /* For now, leave the vec4 backend doing the old method. */ + if (!is_scalar) { + nir_assign_var_locations(&nir->inputs, &nir->num_inputs, + type_size_vec4); + break; + } + + /* Start with the location of the variable's base. */ + foreach_list_typed(nir_variable, var, node, &nir->inputs) { + var->data.driver_location = var->data.location; + } + + /* Now use nir_lower_io to walk dereference chains. Attribute arrays + * are loaded as one vec4 per element (or matrix column), so we use + * type_size_vec4 here. + */ + nir_lower_io(nir, nir_var_shader_in, type_size_vec4); + + /* Finally, translate VERT_ATTRIB_* values into the actual registers. + * + * Note that we can use nir->info.inputs_read instead of key->inputs_read + * since the two are identical aside from Gen4-5 edge flag differences. + */ + GLbitfield64 inputs_read = nir->info.inputs_read; + nir_foreach_overload(nir, overload) { + if (overload->impl) { + nir_foreach_block(overload->impl, remap_vs_attrs, &inputs_read); + } + } + break; case MESA_SHADER_GEOMETRY: foreach_list_typed(nir_variable, var, node, &nir->inputs) { var->data.driver_location = var->data.location; } break; - default: + case MESA_SHADER_FRAGMENT: + assert(is_scalar); nir_assign_var_locations(&nir->inputs, &nir->num_inputs, - is_scalar ? type_size_scalar : type_size_vec4); + type_size_scalar); + break; + case MESA_SHADER_COMPUTE: + /* Compute shaders have no inputs. 
*/ + assert(exec_list_is_empty(&nir->inputs)); break; + default: + unreachable("unsupported shader stage"); } } static void brw_nir_lower_outputs(nir_shader *nir, bool is_scalar) { - if (is_scalar) { - nir_assign_var_locations(&nir->outputs, &nir->num_outputs, type_size_scalar); - } else { - nir_foreach_variable(var, &nir->outputs) - var->data.driver_location = var->data.location; + switch (nir->stage) { + case MESA_SHADER_VERTEX: + case MESA_SHADER_GEOMETRY: + if (is_scalar) { + nir_assign_var_locations(&nir->outputs, &nir->num_outputs, + type_size_scalar); + } else { + nir_foreach_variable(var, &nir->outputs) + var->data.driver_location = var->data.location; + } + break; + case MESA_SHADER_FRAGMENT: + nir_assign_var_locations(&nir->outputs, &nir->num_outputs, + type_size_scalar); + break; + case MESA_SHADER_COMPUTE: + /* Compute shaders have no outputs. */ + assert(exec_list_is_empty(&nir->outputs)); + break; + default: + unreachable("unsupported shader stage"); } } diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index dbd0e50228b..22b0227756e 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -69,8 +69,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx, if (prog) { prog->id = get_new_program_id(brw->intelScreen); - return _mesa_init_vertex_program( ctx, &prog->program, - target, id ); + return _mesa_init_gl_program(&prog->program.Base, target, id); } else return NULL; @@ -81,8 +80,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx, if (prog) { prog->id = get_new_program_id(brw->intelScreen); - return _mesa_init_fragment_program( ctx, &prog->program, - target, id ); + return _mesa_init_gl_program(&prog->program.Base, target, id); } else return NULL; @@ -93,7 +91,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx, if (prog) { prog->id = get_new_program_id(brw->intelScreen); - return _mesa_init_geometry_program(ctx, 
&prog->program, target, id); + return _mesa_init_gl_program(&prog->program, target, id); } else { return NULL; } @@ -104,7 +102,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx, if (prog) { prog->id = get_new_program_id(brw->intelScreen); - return _mesa_init_compute_program(ctx, &prog->program, target, id); + return _mesa_init_gl_program(&prog->program.Base, target, id); } else { return NULL; } diff --git a/src/mesa/drivers/dri/i965/brw_program.h b/src/mesa/drivers/dri/i965/brw_program.h index cf0522a8b10..f8cf2b062c8 100644 --- a/src/mesa/drivers/dri/i965/brw_program.h +++ b/src/mesa/drivers/dri/i965/brw_program.h @@ -24,129 +24,7 @@ #ifndef BRW_PROGRAM_H #define BRW_PROGRAM_H -/** - * Program key structures. - * - * When drawing, we look for the currently bound shaders in the program - * cache. This is essentially a hash table lookup, and these are the keys. - * - * Sometimes OpenGL features specified as state need to be simulated via - * shader code, due to a mismatch between the API and the hardware. This - * is often referred to as "non-orthagonal state" or "NOS". We store NOS - * in the program key so it's considered when searching for a program. If - * we haven't seen a particular combination before, we have to recompile a - * new specialized version. - * - * Shader compilation should not look up state in gl_context directly, but - * instead use the copy in the program key. This guarantees recompiles will - * happen correctly. - * - * @{ - */ - -enum PACKED gen6_gather_sampler_wa { - WA_SIGN = 1, /* whether we need to sign extend */ - WA_8BIT = 2, /* if we have an 8bit format needing wa */ - WA_16BIT = 4, /* if we have a 16bit format needing wa */ -}; - -/** - * Sampler information needed by VS, WM, and GS program cache keys. - */ -struct brw_sampler_prog_key_data { - /** - * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles. 
- */ - uint16_t swizzles[MAX_SAMPLERS]; - - uint32_t gl_clamp_mask[3]; - - /** - * For RG32F, gather4's channel select is broken. - */ - uint32_t gather_channel_quirk_mask; - - /** - * Whether this sampler uses the compressed multisample surface layout. - */ - uint32_t compressed_multisample_layout_mask; - - /** - * For Sandybridge, which shader w/a we need for gather quirks. - */ - enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS]; -}; - - -/** The program key for Vertex Shaders. */ -struct brw_vs_prog_key { - unsigned program_string_id; - - /* - * Per-attribute workaround flags - */ - uint8_t gl_attrib_wa_flags[VERT_ATTRIB_MAX]; - - bool copy_edgeflag:1; - - bool clamp_vertex_color:1; - - /** - * How many user clipping planes are being uploaded to the vertex shader as - * push constants. - * - * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to - * clip distances. - */ - unsigned nr_userclip_plane_consts:4; - - /** - * For pre-Gen6 hardware, a bitfield indicating which texture coordinates - * are going to be replaced with point coordinates (as a consequence of a - * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)). Because - * our SF thread requires exact matching between VS outputs and FS inputs, - * these texture coordinates will need to be unconditionally included in - * the VUE, even if they aren't written by the vertex shader. - */ - uint8_t point_coord_replace; - - struct brw_sampler_prog_key_data tex; -}; - -/** The program key for Geometry Shaders. */ -struct brw_gs_prog_key -{ - unsigned program_string_id; - - struct brw_sampler_prog_key_data tex; -}; - -/** The program key for Fragment/Pixel Shaders. 
*/ -struct brw_wm_prog_key { - uint8_t iz_lookup; - bool stats_wm:1; - bool flat_shade:1; - bool persample_shading:1; - bool persample_2x:1; - unsigned nr_color_regions:5; - bool replicate_alpha:1; - bool render_to_fbo:1; - bool clamp_fragment_color:1; - bool compute_pos_offset:1; - bool compute_sample_id:1; - unsigned line_aa:2; - bool high_quality_derivatives:1; - - uint16_t drawable_height; - uint64_t input_slots_valid; - unsigned program_string_id; - GLenum alpha_test_func; /* < For Gen4/5 MRT alpha test */ - float alpha_test_ref; - - struct brw_sampler_prog_key_data tex; -}; - -/** @} */ +#include "brw_compiler.h" #ifdef __cplusplus extern "C" { diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c index c2db5f69560..6d73444dad0 100644 --- a/src/mesa/drivers/dri/i965/brw_sampler_state.c +++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c @@ -44,6 +44,7 @@ #include "main/macros.h" #include "main/samplerobj.h" +#include "util/half_float.h" /** * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet. 
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index 4e43e5ccdbd..b710c60148c 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -29,7 +29,7 @@ #include "brw_vec4.h" #include "brw_cfg.h" #include "brw_shader.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir_optimization.h" using namespace brw; diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 3a58a58a00b..6be2a6e5b55 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -660,7 +660,7 @@ brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg) backend_shader::backend_shader(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, - nir_shader *shader, + const nir_shader *shader, struct brw_stage_prog_data *stage_prog_data) : compiler(compiler), log_data(log_data), @@ -1131,11 +1131,16 @@ brw_assign_common_binding_table_offsets(gl_shader_stage stage, next_binding_table_offset += num_textures; if (shader) { - assert(shader->NumUniformBlocks <= BRW_MAX_COMBINED_UBO_SSBO); + assert(shader->NumUniformBlocks <= BRW_MAX_UBO); stage_prog_data->binding_table.ubo_start = next_binding_table_offset; next_binding_table_offset += shader->NumUniformBlocks; + + assert(shader->NumShaderStorageBlocks <= BRW_MAX_SSBO); + stage_prog_data->binding_table.ssbo_start = next_binding_table_offset; + next_binding_table_offset += shader->NumShaderStorageBlocks; } else { stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0; + stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0; } if (INTEL_DEBUG & DEBUG_SHADER_TIME) { diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h index ad2de5eae2d..b33b08f40d7 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.h +++ 
b/src/mesa/drivers/dri/i965/brw_shader.h @@ -38,64 +38,6 @@ #define MAX_SAMPLER_MESSAGE_SIZE 11 #define MAX_VGRF_SIZE 16 -struct brw_compiler { - const struct brw_device_info *devinfo; - - struct { - struct ra_regs *regs; - - /** - * Array of the ra classes for the unaligned contiguous register - * block sizes used. - */ - int *classes; - - /** - * Mapping for register-allocated objects in *regs to the first - * GRF for that object. - */ - uint8_t *ra_reg_to_grf; - } vec4_reg_set; - - struct { - struct ra_regs *regs; - - /** - * Array of the ra classes for the unaligned contiguous register - * block sizes used, indexed by register size. - */ - int classes[16]; - - /** - * Mapping from classes to ra_reg ranges. Each of the per-size - * classes corresponds to a range of ra_reg nodes. This array stores - * those ranges in the form of first ra_reg in each class and the - * total number of ra_reg elements in the last array element. This - * way the range of the i'th class is given by: - * [ class_to_ra_reg_range[i], class_to_ra_reg_range[i+1] ) - */ - int class_to_ra_reg_range[17]; - - /** - * Mapping for register-allocated objects in *regs to the first - * GRF for that object. - */ - uint8_t *ra_reg_to_grf; - - /** - * ra class for the aligned pairs we use for PLN, which doesn't - * appear in *classes. - */ - int aligned_pairs_class; - } fs_reg_sets[2]; - - void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3); - void (*shader_perf_log)(void *, const char *str, ...) 
PRINTFLIKE(2, 3); - - bool scalar_vs; - struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES]; -}; - enum PACKED register_file { BAD_FILE, GRF, @@ -225,7 +167,7 @@ protected: backend_shader(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, - nir_shader *shader, + const nir_shader *shader, struct brw_stage_prog_data *stage_prog_data); public: @@ -234,7 +176,7 @@ public: void *log_data; /* Passed to compiler->*_log functions */ const struct brw_device_info * const devinfo; - nir_shader *nir; + const nir_shader *nir; struct brw_stage_prog_data * const stage_prog_data; /** ralloc context for temporary data used during compile */ diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index e966b96a5ca..befc92445d3 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -280,6 +280,18 @@ vec4_instruction::can_do_source_mods(const struct brw_device_info *devinfo) return true; } +bool +vec4_instruction::can_change_types() const +{ + return dst.type == src[0].type && + !src[0].abs && !src[0].negate && !saturate && + (opcode == BRW_OPCODE_MOV || + (opcode == BRW_OPCODE_SEL && + dst.type == src[1].type && + predicate != BRW_PREDICATE_NONE && + !src[1].abs && !src[1].negate)); +} + /** * Returns how many MRFs an opcode will write over. * @@ -1632,28 +1644,11 @@ vec4_vs_visitor::setup_attributes(int payload_reg) */ if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) { attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes; - nr_attributes++; } lower_attributes_to_hw_regs(attribute_map, false /* interleaved */); - /* The BSpec says we always have to read at least one thing from - * the VF, and it appears that the hardware wedges otherwise. 
- */ - if (nr_attributes == 0) - nr_attributes = 1; - - prog_data->urb_read_length = (nr_attributes + 1) / 2; - - unsigned vue_entries = - MAX2(nr_attributes, prog_data->vue_map.num_slots); - - if (devinfo->gen == 6) - prog_data->urb_entry_size = ALIGN(vue_entries, 8) / 8; - else - prog_data->urb_entry_size = ALIGN(vue_entries, 4) / 4; - - return payload_reg + nr_attributes; + return payload_reg + vs_prog_data->nr_attributes; } int @@ -1937,51 +1932,76 @@ extern "C" { * Returns the final assembly and the program's size. */ const unsigned * -brw_vs_emit(struct brw_context *brw, - void *mem_ctx, - const struct brw_vs_prog_key *key, - struct brw_vs_prog_data *prog_data, - struct gl_vertex_program *vp, - struct gl_shader_program *prog, - int shader_time_index, - unsigned *final_assembly_size) +brw_compile_vs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_vs_prog_key *key, + struct brw_vs_prog_data *prog_data, + const nir_shader *shader, + gl_clip_plane *clip_planes, + bool use_legacy_snorm_formula, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) { const unsigned *assembly = NULL; - if (brw->intelScreen->compiler->scalar_vs) { + unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read); + + /* gl_VertexID and gl_InstanceID are system values, but arrive via an + * incoming vertex attribute. So, add an extra slot. + */ + if (shader->info.system_values_read & + (BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) | + BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) { + nr_attributes++; + } + + /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry + * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in + * vec4 mode, the hardware appears to wedge unless we read something. 
+ */ + if (compiler->scalar_vs) + prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2); + else + prog_data->base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attributes, 1), 2); + + prog_data->nr_attributes = nr_attributes; + + /* Since vertex shaders reuse the same VUE entry for inputs and outputs + * (overwriting the original contents), we need to make sure the size is + * the larger of the two. + */ + const unsigned vue_entries = + MAX2(nr_attributes, (unsigned)prog_data->base.vue_map.num_slots); + + if (compiler->devinfo->gen == 6) + prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8); + else + prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); + + if (compiler->scalar_vs) { prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; - fs_visitor v(brw->intelScreen->compiler, brw, - mem_ctx, key, &prog_data->base.base, + fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base, NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */ - vp->Base.nir, 8, shader_time_index); - if (!v.run_vs(brw_select_clip_planes(&brw->ctx))) { - if (prog) { - prog->LinkStatus = false; - ralloc_strcat(&prog->InfoLog, v.fail_msg); - } - - _mesa_problem(NULL, "Failed to compile vertex shader: %s\n", - v.fail_msg); + shader, 8, shader_time_index); + if (!v.run_vs(clip_planes)) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); return NULL; } - fs_generator g(brw->intelScreen->compiler, brw, - mem_ctx, (void *) key, &prog_data->base.base, - &vp->Base, v.promoted_constants, + fs_generator g(compiler, log_data, mem_ctx, (void *) key, + &prog_data->base.base, v.promoted_constants, v.runtime_check_aads_emit, "VS"); if (INTEL_DEBUG & DEBUG_VS) { - char *name; - if (prog) { - name = ralloc_asprintf(mem_ctx, "%s vertex shader %d", - prog->Label ? 
prog->Label : "unnamed", - prog->Name); - } else { - name = ralloc_asprintf(mem_ctx, "vertex program %d", - vp->Base.Id); - } - g.enable_debug(name); + const char *debug_name = + ralloc_asprintf(mem_ctx, "%s vertex shader %s", + shader->info.label ? shader->info.label : "unnamed", + shader->info.name); + + g.enable_debug(debug_name); } g.generate_code(v.cfg, 8); assembly = g.get_assembly(final_assembly_size); @@ -1990,26 +2010,19 @@ brw_vs_emit(struct brw_context *brw, if (!assembly) { prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; - vec4_vs_visitor v(brw->intelScreen->compiler, brw, key, prog_data, - vp->Base.nir, brw_select_clip_planes(&brw->ctx), - mem_ctx, shader_time_index, - !_mesa_is_gles3(&brw->ctx)); + vec4_vs_visitor v(compiler, log_data, key, prog_data, + shader, clip_planes, mem_ctx, + shader_time_index, use_legacy_snorm_formula); if (!v.run()) { - if (prog) { - prog->LinkStatus = false; - ralloc_strcat(&prog->InfoLog, v.fail_msg); - } - - _mesa_problem(NULL, "Failed to compile vertex shader: %s\n", - v.fail_msg); + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); return NULL; } - vec4_generator g(brw->intelScreen->compiler, brw, - prog, &vp->Base, &prog_data->base, + vec4_generator g(compiler, log_data, &prog_data->base, mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS"); - assembly = g.generate_assembly(v.cfg, final_assembly_size); + assembly = g.generate_assembly(v.cfg, final_assembly_size, shader); } return assembly; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index 5e3500c0c9a..d861b2e85df 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -72,7 +72,7 @@ public: void *log_data, const struct brw_sampler_prog_key_data *key, struct brw_vue_prog_data *prog_data, - nir_shader *shader, + const nir_shader *shader, void *mem_ctx, bool no_spills, int shader_time_index); @@ -391,8 +391,6 @@ class vec4_generator { public: 
vec4_generator(const struct brw_compiler *compiler, void *log_data, - struct gl_shader_program *shader_prog, - struct gl_program *prog, struct brw_vue_prog_data *prog_data, void *mem_ctx, bool debug_flag, @@ -400,10 +398,11 @@ public: const char *stage_abbrev); ~vec4_generator(); - const unsigned *generate_assembly(const cfg_t *cfg, unsigned *asm_size); + const unsigned *generate_assembly(const cfg_t *cfg, unsigned *asm_size, + const nir_shader *nir); private: - void generate_code(const cfg_t *cfg); + void generate_code(const cfg_t *cfg, const nir_shader *nir); void generate_math1_gen4(vec4_instruction *inst, struct brw_reg dst, @@ -485,9 +484,6 @@ private: struct brw_codegen *p; - struct gl_shader_program *shader_prog; - const struct gl_program *prog; - struct brw_vue_prog_data *prog_data; void *mem_ctx; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp index 610caef7dce..db99ecba35a 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp @@ -256,18 +256,6 @@ try_constant_propagate(const struct brw_device_info *devinfo, } static bool -can_change_source_types(vec4_instruction *inst) -{ - return inst->dst.type == inst->src[0].type && - !inst->src[0].abs && !inst->src[0].negate && !inst->saturate && - (inst->opcode == BRW_OPCODE_MOV || - (inst->opcode == BRW_OPCODE_SEL && - inst->dst.type == inst->src[1].type && - inst->predicate != BRW_PREDICATE_NONE && - !inst->src[1].abs && !inst->src[1].negate)); -} - -static bool try_copy_propagate(const struct brw_device_info *devinfo, vec4_instruction *inst, int arg, struct copy_entry *entry) @@ -325,7 +313,7 @@ try_copy_propagate(const struct brw_device_info *devinfo, if (has_source_modifiers && value.type != inst->src[arg].type && - !can_change_source_types(inst)) + !inst->can_change_types()) return false; if (has_source_modifiers && @@ -394,7 +382,7 @@ 
try_copy_propagate(const struct brw_device_info *devinfo, value.swizzle = composed_swizzle; if (has_source_modifiers && value.type != inst->src[arg].type) { - assert(can_change_source_types(inst)); + assert(inst->can_change_types()); for (int i = 0; i < 3; i++) { inst->src[i].type = value.type; } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index dcacc900540..a84f6c47471 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -21,6 +21,7 @@ */ #include <ctype.h> +#include "glsl/glsl_parser_extras.h" #include "brw_vec4.h" #include "brw_cfg.h" @@ -137,15 +138,13 @@ vec4_instruction::get_src(const struct brw_vue_prog_data *prog_data, int i) vec4_generator::vec4_generator(const struct brw_compiler *compiler, void *log_data, - struct gl_shader_program *shader_prog, - struct gl_program *prog, struct brw_vue_prog_data *prog_data, void *mem_ctx, bool debug_flag, const char *stage_name, const char *stage_abbrev) : compiler(compiler), log_data(log_data), devinfo(compiler->devinfo), - shader_prog(shader_prog), prog(prog), prog_data(prog_data), + prog_data(prog_data), mem_ctx(mem_ctx), stage_name(stage_name), stage_abbrev(stage_abbrev), debug_flag(debug_flag) { @@ -1142,7 +1141,7 @@ vec4_generator::generate_set_simd4x2_header_gen9(vec4_instruction *inst, } void -vec4_generator::generate_code(const cfg_t *cfg) +vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir) { struct annotation_info annotation; memset(&annotation, 0, sizeof(annotation)); @@ -1648,14 +1647,10 @@ vec4_generator::generate_code(const cfg_t *cfg) int after_size = p->next_insn_offset; if (unlikely(debug_flag)) { - if (shader_prog) { - fprintf(stderr, "Native code for %s %s shader %d:\n", - shader_prog->Label ? 
shader_prog->Label : "unnamed", - stage_name, shader_prog->Name); - } else { - fprintf(stderr, "Native code for %s program %d:\n", stage_name, - prog->Id); - } + fprintf(stderr, "Native code for %s %s shader %s:\n", + nir->info.label ? nir->info.label : "unnamed", + _mesa_shader_stage_to_string(nir->stage), nir->info.name); + fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. Compacted %d to %d" " bytes (%.0f%%)\n", stage_abbrev, @@ -1663,7 +1658,7 @@ vec4_generator::generate_code(const cfg_t *cfg) 100.0f * (before_size - after_size) / before_size); dump_assembly(p->store, annotation.ann_count, annotation.ann, - p->devinfo, prog); + p->devinfo); ralloc_free(annotation.ann); } @@ -1676,10 +1671,11 @@ vec4_generator::generate_code(const cfg_t *cfg) const unsigned * vec4_generator::generate_assembly(const cfg_t *cfg, - unsigned *assembly_size) + unsigned *assembly_size, + const nir_shader *nir) { brw_set_default_access_mode(p, BRW_ALIGN_16); - generate_code(cfg); + generate_code(cfg, nir); return brw_get_program(p, assembly_size); } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp index 4ce471e0669..a715cf5a6cb 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp @@ -30,14 +30,12 @@ #include "brw_vec4_gs_visitor.h" #include "gen6_gs_visitor.h" -const unsigned MAX_GS_INPUT_VERTICES = 6; - namespace brw { vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler, void *log_data, struct brw_gs_compile *c, - nir_shader *shader, + const nir_shader *shader, void *mem_ctx, bool no_spills, int shader_time_index) @@ -598,32 +596,17 @@ vec4_gs_visitor::gs_end_primitive() emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask)); } -static const unsigned * -generate_assembly(struct brw_context *brw, - struct gl_shader_program *shader_prog, - struct gl_program *prog, - struct brw_vue_prog_data *prog_data, - void 
*mem_ctx, - const cfg_t *cfg, - unsigned *final_assembly_size) -{ - vec4_generator g(brw->intelScreen->compiler, brw, - shader_prog, prog, prog_data, mem_ctx, - INTEL_DEBUG & DEBUG_GS, "geometry", "GS"); - return g.generate_assembly(cfg, final_assembly_size); -} - extern "C" const unsigned * -brw_gs_emit(struct brw_context *brw, - struct gl_shader_program *prog, - struct brw_gs_compile *c, - void *mem_ctx, - int shader_time_index, - unsigned *final_assembly_size) +brw_compile_gs(const struct brw_compiler *compiler, void *log_data, + struct brw_gs_compile *c, + const nir_shader *shader, + struct gl_shader_program *shader_prog, + void *mem_ctx, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) { - struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY]; - - if (brw->gen >= 7) { + if (compiler->devinfo->gen >= 7) { /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do * so without spilling. If the GS invocations count > 1, then we can't use * dual object mode. @@ -632,13 +615,12 @@ brw_gs_emit(struct brw_context *brw, likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) { c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; - vec4_gs_visitor v(brw->intelScreen->compiler, brw, - c, shader->Program->nir, + vec4_gs_visitor v(compiler, log_data, c, shader, mem_ctx, true /* no_spills */, shader_time_index); if (v.run()) { - return generate_assembly(brw, prog, &c->gp->program.Base, - &c->prog_data.base, mem_ctx, v.cfg, - final_assembly_size); + vec4_generator g(compiler, log_data, &c->prog_data.base, mem_ctx, + INTEL_DEBUG & DEBUG_GS, "geometry", "GS"); + return g.generate_assembly(v.cfg, final_assembly_size, shader); } } } @@ -666,7 +648,7 @@ brw_gs_emit(struct brw_context *brw, * mode is more performant when invocations > 1. Gen6 only supports * SINGLE mode. 
*/ - if (c->prog_data.invocations <= 1 || brw->gen < 7) + if (c->prog_data.invocations <= 1 || compiler->devinfo->gen < 7) c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE; else c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE; @@ -674,24 +656,22 @@ brw_gs_emit(struct brw_context *brw, vec4_gs_visitor *gs = NULL; const unsigned *ret = NULL; - if (brw->gen >= 7) - gs = new vec4_gs_visitor(brw->intelScreen->compiler, brw, - c, shader->Program->nir, + if (compiler->devinfo->gen >= 7) + gs = new vec4_gs_visitor(compiler, log_data, c, shader, mem_ctx, false /* no_spills */, shader_time_index); else - gs = new gen6_gs_visitor(brw->intelScreen->compiler, brw, - c, prog, shader->Program->nir, + gs = new gen6_gs_visitor(compiler, log_data, c, shader_prog, shader, mem_ctx, false /* no_spills */, shader_time_index); if (!gs->run()) { - prog->LinkStatus = false; - ralloc_strcat(&prog->InfoLog, gs->fail_msg); + if (error_str) + *error_str = ralloc_strdup(mem_ctx, gs->fail_msg); } else { - ret = generate_assembly(brw, prog, &c->gp->program.Base, - &c->prog_data.base, mem_ctx, gs->cfg, - final_assembly_size); + vec4_generator g(compiler, log_data, &c->prog_data.base, mem_ctx, + INTEL_DEBUG & DEBUG_GS, "geometry", "GS"); + ret = g.generate_assembly(gs->cfg, final_assembly_size, shader); } delete gs; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h index 3ff195c3e68..c52552768c8 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h @@ -32,36 +32,6 @@ #include "brw_vec4.h" -/** - * Scratch data used when compiling a GLSL geometry shader. 
- */ -struct brw_gs_compile -{ - struct brw_gs_prog_key key; - struct brw_gs_prog_data prog_data; - struct brw_vue_map input_vue_map; - - struct brw_geometry_program *gp; - - unsigned control_data_bits_per_vertex; - unsigned control_data_header_size_bits; -}; - -#ifdef __cplusplus -extern "C" { -#endif - -const unsigned *brw_gs_emit(struct brw_context *brw, - struct gl_shader_program *prog, - struct brw_gs_compile *c, - void *mem_ctx, - int shader_time_index, - unsigned *final_assembly_size); - -#ifdef __cplusplus -} /* extern "C" */ -#endif - #ifdef __cplusplus namespace brw { @@ -71,7 +41,7 @@ public: vec4_gs_visitor(const struct brw_compiler *compiler, void *log_data, struct brw_gs_compile *c, - nir_shader *shader, + const nir_shader *shader, void *mem_ctx, bool no_spills, int shader_time_index); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp index cc688ef8083..678237901f2 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp @@ -291,15 +291,15 @@ vec4_visitor::calculate_live_intervals() struct block_data *bd = &live_intervals->block_data[block->num]; for (int i = 0; i < live_intervals->num_vars; i++) { - if (BITSET_TEST(bd->livein, i)) { - start[i] = MIN2(start[i], block->start_ip); - end[i] = MAX2(end[i], block->start_ip); - } + if (BITSET_TEST(bd->livein, i)) { + start[i] = MIN2(start[i], block->start_ip); + end[i] = MAX2(end[i], block->start_ip); + } - if (BITSET_TEST(bd->liveout, i)) { - start[i] = MIN2(start[i], block->end_ip); - end[i] = MAX2(end[i], block->end_ip); - } + if (BITSET_TEST(bd->liveout, i)) { + start[i] = MIN2(start[i], block->end_ip); + end[i] = MAX2(end[i], block->end_ip); + } } } } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index 41bd80df377..ea1e3e7bbcf 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ 
b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -423,10 +423,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) case nir_intrinsic_get_buffer_size: { nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]); - unsigned ubo_index = const_uniform_block ? const_uniform_block->u[0] : 0; + unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0; - src_reg surf_index = src_reg(prog_data->base.binding_table.ubo_start + - ubo_index); + src_reg surf_index = src_reg(prog_data->base.binding_table.ssbo_start + + ssbo_index); dst_reg result_dst = get_nir_dest(instr->dest); vec4_instruction *inst = new(mem_ctx) vec4_instruction(VS_OPCODE_GET_BUFFER_SIZE, result_dst); @@ -456,18 +456,18 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[1]); if (const_uniform_block) { - unsigned index = prog_data->base.binding_table.ubo_start + + unsigned index = prog_data->base.binding_table.ssbo_start + const_uniform_block->u[0]; surf_index = src_reg(index); brw_mark_surface_used(&prog_data->base, index); } else { surf_index = src_reg(this, glsl_type::uint_type); emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[1], 1), - src_reg(prog_data->base.binding_table.ubo_start))); + src_reg(prog_data->base.binding_table.ssbo_start))); surf_index = emit_uniformize(surf_index); brw_mark_surface_used(&prog_data->base, - prog_data->base.binding_table.ubo_start + + prog_data->base.binding_table.ssbo_start + nir->info.num_ssbos - 1); } @@ -599,7 +599,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) src_reg surf_index; if (const_uniform_block) { - unsigned index = prog_data->base.binding_table.ubo_start + + unsigned index = prog_data->base.binding_table.ssbo_start + const_uniform_block->u[0]; surf_index = src_reg(index); @@ -607,14 +607,14 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) } else { surf_index = src_reg(this, 
glsl_type::uint_type); emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], 1), - src_reg(prog_data->base.binding_table.ubo_start))); + src_reg(prog_data->base.binding_table.ssbo_start))); surf_index = emit_uniformize(surf_index); /* Assume this may touch any UBO. It would be nice to provide * a tighter bound, but the array information is already lowered away. */ brw_mark_surface_used(&prog_data->base, - prog_data->base.binding_table.ubo_start + + prog_data->base.binding_table.ssbo_start + nir->info.num_ssbos - 1); } @@ -645,17 +645,17 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) case nir_intrinsic_ssbo_atomic_add: nir_emit_ssbo_atomic(BRW_AOP_ADD, instr); break; - case nir_intrinsic_ssbo_atomic_min: - if (dest.type == BRW_REGISTER_TYPE_D) - nir_emit_ssbo_atomic(BRW_AOP_IMIN, instr); - else - nir_emit_ssbo_atomic(BRW_AOP_UMIN, instr); + case nir_intrinsic_ssbo_atomic_imin: + nir_emit_ssbo_atomic(BRW_AOP_IMIN, instr); + break; + case nir_intrinsic_ssbo_atomic_umin: + nir_emit_ssbo_atomic(BRW_AOP_UMIN, instr); break; - case nir_intrinsic_ssbo_atomic_max: - if (dest.type == BRW_REGISTER_TYPE_D) - nir_emit_ssbo_atomic(BRW_AOP_IMAX, instr); - else - nir_emit_ssbo_atomic(BRW_AOP_UMAX, instr); + case nir_intrinsic_ssbo_atomic_imax: + nir_emit_ssbo_atomic(BRW_AOP_IMAX, instr); + break; + case nir_intrinsic_ssbo_atomic_umax: + nir_emit_ssbo_atomic(BRW_AOP_UMAX, instr); break; case nir_intrinsic_ssbo_atomic_and: nir_emit_ssbo_atomic(BRW_AOP_AND, instr); @@ -765,7 +765,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) */ brw_mark_surface_used(&prog_data->base, prog_data->base.binding_table.ubo_start + - nir->info.num_ssbos - 1); + nir->info.num_ubos - 1); } unsigned const_offset = instr->const_index[0]; @@ -821,20 +821,20 @@ vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr) src_reg surface; nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]); if (const_surface) { - unsigned surf_index = 
prog_data->base.binding_table.ubo_start + + unsigned surf_index = prog_data->base.binding_table.ssbo_start + const_surface->u[0]; surface = src_reg(surf_index); brw_mark_surface_used(&prog_data->base, surf_index); } else { surface = src_reg(this, glsl_type::uint_type); emit(ADD(dst_reg(surface), get_nir_src(instr->src[0]), - src_reg(prog_data->base.binding_table.ubo_start))); + src_reg(prog_data->base.binding_table.ssbo_start))); /* Assume this may touch any UBO. This is the same we do for other * UBO/SSBO accesses with non-constant surface. */ brw_mark_surface_used(&prog_data->base, - prog_data->base.binding_table.ubo_start + + prog_data->base.binding_table.ssbo_start + nir->info.num_ssbos - 1); } @@ -1237,14 +1237,8 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) break; case nir_op_b2i: - emit(AND(dst, op[0], src_reg(1))); - break; - case nir_op_b2f: - op[0].type = BRW_REGISTER_TYPE_D; - dst.type = BRW_REGISTER_TYPE_D; - emit(AND(dst, op[0], src_reg(0x3f800000u))); - dst.type = BRW_REGISTER_TYPE_F; + emit(MOV(dst, negate(op[0]))); break; case nir_op_f2b: diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 98ea9be6ee4..5be9c6a6b2d 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -1815,7 +1815,7 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler, void *log_data, const struct brw_sampler_prog_key_data *key_tex, struct brw_vue_prog_data *prog_data, - nir_shader *shader, + const nir_shader *shader, void *mem_ctx, bool no_spills, int shader_time_index) diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp index b6e1971c2ee..485a80ee2fc 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp @@ -301,7 +301,7 @@ vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler, void *log_data, const 
struct brw_vs_prog_key *key, struct brw_vs_prog_data *vs_prog_data, - nir_shader *shader, + const nir_shader *shader, gl_clip_plane *clip_planes, void *mem_ctx, int shader_time_index, diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 38de98fab86..ba680a98f7e 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -31,6 +31,7 @@ #include "main/compiler.h" +#include "main/context.h" #include "brw_context.h" #include "brw_vs.h" #include "brw_util.h" @@ -57,18 +58,6 @@ brw_codegen_vs_prog(struct brw_context *brw, bool start_busy = false; double start_time = 0; - if (!vp->program.Base.nir) { - /* Normally we generate NIR in LinkShader() or - * ProgramStringNotify(), but Mesa's fixed-function vertex program - * handling doesn't notify the driver at all. Just do it here, at - * the last minute, even though it's lame. - */ - assert(vp->program.Base.Id == 0 && prog == NULL); - vp->program.Base.nir = - brw_create_nir(brw, NULL, &vp->program.Base, MESA_SHADER_VERTEX, - brw->intelScreen->compiler->scalar_vs); - } - if (prog) vs = (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX]; @@ -171,7 +160,7 @@ brw_codegen_vs_prog(struct brw_context *brw, } if (unlikely(INTEL_DEBUG & DEBUG_VS)) - brw_dump_ir("vertex", prog, &vs->base, &vp->program.Base); + brw_dump_ir("vertex", prog, vs ? &vs->base : NULL, &vp->program.Base); int st_index = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) @@ -179,9 +168,20 @@ brw_codegen_vs_prog(struct brw_context *brw, /* Emit GEN4 code. 
*/ - program = brw_vs_emit(brw, mem_ctx, key, &prog_data, - &vp->program, prog, st_index, &program_size); + char *error_str; + program = brw_compile_vs(brw->intelScreen->compiler, brw, mem_ctx, key, + &prog_data, vp->program.Base.nir, + brw_select_clip_planes(&brw->ctx), + !_mesa_is_gles3(&brw->ctx), + st_index, &program_size, &error_str); if (program == NULL) { + if (prog) { + prog->LinkStatus = false; + ralloc_strcat(&prog->InfoLog, error_str); + } + + _mesa_problem(NULL, "Failed to compile vertex shader: %s\n", error_str); + ralloc_free(mem_ctx); return false; } diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h index f1242f61b33..bcb5e7b0b2a 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.h +++ b/src/mesa/drivers/dri/i965/brw_vs.h @@ -54,14 +54,6 @@ extern "C" { #endif -const unsigned *brw_vs_emit(struct brw_context *brw, - void *mem_ctx, - const struct brw_vs_prog_key *key, - struct brw_vs_prog_data *prog_data, - struct gl_vertex_program *vp, - struct gl_shader_program *shader_prog, - int shader_time_index, - unsigned *program_size); void brw_vs_debug_recompile(struct brw_context *brw, struct gl_shader_program *prog, const struct brw_vs_prog_key *key); @@ -88,7 +80,7 @@ public: void *log_data, const struct brw_vs_prog_key *key, struct brw_vs_prog_data *vs_prog_data, - nir_shader *shader, + const nir_shader *shader, gl_clip_plane *clip_planes, void *mem_ctx, int shader_time_index, diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c index 9bb48eb2e27..f65258a52a5 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c @@ -201,7 +201,7 @@ brw_upload_vs_image_surfaces(struct brw_context *brw) ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX]; if (prog) { - /* BRW_NEW_VS_PROG_DATA, BRW_NEW_IMAGE_UNITS */ + /* BRW_NEW_VS_PROG_DATA, BRW_NEW_IMAGE_UNITS, _NEW_TEXTURE */ brw_upload_image_surfaces(brw, 
prog->_LinkedShaders[MESA_SHADER_VERTEX], &brw->vs.base, &brw->vs.prog_data->base.base); } @@ -209,6 +209,7 @@ brw_upload_vs_image_surfaces(struct brw_context *brw) const struct brw_tracked_state brw_vs_image_surfaces = { .dirty = { + .mesa = _NEW_TEXTURE, .brw = BRW_NEW_BATCH | BRW_NEW_IMAGE_UNITS | BRW_NEW_VERTEX_PROGRAM | diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index 4d5e7f67bd6..5c49db9e63e 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -39,89 +39,6 @@ #include "util/ralloc.h" -/** - * Return a bitfield where bit n is set if barycentric interpolation mode n - * (see enum brw_wm_barycentric_interp_mode) is needed by the fragment shader. - */ -static unsigned -brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo, - bool shade_model_flat, - bool persample_shading, - nir_shader *shader) -{ - unsigned barycentric_interp_modes = 0; - - nir_foreach_variable(var, &shader->inputs) { - enum glsl_interp_qualifier interp_qualifier = var->data.interpolation; - bool is_centroid = var->data.centroid && !persample_shading; - bool is_sample = var->data.sample || persample_shading; - bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) || - (var->data.location == VARYING_SLOT_COL1); - - /* Ignore WPOS and FACE, because they don't require interpolation. */ - if (var->data.location == VARYING_SLOT_POS || - var->data.location == VARYING_SLOT_FACE) - continue; - - /* Determine the set (or sets) of barycentric coordinates needed to - * interpolate this variable. Note that when - * brw->needs_unlit_centroid_workaround is set, centroid interpolation - * uses PIXEL interpolation for unlit pixels and CENTROID interpolation - * for lit pixels, so we need both sets of barycentric coordinates. 
- */ - if (interp_qualifier == INTERP_QUALIFIER_NOPERSPECTIVE) { - if (is_centroid) { - barycentric_interp_modes |= - 1 << BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC; - } else if (is_sample) { - barycentric_interp_modes |= - 1 << BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC; - } - if ((!is_centroid && !is_sample) || - devinfo->needs_unlit_centroid_workaround) { - barycentric_interp_modes |= - 1 << BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC; - } - } else if (interp_qualifier == INTERP_QUALIFIER_SMOOTH || - (!(shade_model_flat && is_gl_Color) && - interp_qualifier == INTERP_QUALIFIER_NONE)) { - if (is_centroid) { - barycentric_interp_modes |= - 1 << BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC; - } else if (is_sample) { - barycentric_interp_modes |= - 1 << BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC; - } - if ((!is_centroid && !is_sample) || - devinfo->needs_unlit_centroid_workaround) { - barycentric_interp_modes |= - 1 << BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; - } - } - } - - return barycentric_interp_modes; -} - -static uint8_t -computed_depth_mode(struct gl_fragment_program *fp) -{ - if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { - switch (fp->FragDepthLayout) { - case FRAG_DEPTH_LAYOUT_NONE: - case FRAG_DEPTH_LAYOUT_ANY: - return BRW_PSCDEPTH_ON; - case FRAG_DEPTH_LAYOUT_GREATER: - return BRW_PSCDEPTH_ON_GE; - case FRAG_DEPTH_LAYOUT_LESS: - return BRW_PSCDEPTH_ON_LE; - case FRAG_DEPTH_LAYOUT_UNCHANGED: - return BRW_PSCDEPTH_OFF; - } - } - return BRW_PSCDEPTH_OFF; -} - static void assign_fs_binding_table_offsets(const struct brw_device_info *devinfo, const struct gl_shader_program *shader_prog, @@ -166,15 +83,6 @@ brw_codegen_wm_prog(struct brw_context *brw, fs = (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; memset(&prog_data, 0, sizeof(prog_data)); - /* key->alpha_test_func means simulating alpha testing via discards, - * so the shader definitely kills pixels. 
- */ - prog_data.uses_kill = fp->program.UsesKill || key->alpha_test_func; - prog_data.uses_omask = - fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK); - prog_data.computed_depth_mode = computed_depth_mode(&fp->program); - - prog_data.early_fragment_tests = fs && fs->base.EarlyFragmentTests; /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */ if (!prog) @@ -209,12 +117,6 @@ brw_codegen_wm_prog(struct brw_context *brw, &prog_data.base); } - prog_data.barycentric_interp_modes = - brw_compute_barycentric_interp_modes(brw->intelScreen->devinfo, - key->flat_shade, - key->persample_shading, - fp->program.Base.nir); - if (unlikely(brw->perf_debug)) { start_busy = (brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo)); @@ -222,7 +124,7 @@ brw_codegen_wm_prog(struct brw_context *brw, } if (unlikely(INTEL_DEBUG & DEBUG_WM)) - brw_dump_ir("fragment", prog, &fs->base, &fp->program.Base); + brw_dump_ir("fragment", prog, fs ? &fs->base : NULL, &fp->program.Base); int st_index8 = -1, st_index16 = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) { @@ -230,9 +132,19 @@ brw_codegen_wm_prog(struct brw_context *brw, st_index16 = brw_get_shader_time_index(brw, prog, &fp->program.Base, ST_FS16); } - program = brw_wm_fs_emit(brw, mem_ctx, key, &prog_data, - &fp->program, prog, st_index8, st_index16, &program_size); + char *error_str = NULL; + program = brw_compile_fs(brw->intelScreen->compiler, brw, mem_ctx, + key, &prog_data, fp->program.Base.nir, + &fp->program.Base, st_index8, st_index16, + brw->use_rep_send, &program_size, &error_str); if (program == NULL) { + if (prog) { + prog->LinkStatus = false; + ralloc_strcat(&prog->InfoLog, error_str); + } + + _mesa_problem(NULL, "Failed to compile fragment shader: %s\n", error_str); + ralloc_free(mem_ctx); return false; } diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h index 6ee22b2f907..53a642ee8bb 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.h +++ 
b/src/mesa/drivers/dri/i965/brw_wm.h @@ -61,21 +61,6 @@ extern "C" { #endif -/** - * Compile a fragment shader. - * - * Returns the final assembly and the program's size. - */ -const unsigned *brw_wm_fs_emit(struct brw_context *brw, - void *mem_ctx, - const struct brw_wm_prog_key *key, - struct brw_wm_prog_data *prog_data, - struct gl_fragment_program *fp, - struct gl_shader_program *prog, - int shader_time_index8, - int shader_time_index16, - unsigned *final_assembly_size); - GLboolean brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog); struct gl_shader *brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type); diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index c671e23827e..6ebe6481c32 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -34,6 +34,7 @@ #include "main/blend.h" #include "main/mtypes.h" #include "main/samplerobj.h" +#include "main/shaderimage.h" #include "program/prog_parameter.h" #include "main/framebuffer.h" @@ -925,54 +926,53 @@ brw_upload_ubo_surfaces(struct brw_context *brw, if (!shader) return; - uint32_t *surf_offsets = + uint32_t *ubo_surf_offsets = &stage_state->surf_offset[prog_data->binding_table.ubo_start]; for (int i = 0; i < shader->NumUniformBlocks; i++) { - struct intel_buffer_object *intel_bo; + struct gl_uniform_buffer_binding *binding = + &ctx->UniformBufferBindings[shader->UniformBlocks[i]->Binding]; - /* Because behavior for referencing outside of the binding's size in the - * glBindBufferRange case is undefined, we can just bind the whole buffer - * glBindBufferBase wants and be a correct implementation. 
- */ - if (!shader->UniformBlocks[i].IsShaderStorage) { - struct gl_uniform_buffer_binding *binding; - binding = - &ctx->UniformBufferBindings[shader->UniformBlocks[i].Binding]; - if (binding->BufferObject == ctx->Shared->NullBufferObj) { - brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, &surf_offsets[i]); - } else { - intel_bo = intel_buffer_object(binding->BufferObject); - drm_intel_bo *bo = - intel_bufferobj_buffer(brw, intel_bo, - binding->Offset, - binding->BufferObject->Size - binding->Offset); - brw_create_constant_surface(brw, bo, binding->Offset, - binding->BufferObject->Size - binding->Offset, - &surf_offsets[i], - dword_pitch); - } + if (binding->BufferObject == ctx->Shared->NullBufferObj) { + brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, &ubo_surf_offsets[i]); } else { - struct gl_shader_storage_buffer_binding *binding; - binding = - &ctx->ShaderStorageBufferBindings[shader->UniformBlocks[i].Binding]; - if (binding->BufferObject == ctx->Shared->NullBufferObj) { - brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, &surf_offsets[i]); - } else { - intel_bo = intel_buffer_object(binding->BufferObject); - drm_intel_bo *bo = - intel_bufferobj_buffer(brw, intel_bo, - binding->Offset, - binding->BufferObject->Size - binding->Offset); - brw_create_buffer_surface(brw, bo, binding->Offset, - binding->BufferObject->Size - binding->Offset, - &surf_offsets[i], - dword_pitch); - } + struct intel_buffer_object *intel_bo = + intel_buffer_object(binding->BufferObject); + drm_intel_bo *bo = + intel_bufferobj_buffer(brw, intel_bo, + binding->Offset, + binding->BufferObject->Size - binding->Offset); + brw_create_constant_surface(brw, bo, binding->Offset, + binding->BufferObject->Size - binding->Offset, + &ubo_surf_offsets[i], + dword_pitch); + } + } + + uint32_t *ssbo_surf_offsets = + &stage_state->surf_offset[prog_data->binding_table.ssbo_start]; + + for (int i = 0; i < shader->NumShaderStorageBlocks; i++) { + struct gl_shader_storage_buffer_binding *binding = + 
&ctx->ShaderStorageBufferBindings[shader->ShaderStorageBlocks[i]->Binding]; + + if (binding->BufferObject == ctx->Shared->NullBufferObj) { + brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, &ssbo_surf_offsets[i]); + } else { + struct intel_buffer_object *intel_bo = + intel_buffer_object(binding->BufferObject); + drm_intel_bo *bo = + intel_bufferobj_buffer(brw, intel_bo, + binding->Offset, + binding->BufferObject->Size - binding->Offset); + brw_create_buffer_surface(brw, bo, binding->Offset, + binding->BufferObject->Size - binding->Offset, + &ssbo_surf_offsets[i], + dword_pitch); } } - if (shader->NumUniformBlocks) + if (shader->NumUniformBlocks || shader->NumShaderStorageBlocks) brw->ctx.NewDriverState |= BRW_NEW_SURFACES; } @@ -1112,7 +1112,7 @@ brw_upload_cs_image_surfaces(struct brw_context *brw) ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE]; if (prog) { - /* BRW_NEW_CS_PROG_DATA, BRW_NEW_IMAGE_UNITS */ + /* BRW_NEW_CS_PROG_DATA, BRW_NEW_IMAGE_UNITS, _NEW_TEXTURE */ brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_COMPUTE], &brw->cs.base, &brw->cs.prog_data->base); } @@ -1120,7 +1120,7 @@ brw_upload_cs_image_surfaces(struct brw_context *brw) const struct brw_tracked_state brw_cs_image_surfaces = { .dirty = { - .mesa = _NEW_PROGRAM, + .mesa = _NEW_TEXTURE | _NEW_PROGRAM, .brw = BRW_NEW_BATCH | BRW_NEW_CS_PROG_DATA | BRW_NEW_IMAGE_UNITS @@ -1253,7 +1253,7 @@ update_image_surface(struct brw_context *brw, uint32_t *surf_offset, struct brw_image_param *param) { - if (u->_Valid) { + if (_mesa_is_image_unit_valid(&brw->ctx, u)) { struct gl_texture_object *obj = u->TexObj; const unsigned format = get_image_format(brw, u->_ActualFormat, access); @@ -1338,7 +1338,7 @@ brw_upload_wm_image_surfaces(struct brw_context *brw) struct gl_shader_program *prog = ctx->Shader._CurrentFragmentProgram; if (prog) { - /* BRW_NEW_FS_PROG_DATA, BRW_NEW_IMAGE_UNITS */ + /* BRW_NEW_FS_PROG_DATA, BRW_NEW_IMAGE_UNITS, _NEW_TEXTURE */ brw_upload_image_surfaces(brw, 
prog->_LinkedShaders[MESA_SHADER_FRAGMENT], &brw->wm.base, &brw->wm.prog_data->base); } @@ -1346,6 +1346,7 @@ brw_upload_wm_image_surfaces(struct brw_context *brw) const struct brw_tracked_state brw_wm_image_surfaces = { .dirty = { + .mesa = _NEW_TEXTURE, .brw = BRW_NEW_BATCH | BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp index 59a76559103..671a535a5bd 100644 --- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp @@ -31,8 +31,6 @@ #include "gen6_gs_visitor.h" -const unsigned MAX_GS_INPUT_VERTICES = 6; - namespace brw { void diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h index e75d6aa10b8..d02c67d8a74 100644 --- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h +++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h @@ -39,7 +39,7 @@ public: void *log_data, struct brw_gs_compile *c, struct gl_shader_program *prog, - nir_shader *shader, + const nir_shader *shader, void *mem_ctx, bool no_spills, int shader_time_index) : diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c index 497ecec8e45..8d6d3fe1d34 100644 --- a/src/mesa/drivers/dri/i965/gen7_gs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c @@ -59,9 +59,7 @@ upload_gs_state(struct brw_context *brw) OUT_BATCH(((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_GS_SAMPLER_COUNT_SHIFT) | ((brw->gs.prog_data->base.base.binding_table.size_bytes / 4) << - GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | - (brw->is_haswell && prog_data->base.nr_image_params ? 
- HSW_GS_UAV_ACCESS_ENABLE : 0)); + GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); if (brw->gs.prog_data->base.base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c index b7e48585482..a18dc697651 100644 --- a/src/mesa/drivers/dri/i965/gen7_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c @@ -126,9 +126,7 @@ upload_vs_state(struct brw_context *brw) ((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_VS_SAMPLER_COUNT_SHIFT) | ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) << - GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | - (brw->is_haswell && prog_data->base.nr_image_params ? - HSW_VS_UAV_ACCESS_ENABLE : 0)); + GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); if (prog_data->base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c index fd6dab5be8b..06d5e65786b 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c @@ -113,7 +113,14 @@ upload_wm_state(struct brw_context *brw) else if (prog_data->base.nr_image_params) dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC; - /* _NEW_BUFFERS | _NEW_COLOR */ + /* The "UAV access enable" bits are unnecessary on HSW because they only + * seem to have an effect on the HW-assisted coherency mechanism which we + * don't need, and the rasterization-related UAV_ONLY flag and the + * DISPATCH_ENABLE bit can be set independently from it. + * C.f. gen8_upload_ps_extra(). 
+ * + * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS | _NEW_COLOR + */ if (brw->is_haswell && !(brw_color_buffer_write_enabled(brw) || writes_depth) && prog_data->base.nr_image_params) @@ -221,9 +228,6 @@ gen7_upload_ps_state(struct brw_context *brw, _mesa_get_min_invocations_per_fragment(ctx, fp, false); assert(min_inv_per_frag >= 1); - if (brw->is_haswell && prog_data->base.nr_image_params) - dw4 |= HSW_PS_UAV_ACCESS_ENABLE; - if (prog_data->prog_offset_16 || prog_data->no_8) { dw4 |= GEN7_PS_16_DISPATCH_ENABLE; if (!prog_data->no_8 && min_inv_per_frag == 1) { diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c index 4195f4cf4a7..d766ca7bebf 100644 --- a/src/mesa/drivers/dri/i965/gen8_gs_state.c +++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c @@ -52,9 +52,7 @@ gen8_upload_gs_state(struct brw_context *brw) ((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_GS_SAMPLER_COUNT_SHIFT) | ((prog_data->base.binding_table.size_bytes / 4) << - GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | - (prog_data->base.nr_image_params ? 
- HSW_GS_UAV_ACCESS_ENABLE : 0)); + GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); if (brw->gs.prog_data->base.base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index a686fed704f..8f0507413a7 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -25,6 +25,7 @@ #include "program/program.h" #include "brw_state.h" #include "brw_defines.h" +#include "brw_wm.h" #include "intel_batchbuffer.h" void @@ -65,8 +66,33 @@ gen8_upload_ps_extra(struct brw_context *brw, if (brw->gen >= 9 && prog_data->pulls_bary) dw1 |= GEN9_PSX_SHADER_PULLS_BARY; - if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx) || - prog_data->base.nr_image_params) + /* The stricter cross-primitive coherency guarantees that the hardware + * gives us with the "Accesses UAV" bit set for at least one shader stage + * and the "UAV coherency required" bit set on the 3DPRIMITIVE command are + * redundant within the current image, atomic counter and SSBO GL APIs, + * which all have very loose ordering and coherency requirements and + * generally rely on the application to insert explicit barriers when a + * shader invocation is expected to see the memory writes performed by the + * invocations of some previous primitive. Regardless of the value of "UAV + * coherency required", the "Accesses UAV" bits will implicitly cause an in + * most cases useless DC flush when the lowermost stage with the bit set + * finishes execution. + * + * It would be nice to disable it, but in some cases we can't because on + * Gen8+ it also has an influence on rasterization via the PS UAV-only + * signal (which could be set independently from the coherency mechanism in + * the 3DSTATE_WM command on Gen7), and because in some cases it will + * determine whether the hardware skips execution of the fragment shader or + * not via the ThreadDispatchEnable signal. 
However if we know that + * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and + * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any + * difference so we may just disable it here. + * + * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR + */ + if ((_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx) || + prog_data->base.nr_image_params) && + !brw_color_buffer_write_enabled(brw)) dw1 |= GEN8_PSX_SHADER_HAS_UAV; BEGIN_BATCH(2); @@ -91,7 +117,7 @@ upload_ps_extra(struct brw_context *brw) const struct brw_tracked_state gen8_ps_extra = { .dirty = { - .mesa = 0, + .mesa = _NEW_BUFFERS | _NEW_COLOR, .brw = BRW_NEW_CONTEXT | BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c index e1e7704655d..18b86652fd2 100644 --- a/src/mesa/drivers/dri/i965/gen8_surface_state.c +++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c @@ -221,8 +221,8 @@ gen8_emit_texture_surface_state(struct brw_context *brw, * "When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E, HALIGN * 16 must be used." */ - assert(brw->gen < 9 || mt->halign == 16); - assert(brw->gen < 8 || mt->num_samples > 1 || mt->halign == 16); + if (brw->gen >= 9 || mt->num_samples == 1) + assert(mt->halign == 16); } const uint32_t surf_type = translate_tex_target(target); @@ -470,8 +470,8 @@ gen8_update_renderbuffer_surface(struct brw_context *brw, * "When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E, HALIGN * 16 must be used." 
*/ - assert(brw->gen < 9 || mt->halign == 16); - assert(brw->gen < 8 || mt->num_samples > 1 || mt->halign == 16); + if (brw->gen >= 9 || mt->num_samples == 1) + assert(mt->halign == 16); } uint32_t *surf = allocate_surface_state(brw, &offset, surf_index); diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c b/src/mesa/drivers/dri/i965/gen8_vs_state.c index 8b5048bee7e..28f5adddf14 100644 --- a/src/mesa/drivers/dri/i965/gen8_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c @@ -53,9 +53,7 @@ upload_vs_state(struct brw_context *brw) ((ALIGN(stage_state->sampler_count, 4) / 4) << GEN6_VS_SAMPLER_COUNT_SHIFT) | ((prog_data->base.binding_table.size_bytes / 4) << - GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | - (prog_data->base.nr_image_params ? - HSW_VS_UAV_ACCESS_ENABLE : 0)); + GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); if (prog_data->base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, diff --git a/src/mesa/drivers/dri/i965/intel_asm_annotation.c b/src/mesa/drivers/dri/i965/intel_asm_annotation.c index bb8bb8d38c9..b3d6324a5fe 100644 --- a/src/mesa/drivers/dri/i965/intel_asm_annotation.c +++ b/src/mesa/drivers/dri/i965/intel_asm_annotation.c @@ -33,8 +33,7 @@ void dump_assembly(void *assembly, int num_annotations, struct annotation *annotation, - const struct brw_device_info *devinfo, - const struct gl_program *prog) + const struct brw_device_info *devinfo) { const char *last_annotation_string = NULL; const void *last_annotation_ir = NULL; @@ -57,19 +56,7 @@ dump_assembly(void *assembly, int num_annotations, struct annotation *annotation last_annotation_ir = annotation[i].ir; if (last_annotation_ir) { fprintf(stderr, " "); - if (prog->nir) - nir_print_instr(annotation[i].ir, stderr); - else if (!prog->Instructions) - fprint_ir(stderr, annotation[i].ir); - else { - const struct prog_instruction *pi = - (const struct prog_instruction *)annotation[i].ir; - fprintf(stderr, "%d: ", - (int)(pi - prog->Instructions)); - _mesa_fprint_instruction_opt(stderr, - 
pi, - 0, PROG_PRINT_DEBUG, NULL); - } + nir_print_instr(annotation[i].ir, stderr); fprintf(stderr, "\n"); } } diff --git a/src/mesa/drivers/dri/i965/intel_asm_annotation.h b/src/mesa/drivers/dri/i965/intel_asm_annotation.h index d9c69bc41b0..6c72326f058 100644 --- a/src/mesa/drivers/dri/i965/intel_asm_annotation.h +++ b/src/mesa/drivers/dri/i965/intel_asm_annotation.h @@ -60,8 +60,7 @@ struct annotation_info { void dump_assembly(void *assembly, int num_annotations, struct annotation *annotation, - const struct brw_device_info *devinfo, - const struct gl_program *prog); + const struct brw_device_info *devinfo); void annotate(const struct brw_device_info *devinfo, diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c index a169c41790e..b6e35205727 100644 --- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c +++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c @@ -201,6 +201,14 @@ intel_miptree_supports_non_msrt_fast_clear(struct brw_context *brw, if (brw->gen < 7) return false; + if (brw->gen >= 9) { + /* FINISHME: Enable singlesample fast MCS clears on SKL after all GPU + * FINISHME: hangs are resolved. 
+ */ + perf_debug("singlesample fast MCS clears disabled on gen9"); + return false; + } + if (mt->disable_aux_buffers) return false; diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp index 8adb626d420..5f80f90a91d 100644 --- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp @@ -66,7 +66,7 @@ void cmod_propagation_test::SetUp() v = new cmod_propagation_fs_visitor(compiler, prog_data, shader); - _mesa_init_fragment_program(ctx, &fp->program, GL_FRAGMENT_SHADER, 0); + _mesa_init_gl_program(&fp->program.Base, GL_FRAGMENT_SHADER, 0); devinfo->gen = 4; } diff --git a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp index f77b18e7db8..32e8b8f8867 100644 --- a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp @@ -66,7 +66,7 @@ void saturate_propagation_test::SetUp() v = new saturate_propagation_fs_visitor(compiler, prog_data, shader); - _mesa_init_fragment_program(ctx, &fp->program, GL_FRAGMENT_SHADER, 0); + _mesa_init_gl_program(&fp->program.Base, GL_FRAGMENT_SHADER, 0); devinfo->gen = 4; } diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp index 40253961a65..e80b71b558d 100644 --- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp @@ -98,7 +98,7 @@ void copy_propagation_test::SetUp() v = new copy_propagation_vec4_visitor(compiler, shader); - _mesa_init_vertex_program(ctx, &vp->program, GL_VERTEX_SHADER, 0); + _mesa_init_gl_program(&vp->program.Base, GL_VERTEX_SHADER, 0); devinfo->gen = 4; } diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp index 
76028d36311..2f824617454 100644 --- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp +++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp @@ -101,7 +101,7 @@ void register_coalesce_test::SetUp() v = new register_coalesce_vec4_visitor(compiler, shader); - _mesa_init_vertex_program(ctx, &vp->program, GL_VERTEX_SHADER, 0); + _mesa_init_gl_program(&vp->program.Base, GL_VERTEX_SHADER, 0); devinfo->gen = 4; } diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c index d43eaf977fc..628c5708090 100644 --- a/src/mesa/drivers/dri/r200/r200_vertprog.c +++ b/src/mesa/drivers/dri/r200/r200_vertprog.c @@ -1200,18 +1200,19 @@ r200BindProgram(struct gl_context *ctx, GLenum target, struct gl_program *prog) static struct gl_program * r200NewProgram(struct gl_context *ctx, GLenum target, GLuint id) { - struct r200_vertex_program *vp; - switch(target){ - case GL_VERTEX_PROGRAM_ARB: - vp = CALLOC_STRUCT(r200_vertex_program); - return _mesa_init_vertex_program(ctx, &vp->mesa_program, target, id); - case GL_FRAGMENT_PROGRAM_ARB: - return _mesa_init_fragment_program( ctx, CALLOC_STRUCT(gl_fragment_program), target, id ); + case GL_VERTEX_PROGRAM_ARB: { + struct r200_vertex_program *vp = CALLOC_STRUCT(r200_vertex_program); + return _mesa_init_gl_program(&vp->mesa_program.Base, target, id); + } + case GL_FRAGMENT_PROGRAM_ARB: { + struct gl_fragment_program *prog = CALLOC_STRUCT(gl_fragment_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } default: _mesa_problem(ctx, "Bad target in r200NewProgram"); + return NULL; } - return NULL; } diff --git a/src/mesa/drivers/x11/SConscript b/src/mesa/drivers/x11/SConscript index d29f9874f44..cd5cccda0d1 100644 --- a/src/mesa/drivers/x11/SConscript +++ b/src/mesa/drivers/x11/SConscript @@ -4,6 +4,8 @@ env = env.Clone() env.Append(CPPPATH = [ '#/src', + '#/src/glsl', + '#/src/glsl/nir', '#/src/mapi', '#/src/mesa', '#/src/mesa/main', diff --git a/src/mesa/main/blend.c 
b/src/mesa/main/blend.c index dee5e29d5b8..20aa4980935 100644 --- a/src/mesa/main/blend.c +++ b/src/mesa/main/blend.c @@ -190,6 +190,19 @@ update_uses_dual_src(struct gl_context *ctx, int buf) blend_factor_is_dual_src(ctx->Color.Blend[buf].DstA)); } + +/** + * Return the number of per-buffer blend states to update in + * glBlendFunc, glBlendFuncSeparate, glBlendEquation, etc. + */ +static inline unsigned +num_buffers(const struct gl_context *ctx) +{ + return ctx->Extensions.ARB_draw_buffers_blend + ? ctx->Const.MaxDrawBuffers : 1; +} + + /** * Set the separate blend source/dest factors for all draw buffers. * @@ -202,9 +215,10 @@ void GLAPIENTRY _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB, GLenum sfactorA, GLenum dfactorA ) { - GLuint buf, numBuffers; - GLboolean changed; GET_CURRENT_CONTEXT(ctx); + const unsigned numBuffers = num_buffers(ctx); + unsigned buf; + bool changed = false; if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glBlendFuncSeparate %s %s %s %s\n", @@ -213,28 +227,38 @@ _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB, _mesa_enum_to_string(sfactorA), _mesa_enum_to_string(dfactorA)); - if (!validate_blend_factors(ctx, "glBlendFuncSeparate", - sfactorRGB, dfactorRGB, - sfactorA, dfactorA)) { - return; + /* Check if we're really changing any state. If not, return early. */ + if (ctx->Color._BlendFuncPerBuffer) { + /* Check all per-buffer states */ + for (buf = 0; buf < numBuffers; buf++) { + if (ctx->Color.Blend[buf].SrcRGB != sfactorRGB || + ctx->Color.Blend[buf].DstRGB != dfactorRGB || + ctx->Color.Blend[buf].SrcA != sfactorA || + ctx->Color.Blend[buf].DstA != dfactorA) { + changed = true; + break; + } + } } - - numBuffers = ctx->Extensions.ARB_draw_buffers_blend - ? 
ctx->Const.MaxDrawBuffers : 1; - - changed = GL_FALSE; - for (buf = 0; buf < numBuffers; buf++) { - if (ctx->Color.Blend[buf].SrcRGB != sfactorRGB || - ctx->Color.Blend[buf].DstRGB != dfactorRGB || - ctx->Color.Blend[buf].SrcA != sfactorA || - ctx->Color.Blend[buf].DstA != dfactorA) { - changed = GL_TRUE; - break; + else { + /* only need to check 0th per-buffer state */ + if (ctx->Color.Blend[0].SrcRGB != sfactorRGB || + ctx->Color.Blend[0].DstRGB != dfactorRGB || + ctx->Color.Blend[0].SrcA != sfactorA || + ctx->Color.Blend[0].DstA != dfactorA) { + changed = true; } } + if (!changed) return; + if (!validate_blend_factors(ctx, "glBlendFuncSeparate", + sfactorRGB, dfactorRGB, + sfactorA, dfactorA)) { + return; + } + FLUSH_VERTICES(ctx, _NEW_COLOR); for (buf = 0; buf < numBuffers; buf++) { @@ -242,8 +266,13 @@ _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB, ctx->Color.Blend[buf].DstRGB = dfactorRGB; ctx->Color.Blend[buf].SrcA = sfactorA; ctx->Color.Blend[buf].DstA = dfactorA; - update_uses_dual_src(ctx, buf); } + + update_uses_dual_src(ctx, 0); + for (buf = 1; buf < numBuffers; buf++) { + ctx->Color.Blend[buf]._UsesDualSrc = ctx->Color.Blend[0]._UsesDualSrc; + } + ctx->Color._BlendFuncPerBuffer = GL_FALSE; if (ctx->Driver.BlendFuncSeparate) { @@ -283,18 +312,18 @@ _mesa_BlendFuncSeparateiARB(GLuint buf, GLenum sfactorRGB, GLenum dfactorRGB, return; } - if (!validate_blend_factors(ctx, "glBlendFuncSeparatei", - sfactorRGB, dfactorRGB, - sfactorA, dfactorA)) { - return; - } - if (ctx->Color.Blend[buf].SrcRGB == sfactorRGB && ctx->Color.Blend[buf].DstRGB == dfactorRGB && ctx->Color.Blend[buf].SrcA == sfactorA && ctx->Color.Blend[buf].DstA == dfactorA) return; /* no change */ + if (!validate_blend_factors(ctx, "glBlendFuncSeparatei", + sfactorRGB, dfactorRGB, + sfactorA, dfactorA)) { + return; + } + FLUSH_VERTICES(ctx, _NEW_COLOR); ctx->Color.Blend[buf].SrcRGB = sfactorRGB; @@ -331,34 +360,43 @@ legal_blend_equation(const struct gl_context *ctx, GLenum 
mode) void GLAPIENTRY _mesa_BlendEquation( GLenum mode ) { - GLuint buf, numBuffers; - GLboolean changed; GET_CURRENT_CONTEXT(ctx); + const unsigned numBuffers = num_buffers(ctx); + unsigned buf; + bool changed = false; if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glBlendEquation(%s)\n", _mesa_enum_to_string(mode)); - if (!legal_blend_equation(ctx, mode)) { - _mesa_error(ctx, GL_INVALID_ENUM, "glBlendEquation"); - return; + if (ctx->Color._BlendEquationPerBuffer) { + /* Check all per-buffer states */ + for (buf = 0; buf < numBuffers; buf++) { + if (ctx->Color.Blend[buf].EquationRGB != mode || + ctx->Color.Blend[buf].EquationA != mode) { + changed = true; + break; + } + } } - - numBuffers = ctx->Extensions.ARB_draw_buffers_blend - ? ctx->Const.MaxDrawBuffers : 1; - - changed = GL_FALSE; - for (buf = 0; buf < numBuffers; buf++) { - if (ctx->Color.Blend[buf].EquationRGB != mode || - ctx->Color.Blend[buf].EquationA != mode) { - changed = GL_TRUE; - break; + else { + /* only need to check 0th per-buffer state */ + if (ctx->Color.Blend[0].EquationRGB != mode || + ctx->Color.Blend[0].EquationA != mode) { + changed = true; } } + if (!changed) return; + if (!legal_blend_equation(ctx, mode)) { + _mesa_error(ctx, GL_INVALID_ENUM, "glBlendEquation"); + return; + } + FLUSH_VERTICES(ctx, _NEW_COLOR); + for (buf = 0; buf < numBuffers; buf++) { ctx->Color.Blend[buf].EquationRGB = mode; ctx->Color.Blend[buf].EquationA = mode; @@ -383,7 +421,7 @@ _mesa_BlendEquationiARB(GLuint buf, GLenum mode) buf, _mesa_enum_to_string(mode)); if (buf >= ctx->Const.MaxDrawBuffers) { - _mesa_error(ctx, GL_INVALID_VALUE, "glBlendFuncSeparatei(buffer=%u)", + _mesa_error(ctx, GL_INVALID_VALUE, "glBlendEquationi(buffer=%u)", buf); return; } @@ -407,15 +445,37 @@ _mesa_BlendEquationiARB(GLuint buf, GLenum mode) void GLAPIENTRY _mesa_BlendEquationSeparate( GLenum modeRGB, GLenum modeA ) { - GLuint buf, numBuffers; - GLboolean changed; GET_CURRENT_CONTEXT(ctx); + const unsigned numBuffers = 
num_buffers(ctx); + unsigned buf; + bool changed = false; if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glBlendEquationSeparateEXT(%s %s)\n", _mesa_enum_to_string(modeRGB), _mesa_enum_to_string(modeA)); + if (ctx->Color._BlendEquationPerBuffer) { + /* Check all per-buffer states */ + for (buf = 0; buf < numBuffers; buf++) { + if (ctx->Color.Blend[buf].EquationRGB != modeRGB || + ctx->Color.Blend[buf].EquationA != modeA) { + changed = true; + break; + } + } + } + else { + /* only need to check 0th per-buffer state */ + if (ctx->Color.Blend[0].EquationRGB != modeRGB || + ctx->Color.Blend[0].EquationA != modeA) { + changed = true; + } + } + + if (!changed) + return; + if ( (modeRGB != modeA) && !ctx->Extensions.EXT_blend_equation_separate ) { _mesa_error(ctx, GL_INVALID_OPERATION, "glBlendEquationSeparateEXT not supported by driver"); @@ -432,21 +492,8 @@ _mesa_BlendEquationSeparate( GLenum modeRGB, GLenum modeA ) return; } - numBuffers = ctx->Extensions.ARB_draw_buffers_blend - ? ctx->Const.MaxDrawBuffers : 1; - - changed = GL_FALSE; - for (buf = 0; buf < numBuffers; buf++) { - if (ctx->Color.Blend[buf].EquationRGB != modeRGB || - ctx->Color.Blend[buf].EquationA != modeA) { - changed = GL_TRUE; - break; - } - } - if (!changed) - return; - FLUSH_VERTICES(ctx, _NEW_COLOR); + for (buf = 0; buf < numBuffers; buf++) { ctx->Color.Blend[buf].EquationRGB = modeRGB; ctx->Color.Blend[buf].EquationA = modeA; diff --git a/src/mesa/main/es1_conversion.c b/src/mesa/main/es1_conversion.c index b254a6ef1c7..1dfe8278e71 100644 --- a/src/mesa/main/es1_conversion.c +++ b/src/mesa/main/es1_conversion.c @@ -1,3 +1,4 @@ + #include <stdbool.h> #include "api_loopback.h" @@ -326,7 +327,24 @@ _mesa_GetTexEnvxv(GLenum target, GLenum pname, GLfixed *params) } break; case GL_TEXTURE_ENV: - if (pname != GL_TEXTURE_ENV_COLOR && pname != GL_RGB_SCALE && pname != GL_ALPHA_SCALE && pname != GL_TEXTURE_ENV_MODE && pname != GL_COMBINE_RGB && pname != GL_COMBINE_ALPHA && pname != GL_SRC0_RGB && 
pname != GL_SRC1_RGB && pname != GL_SRC2_RGB && pname != GL_SRC0_ALPHA && pname != GL_SRC1_ALPHA && pname != GL_SRC2_ALPHA && pname != GL_OPERAND0_RGB && pname != GL_OPERAND1_RGB && pname != GL_OPERAND2_RGB && pname != GL_OPERAND0_ALPHA && pname != GL_OPERAND1_ALPHA && pname != GL_OPERAND2_ALPHA) { + if (pname != GL_TEXTURE_ENV_COLOR && + pname != GL_RGB_SCALE && + pname != GL_ALPHA_SCALE && + pname != GL_TEXTURE_ENV_MODE && + pname != GL_COMBINE_RGB && + pname != GL_COMBINE_ALPHA && + pname != GL_SRC0_RGB && + pname != GL_SRC1_RGB && + pname != GL_SRC2_RGB && + pname != GL_SRC0_ALPHA && + pname != GL_SRC1_ALPHA && + pname != GL_SRC2_ALPHA && + pname != GL_OPERAND0_RGB && + pname != GL_OPERAND1_RGB && + pname != GL_OPERAND2_RGB && + pname != GL_OPERAND0_ALPHA && + pname != GL_OPERAND1_ALPHA && + pname != GL_OPERAND2_ALPHA) { _mesa_error(_mesa_get_current_context(), GL_INVALID_ENUM, "glGetTexEnvxv(target=0x%x)", target); return; diff --git a/src/mesa/main/ff_fragment_shader.cpp b/src/mesa/main/ff_fragment_shader.cpp index e4e2a18c1da..e63d0f1ec55 100644 --- a/src/mesa/main/ff_fragment_shader.cpp +++ b/src/mesa/main/ff_fragment_shader.cpp @@ -40,7 +40,7 @@ #include "glsl/ir_optimization.h" #include "glsl/glsl_parser_extras.h" #include "glsl/glsl_symbol_table.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "program/ir_to_mesa.h" #include "program/program.h" #include "program/programopt.h" @@ -975,13 +975,11 @@ static void load_texture( texenv_fragment_program *p, GLuint unit ) ir_var_uniform); p->top_instructions->push_head(sampler); - /* Set the texture unit for this sampler. The linker will pick this value - * up and do-the-right-thing. - * - * NOTE: The cast to int is important. Without it, the constant will have - * type uint, and things later on may get confused. + /* Set the texture unit for this sampler in the same way that + * layout(binding=X) would. 
*/ - sampler->constant_value = new(p->mem_ctx) ir_constant(int(unit)); + sampler->data.explicit_binding = true; + sampler->data.binding = unit; deref = new(p->mem_ctx) ir_dereference_variable(sampler); tex->set_sampler(deref, glsl_type::vec4_type); diff --git a/src/mesa/main/ffvertex_prog.c b/src/mesa/main/ffvertex_prog.c index a6183b47e2e..34cc9218add 100644 --- a/src/mesa/main/ffvertex_prog.c +++ b/src/mesa/main/ffvertex_prog.c @@ -1690,11 +1690,10 @@ _mesa_get_fixed_func_vertex_program(struct gl_context *ctx) ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS, ctx->Const.Program[MESA_SHADER_VERTEX].MaxTemps ); -#if 0 if (ctx->Driver.ProgramStringNotify) ctx->Driver.ProgramStringNotify( ctx, GL_VERTEX_PROGRAM_ARB, &prog->Base ); -#endif + _mesa_program_cache_insert(ctx, ctx->VertexProgram.Cache, &key, sizeof(key), &prog->Base); } diff --git a/src/mesa/main/format_utils.h b/src/mesa/main/format_utils.h index 618f43d0aaa..378997b38b2 100644 --- a/src/mesa/main/format_utils.h +++ b/src/mesa/main/format_utils.h @@ -34,6 +34,7 @@ #include "imports.h" #include "macros.h" #include "util/rounding.h" +#include "util/half_float.h" extern const mesa_array_format RGBA32_FLOAT; extern const mesa_array_format RGBA8_UBYTE; diff --git a/src/mesa/main/imports.c b/src/mesa/main/imports.c index 350e6752c8b..230ebbc67f4 100644 --- a/src/mesa/main/imports.c +++ b/src/mesa/main/imports.c @@ -307,154 +307,6 @@ _mesa_bitcount_64(uint64_t n) } #endif - -/** - * Convert a 4-byte float to a 2-byte half float. - * - * Not all float32 values can be represented exactly as a float16 value. We - * round such intermediate float32 values to the nearest float16. When the - * float32 lies exactly between to float16 values, we round to the one with - * an even mantissa. - * - * This rounding behavior has several benefits: - * - It has no sign bias. - * - * - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's - * GPU ISA. 
- * - * - By reproducing the behavior of the GPU (at least on Intel hardware), - * compile-time evaluation of constant packHalf2x16 GLSL expressions will - * result in the same value as if the expression were executed on the GPU. - */ -GLhalfARB -_mesa_float_to_half(float val) -{ - const fi_type fi = {val}; - const int flt_m = fi.i & 0x7fffff; - const int flt_e = (fi.i >> 23) & 0xff; - const int flt_s = (fi.i >> 31) & 0x1; - int s, e, m = 0; - GLhalfARB result; - - /* sign bit */ - s = flt_s; - - /* handle special cases */ - if ((flt_e == 0) && (flt_m == 0)) { - /* zero */ - /* m = 0; - already set */ - e = 0; - } - else if ((flt_e == 0) && (flt_m != 0)) { - /* denorm -- denorm float maps to 0 half */ - /* m = 0; - already set */ - e = 0; - } - else if ((flt_e == 0xff) && (flt_m == 0)) { - /* infinity */ - /* m = 0; - already set */ - e = 31; - } - else if ((flt_e == 0xff) && (flt_m != 0)) { - /* NaN */ - m = 1; - e = 31; - } - else { - /* regular number */ - const int new_exp = flt_e - 127; - if (new_exp < -14) { - /* The float32 lies in the range (0.0, min_normal16) and is rounded - * to a nearby float16 value. The result will be either zero, subnormal, - * or normal. - */ - e = 0; - m = _mesa_lroundevenf((1 << 24) * fabsf(fi.f)); - } - else if (new_exp > 15) { - /* map this value to infinity */ - /* m = 0; - already set */ - e = 31; - } - else { - /* The float32 lies in the range - * [min_normal16, max_normal16 + max_step16) - * and is rounded to a nearby float16 value. The result will be - * either normal or infinite. - */ - e = new_exp + 15; - m = _mesa_lroundevenf(flt_m / (float) (1 << 13)); - } - } - - assert(0 <= m && m <= 1024); - if (m == 1024) { - /* The float32 was rounded upwards into the range of the next exponent, - * so bump the exponent. This correctly handles the case where f32 - * should be rounded up to float16 infinity. 
- */ - ++e; - m = 0; - } - - result = (s << 15) | (e << 10) | m; - return result; -} - - -/** - * Convert a 2-byte half float to a 4-byte float. - * Based on code from: - * http://www.opengl.org/discussion_boards/ubb/Forum3/HTML/008786.html - */ -float -_mesa_half_to_float(GLhalfARB val) -{ - /* XXX could also use a 64K-entry lookup table */ - const int m = val & 0x3ff; - const int e = (val >> 10) & 0x1f; - const int s = (val >> 15) & 0x1; - int flt_m, flt_e, flt_s; - fi_type fi; - float result; - - /* sign bit */ - flt_s = s; - - /* handle special cases */ - if ((e == 0) && (m == 0)) { - /* zero */ - flt_m = 0; - flt_e = 0; - } - else if ((e == 0) && (m != 0)) { - /* denorm -- denorm half will fit in non-denorm single */ - const float half_denorm = 1.0f / 16384.0f; /* 2^-14 */ - float mantissa = ((float) (m)) / 1024.0f; - float sign = s ? -1.0f : 1.0f; - return sign * mantissa * half_denorm; - } - else if ((e == 31) && (m == 0)) { - /* infinity */ - flt_e = 0xff; - flt_m = 0; - } - else if ((e == 31) && (m != 0)) { - /* NaN */ - flt_e = 0xff; - flt_m = 1; - } - else { - /* regular */ - flt_e = e + 112; - flt_m = m << 13; - } - - fi.i = (flt_s << 31) | (flt_e << 23) | flt_m; - result = fi.f; - return result; -} - /*@}*/ diff --git a/src/mesa/main/imports.h b/src/mesa/main/imports.h index 90247587be3..042147fd8bb 100644 --- a/src/mesa/main/imports.h +++ b/src/mesa/main/imports.h @@ -396,13 +396,6 @@ _mesa_flsll(uint64_t n) #endif } - -extern GLhalfARB -_mesa_float_to_half(float f); - -extern float -_mesa_half_to_float(GLhalfARB h); - static inline bool _mesa_half_is_negative(GLhalfARB h) { diff --git a/src/mesa/main/matrix.c b/src/mesa/main/matrix.c index 2b8016a4a72..5ff5ac5bfe1 100644 --- a/src/mesa/main/matrix.c +++ b/src/mesa/main/matrix.c @@ -151,7 +151,6 @@ _mesa_MatrixMode( GLenum mode ) if (ctx->Transform.MatrixMode == mode && mode != GL_TEXTURE) return; - FLUSH_VERTICES(ctx, _NEW_TRANSFORM); switch (mode) { case GL_MODELVIEW: diff --git 
a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c index ab16c2854a8..50469956c6e 100644 --- a/src/mesa/main/mipmap.c +++ b/src/mesa/main/mipmap.c @@ -37,6 +37,7 @@ #include "texstore.h" #include "image.h" #include "macros.h" +#include "util/half_float.h" #include "../../gallium/auxiliary/util/u_format_rgb9e5.h" #include "../../gallium/auxiliary/util/u_format_r11g11b10f.h" diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index cbfb15522f0..e57b98a412d 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -42,7 +42,7 @@ #include "main/config.h" #include "glapi/glapi.h" #include "math/m_matrix.h" /* GLmatrix */ -#include "glsl/shader_enums.h" +#include "glsl/nir/shader_enums.h" #include "main/formats.h" /* MESA_FORMAT_COUNT */ @@ -94,11 +94,6 @@ struct vbo_context; #define PRIM_OUTSIDE_BEGIN_END (PRIM_MAX + 1) #define PRIM_UNKNOWN (PRIM_MAX + 2) -#define VARYING_SLOT_MAX (VARYING_SLOT_VAR0 + MAX_VARYING) -#define VARYING_SLOT_PATCH0 (VARYING_SLOT_MAX) -#define VARYING_SLOT_TESS_MAX (VARYING_SLOT_PATCH0 + MAX_VARYING) -#define FRAG_RESULT_MAX (FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS) - /** * Determine if the given gl_varying_slot appears in the fragment shader. */ @@ -487,26 +482,24 @@ struct gl_colorbuffer_attrib struct gl_current_attrib { /** - * \name Current vertex attributes. + * \name Current vertex attributes (color, texcoords, etc). * \note Values are valid only after FLUSH_VERTICES has been called. * \note Index and Edgeflag current values are stored as floats in the * SIX and SEVEN attribute slots. + * \note We need double storage for 64-bit vertex attributes */ - /* we need double storage for this for vertex attrib 64bit */ - GLfloat Attrib[VERT_ATTRIB_MAX][4*2]; /**< Position, color, texcoords, etc */ + GLfloat Attrib[VERT_ATTRIB_MAX][4*2]; /** - * \name Current raster position attributes (always valid). - * \note This set of attributes is very similar to the SWvertex struct. 
+ * \name Current raster position attributes (always up to date after a + * glRasterPos call). */ - /*@{*/ GLfloat RasterPos[4]; GLfloat RasterDistance; GLfloat RasterColor[4]; GLfloat RasterSecondaryColor[4]; GLfloat RasterTexCoords[MAX_TEXTURE_COORD_UNITS][4]; GLboolean RasterPosValid; - /*@}*/ }; @@ -1866,24 +1859,6 @@ typedef enum /** - * \brief Layout qualifiers for gl_FragDepth. - * - * Extension AMD_conservative_depth allows gl_FragDepth to be redeclared with - * a layout qualifier. - * - * \see enum ir_depth_layout - */ -enum gl_frag_depth_layout -{ - FRAG_DEPTH_LAYOUT_NONE, /**< No layout is specified. */ - FRAG_DEPTH_LAYOUT_ANY, - FRAG_DEPTH_LAYOUT_GREATER, - FRAG_DEPTH_LAYOUT_LESS, - FRAG_DEPTH_LAYOUT_UNCHANGED -}; - - -/** * Base class for any kind of program object */ struct gl_program @@ -2286,12 +2261,34 @@ struct gl_shader unsigned num_combined_uniform_components; /** - * This shader's uniform block information. + * This shader's uniform/ssbo block information. * * These fields are only set post-linking. + * + * BufferInterfaceBlocks is a list containing both UBOs and SSBOs. This is + * useful during the linking process so that we don't have to handle SSBOs + * specifically. + * + * UniformBlocks is a list of UBOs. This is useful for backends that need + * or prefer to see separate index spaces for UBOS and SSBOs like the GL + * API specifies. + * + * ShaderStorageBlocks is a list of SSBOs. This is useful for backends that + * need or prefer to see separate index spaces for UBOS and SSBOs like the + * GL API specifies. + * + * UniformBlocks and ShaderStorageBlocks only have pointers into + * BufferInterfaceBlocks so the actual resource information is not + * duplicated. 
*/ + unsigned NumBufferInterfaceBlocks; + struct gl_uniform_block *BufferInterfaceBlocks; + unsigned NumUniformBlocks; - struct gl_uniform_block *UniformBlocks; + struct gl_uniform_block **UniformBlocks; + + unsigned NumShaderStorageBlocks; + struct gl_uniform_block **ShaderStorageBlocks; struct exec_list *ir; struct exec_list *packed_varyings; @@ -2694,8 +2691,33 @@ struct gl_shader_program */ unsigned LastClipDistanceArraySize; + /** + * This shader's uniform/ssbo block information. + * + * BufferInterfaceBlocks is a list containing both UBOs and SSBOs. This is + * useful during the linking process so that we don't have to handle SSBOs + * specifically. + * + * UniformBlocks is a list of UBOs. This is useful for backends that need + * or prefer to see separate index spaces for UBOS and SSBOs like the GL + * API specifies. + * + * ShaderStorageBlocks is a list of SSBOs. This is useful for backends that + * need or prefer to see separate index spaces for UBOS and SSBOs like the + * GL API specifies. + * + * UniformBlocks and ShaderStorageBlocks only have pointers into + * BufferInterfaceBlocks so the actual resource information is not + * duplicated and are only set after linking. + */ unsigned NumBufferInterfaceBlocks; - struct gl_uniform_block *UniformBlocks; + struct gl_uniform_block *BufferInterfaceBlocks; + + unsigned NumUniformBlocks; + struct gl_uniform_block **UniformBlocks; + + unsigned NumShaderStorageBlocks; + struct gl_uniform_block **ShaderStorageBlocks; /** * Indices into the _LinkedShaders's UniformBlocks[] array for each stage @@ -4076,13 +4098,6 @@ struct gl_image_unit GLboolean Layered; /** - * GL_TRUE if the state of this image unit is valid and access from - * the shader is allowed. Otherwise loads from this unit should - * return zero and stores should have no effect. - */ - GLboolean _Valid; - - /** * Layer of the texture object bound to this unit as specified by the * application. 
*/ diff --git a/src/mesa/main/pack.c b/src/mesa/main/pack.c index 00e31b05c99..89faf515443 100644 --- a/src/mesa/main/pack.c +++ b/src/mesa/main/pack.c @@ -1073,6 +1073,21 @@ _mesa_pack_depth_span( struct gl_context *ctx, GLuint n, GLvoid *dest, } } break; + case GL_UNSIGNED_INT_24_8: + { + const GLdouble scale = (GLdouble) 0xffffff; + GLuint *dst = (GLuint *) dest; + GLuint i; + for (i = 0; i < n; i++) { + GLuint z = (GLuint) (depthSpan[i] * scale); + assert(z <= 0xffffff); + dst[i] = (z << 8); + } + if (dstPacking->SwapBytes) { + _mesa_swap4( (GLuint *) dst, n ); + } + break; + } case GL_UNSIGNED_INT: { GLuint *dst = (GLuint *) dest; diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index 6d73e3bdcf2..8182d3dcc04 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -485,8 +485,14 @@ _mesa_program_resource_array_size(struct gl_program_resource *res) case GL_COMPUTE_SUBROUTINE_UNIFORM: case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: - case GL_BUFFER_VARIABLE: return RESOURCE_UNI(res)->array_elements; + case GL_BUFFER_VARIABLE: + /* Unsized arrays */ + if (RESOURCE_UNI(res)->array_stride > 0 && + RESOURCE_UNI(res)->array_elements == 0) + return 1; + else + return RESOURCE_UNI(res)->array_elements; case GL_VERTEX_SUBROUTINE: case GL_GEOMETRY_SUBROUTINE: case GL_FRAGMENT_SUBROUTINE: @@ -833,193 +839,6 @@ program_resource_location(struct gl_shader_program *shProg, } } -static char* -get_top_level_name(const char *name) -{ - const char *first_dot = strchr(name, '.'); - const char *first_square_bracket = strchr(name, '['); - int name_size = 0; - /* From ARB_program_interface_query spec: - * - * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer identifying the - * number of active array elements of the top-level shader storage block - * member containing to the active variable is written to <params>. 
If the - * top-level block member is not declared as an array, the value one is - * written to <params>. If the top-level block member is an array with no - * declared size, the value zero is written to <params>. - */ - - /* The buffer variable is on top level.*/ - if (!first_square_bracket && !first_dot) - name_size = strlen(name); - else if ((!first_square_bracket || - (first_dot && first_dot < first_square_bracket))) - name_size = first_dot - name; - else - name_size = first_square_bracket - name; - - return strndup(name, name_size); -} - -static char* -get_var_name(const char *name) -{ - const char *first_dot = strchr(name, '.'); - - if (!first_dot) - return strdup(name); - - return strndup(first_dot+1, strlen(first_dot) - 1); -} - -static GLint -program_resource_top_level_array_size(struct gl_shader_program *shProg, - struct gl_program_resource *res, - const char *name) -{ - int block_index = RESOURCE_UNI(res)->block_index; - int array_size = -1; - char *var_name = get_top_level_name(name); - char *interface_name = - get_top_level_name(shProg->UniformBlocks[block_index].Name); - - if (strcmp(var_name, interface_name) == 0) { - /* Deal with instanced array of SSBOs */ - char *temp_name = get_var_name(name); - free(var_name); - var_name = get_top_level_name(temp_name); - free(temp_name); - } - - for (unsigned i = 0; i < shProg->NumShaders; i++) { - if (shProg->Shaders[i] == NULL) - continue; - - const gl_shader *stage = shProg->Shaders[i]; - foreach_in_list(ir_instruction, node, stage->ir) { - ir_variable *var = node->as_variable(); - if (!var || !var->get_interface_type() || - var->data.mode != ir_var_shader_storage) - continue; - - const glsl_type *interface = var->get_interface_type(); - - if (strcmp(interface_name, interface->name) != 0) - continue; - - for (unsigned i = 0; i < interface->length; i++) { - const glsl_struct_field *field = &interface->fields.structure[i]; - if (strcmp(field->name, var_name) != 0) - continue; - /* From 
GL_ARB_program_interface_query spec: - * - * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer - * identifying the number of active array elements of the top-level - * shader storage block member containing to the active variable is - * written to <params>. If the top-level block member is not - * declared as an array, the value one is written to <params>. If - * the top-level block member is an array with no declared size, - * the value zero is written to <params>. - */ - if (field->type->is_unsized_array()) - array_size = 0; - else if (field->type->is_array()) - array_size = field->type->length; - else - array_size = 1; - goto found_top_level_array_size; - } - } - } -found_top_level_array_size: - free(interface_name); - free(var_name); - return array_size; -} - -static GLint -program_resource_top_level_array_stride(struct gl_shader_program *shProg, - struct gl_program_resource *res, - const char *name) -{ - int block_index = RESOURCE_UNI(res)->block_index; - int array_stride = -1; - char *var_name = get_top_level_name(name); - char *interface_name = - get_top_level_name(shProg->UniformBlocks[block_index].Name); - - if (strcmp(var_name, interface_name) == 0) { - /* Deal with instanced array of SSBOs */ - char *temp_name = get_var_name(name); - free(var_name); - var_name = get_top_level_name(temp_name); - free(temp_name); - } - - for (unsigned i = 0; i < shProg->NumShaders; i++) { - if (shProg->Shaders[i] == NULL) - continue; - - const gl_shader *stage = shProg->Shaders[i]; - foreach_in_list(ir_instruction, node, stage->ir) { - ir_variable *var = node->as_variable(); - if (!var || !var->get_interface_type() || - var->data.mode != ir_var_shader_storage) - continue; - - const glsl_type *interface = var->get_interface_type(); - - if (strcmp(interface_name, interface->name) != 0) { - continue; - } - - for (unsigned i = 0; i < interface->length; i++) { - const glsl_struct_field *field = &interface->fields.structure[i]; - if (strcmp(field->name, var_name) != 0) - 
continue; - /* From GL_ARB_program_interface_query: - * - * "For the property TOP_LEVEL_ARRAY_STRIDE, a single integer - * identifying the stride between array elements of the top-level - * shader storage block member containing the active variable is - * written to <params>. For top-level block members declared as - * arrays, the value written is the difference, in basic machine - * units, between the offsets of the active variable for - * consecutive elements in the top-level array. For top-level - * block members not declared as an array, zero is written to - * <params>." - */ - if (field->type->is_array()) { - const enum glsl_matrix_layout matrix_layout = - glsl_matrix_layout(field->matrix_layout); - bool row_major = matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR; - const glsl_type *array_type = field->type->fields.array; - - if (interface->interface_packing != GLSL_INTERFACE_PACKING_STD430) { - if (array_type->is_record() || array_type->is_array()) { - array_stride = array_type->std140_size(row_major); - array_stride = glsl_align(array_stride, 16); - } else { - unsigned element_base_align = 0; - element_base_align = array_type->std140_base_alignment(row_major); - array_stride = MAX2(element_base_align, 16); - } - } else { - array_stride = array_type->std430_array_stride(row_major); - } - } else { - array_stride = 0; - } - goto found_top_level_array_size; - } - } - } -found_top_level_array_size: - free(interface_name); - free(var_name); - return array_stride; -} - /** * Function implements following location queries: * glGetUniformLocation @@ -1133,7 +952,8 @@ get_buffer_property(struct gl_shader_program *shProg, (*val)++; } return 1; - case GL_ACTIVE_VARIABLES: + case GL_ACTIVE_VARIABLES: { + unsigned num_values = 0; for (unsigned i = 0; i < RESOURCE_UBO(res)->NumUniforms; i++) { const char *iname = RESOURCE_UBO(res)->Uniforms[i].IndexName; struct gl_program_resource *uni = @@ -1143,8 +963,10 @@ get_buffer_property(struct gl_shader_program *shProg, continue; 
*val++ = _mesa_program_resource_index(shProg, uni); + num_values++; } - return RESOURCE_UBO(res)->NumUniforms; + return num_values; + } } } else if (res->Type == GL_SHADER_STORAGE_BLOCK) { switch (prop) { @@ -1166,7 +988,8 @@ get_buffer_property(struct gl_shader_program *shProg, (*val)++; } return 1; - case GL_ACTIVE_VARIABLES: + case GL_ACTIVE_VARIABLES: { + unsigned num_values = 0; for (unsigned i = 0; i < RESOURCE_UBO(res)->NumUniforms; i++) { const char *iname = RESOURCE_UBO(res)->Uniforms[i].IndexName; struct gl_program_resource *uni = @@ -1176,8 +999,10 @@ get_buffer_property(struct gl_shader_program *shProg, continue; *val++ = _mesa_program_resource_index(shProg, uni); + num_values++; } - return RESOURCE_UBO(res)->NumUniforms; + return num_values; + } } } else if (res->Type == GL_ATOMIC_COUNTER_BUFFER) { switch (prop) { @@ -1251,8 +1076,15 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg, switch (res->Type) { case GL_UNIFORM: case GL_BUFFER_VARIABLE: + /* Test if a buffer variable is an array or an unsized array. + * Unsized arrays return zero as array size. 
+ */ + if (RESOURCE_UNI(res)->is_shader_storage && + RESOURCE_UNI(res)->array_stride > 0) + *val = RESOURCE_UNI(res)->array_elements; + else *val = MAX2(RESOURCE_UNI(res)->array_elements, 1); - return 1; + return 1; case GL_PROGRAM_INPUT: case GL_PROGRAM_OUTPUT: *val = MAX2(_mesa_program_resource_array_size(res), 1); @@ -1374,14 +1206,12 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg, case GL_TOP_LEVEL_ARRAY_SIZE: VALIDATE_TYPE(GL_BUFFER_VARIABLE); - *val = program_resource_top_level_array_size(shProg, res, - _mesa_program_resource_name(res)); + *val = RESOURCE_UNI(res)->top_level_array_size; return 1; case GL_TOP_LEVEL_ARRAY_STRIDE: VALIDATE_TYPE(GL_BUFFER_VARIABLE); - *val = program_resource_top_level_array_stride(shProg, res, - _mesa_program_resource_name(res)); + *val = RESOURCE_UNI(res)->top_level_array_stride; return 1; /* GL_ARB_tessellation_shader */ diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c index 9dd1054c8ee..18e463d4ccc 100644 --- a/src/mesa/main/shaderapi.c +++ b/src/mesa/main/shaderapi.c @@ -713,10 +713,10 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname, if (!has_ubo) break; - for (i = 0; i < shProg->NumBufferInterfaceBlocks; i++) { + for (i = 0; i < shProg->NumUniformBlocks; i++) { /* Add one for the terminating NUL character. */ - const GLint len = strlen(shProg->UniformBlocks[i].Name) + 1; + const GLint len = strlen(shProg->UniformBlocks[i]->Name) + 1; if (len > max_len) max_len = len; @@ -729,11 +729,7 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname, if (!has_ubo) break; - *params = 0; - for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) { - if (!shProg->UniformBlocks[i].IsShaderStorage) - (*params)++; - } + *params = shProg->NumUniformBlocks; return; case GL_PROGRAM_BINARY_RETRIEVABLE_HINT: /* This enum isn't part of the OES extension for OpenGL ES 2.0. 
It is diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c index bd4b7c7be3b..c4ebf4201fb 100644 --- a/src/mesa/main/shaderimage.c +++ b/src/mesa/main/shaderimage.c @@ -415,8 +415,8 @@ _mesa_init_image_units(struct gl_context *ctx) ctx->ImageUnits[i] = _mesa_default_image_unit(ctx); } -static GLboolean -validate_image_unit(struct gl_context *ctx, struct gl_image_unit *u) +GLboolean +_mesa_is_image_unit_valid(struct gl_context *ctx, struct gl_image_unit *u) { struct gl_texture_object *t = u->TexObj; mesa_format tex_format; @@ -424,7 +424,8 @@ validate_image_unit(struct gl_context *ctx, struct gl_image_unit *u) if (!t) return GL_FALSE; - _mesa_test_texobj_completeness(ctx, t); + if (!t->_BaseComplete && !t->_MipmapComplete) + _mesa_test_texobj_completeness(ctx, t); if (u->Level < t->BaseLevel || u->Level > t->_MaxLevel || @@ -473,17 +474,6 @@ validate_image_unit(struct gl_context *ctx, struct gl_image_unit *u) return GL_TRUE; } -void -_mesa_validate_image_units(struct gl_context *ctx) -{ - unsigned i; - - for (i = 0; i < ctx->Const.MaxImageUnits; ++i) { - struct gl_image_unit *u = &ctx->ImageUnits[i]; - u->_Valid = validate_image_unit(ctx, u); - } -} - static GLboolean validate_bind_image_texture(struct gl_context *ctx, GLuint unit, GLuint texture, GLint level, GLboolean layered, @@ -567,7 +557,6 @@ _mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level, u->Access = access; u->Format = format; u->_ActualFormat = _mesa_get_shader_image_format(format); - u->_Valid = validate_image_unit(ctx, u); if (u->TexObj && _mesa_tex_target_is_layered(u->TexObj->Target)) { u->Layered = layered; @@ -703,7 +692,6 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures) u->Access = GL_READ_WRITE; u->Format = tex_format; u->_ActualFormat = _mesa_get_shader_image_format(tex_format); - u->_Valid = validate_image_unit(ctx, u); } else { /* Unbind the texture from the unit */ _mesa_reference_texobj(&u->TexObj, NULL); @@ -713,7 +701,6 @@ 
_mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures) u->Access = GL_READ_ONLY; u->Format = GL_R8; u->_ActualFormat = MESA_FORMAT_R_UNORM8; - u->_Valid = GL_FALSE; } } diff --git a/src/mesa/main/shaderimage.h b/src/mesa/main/shaderimage.h index bbe088a2459..94ee814a716 100644 --- a/src/mesa/main/shaderimage.h +++ b/src/mesa/main/shaderimage.h @@ -55,13 +55,15 @@ void _mesa_init_image_units(struct gl_context *ctx); /** - * Recalculate the \c _Valid flag of a context's shader image units. + * Return GL_TRUE if the state of the image unit passed as argument is valid + * and access from the shader is allowed. Otherwise loads from this unit + * should return zero and stores should have no effect. * - * To be called when the state of any texture bound to an image unit - * changes. + * The result depends on context state other than the passed image unit, part + * of the _NEW_TEXTURE set. */ -void -_mesa_validate_image_units(struct gl_context *ctx); +GLboolean +_mesa_is_image_unit_valid(struct gl_context *ctx, struct gl_image_unit *u); void GLAPIENTRY _mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level, diff --git a/src/mesa/main/shaderobj.c b/src/mesa/main/shaderobj.c index 4e85fda24b4..ffc71931fec 100644 --- a/src/mesa/main/shaderobj.c +++ b/src/mesa/main/shaderobj.c @@ -290,8 +290,8 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg) ralloc_free(shProg->InfoLog); shProg->InfoLog = ralloc_strdup(shProg, ""); - ralloc_free(shProg->UniformBlocks); - shProg->UniformBlocks = NULL; + ralloc_free(shProg->BufferInterfaceBlocks); + shProg->BufferInterfaceBlocks = NULL; shProg->NumBufferInterfaceBlocks = 0; for (i = 0; i < MESA_SHADER_STAGES; i++) { ralloc_free(shProg->UniformBlockStageIndex[i]); diff --git a/src/mesa/main/shared.c b/src/mesa/main/shared.c index 1acaf59f432..c37b31d1753 100644 --- a/src/mesa/main/shared.c +++ b/src/mesa/main/shared.c @@ -107,6 +107,11 @@ _mesa_alloc_shared_state(struct gl_context *ctx) }; 
STATIC_ASSERT(ARRAY_SIZE(targets) == NUM_TEXTURE_TARGETS); shared->DefaultTex[i] = ctx->Driver.NewTextureObject(ctx, 0, targets[i]); + /* Need to explicitly set/overwrite the TargetIndex field here since + * the call to _mesa_tex_target_to_index() in NewTextureObject() may + * fail if the texture target is not supported. + */ + shared->DefaultTex[i]->TargetIndex = i; } /* sanity check */ diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c index d3b1c72b08d..4043c4f2057 100644 --- a/src/mesa/main/state.c +++ b/src/mesa/main/state.c @@ -391,8 +391,12 @@ _mesa_update_state_locked( struct gl_context *ctx ) GLbitfield new_state = ctx->NewState; GLbitfield prog_flags = _NEW_PROGRAM; GLbitfield new_prog_state = 0x0; + const GLbitfield computed_states = ~(_NEW_CURRENT_ATTRIB | _NEW_LINE); - if (new_state == _NEW_CURRENT_ATTRIB) + /* we can skip a bunch of state validation checks if the dirty + * state matches one or more bits in 'computed_states'. + */ + if ((new_state & computed_states) == 0) goto out; if (MESA_VERBOSE & VERBOSE_STATE) diff --git a/src/mesa/main/texcompress_bptc.c b/src/mesa/main/texcompress_bptc.c index f0f6553a01b..26e59158007 100644 --- a/src/mesa/main/texcompress_bptc.c +++ b/src/mesa/main/texcompress_bptc.c @@ -30,6 +30,7 @@ #include "texcompress.h" #include "texcompress_bptc.h" #include "util/format_srgb.h" +#include "util/half_float.h" #include "texstore.h" #include "macros.h" #include "image.h" diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c index 173e43c817c..547055ecf39 100644 --- a/src/mesa/main/texobj.c +++ b/src/mesa/main/texobj.c @@ -286,6 +286,12 @@ _mesa_initialize_texture_object( struct gl_context *ctx, obj->RefCount = 1; obj->Name = name; obj->Target = target; + if (target != 0) { + obj->TargetIndex = _mesa_tex_target_to_index(ctx, target); + } + else { + obj->TargetIndex = NUM_TEXTURE_TARGETS; /* invalid/error value */ + } obj->Priority = 1.0F; obj->BaseLevel = 0; obj->MaxLevel = 1000; @@ -340,6 +346,10 @@ 
finish_texture_init(struct gl_context *ctx, GLenum target, GLenum filter = GL_LINEAR; assert(obj->Target == 0); + obj->Target = target; + obj->TargetIndex = _mesa_tex_target_to_index(ctx, target); + assert(obj->TargetIndex < NUM_TEXTURE_TARGETS); + switch (target) { case GL_TEXTURE_2D_MULTISAMPLE: case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: @@ -1185,46 +1195,26 @@ invalidate_tex_image_error_check(struct gl_context *ctx, GLuint texture, return t; } -/** - * Wrapper for the driver function. Need this because _mesa_new_texture_object - * permits a target of 0 and does not initialize targetIndex. - */ -struct gl_texture_object * -_mesa_create_nameless_texture(struct gl_context *ctx, GLenum target) -{ - struct gl_texture_object *texObj = NULL; - GLint targetIndex; - - if (target == 0) - return texObj; - - texObj = ctx->Driver.NewTextureObject(ctx, 0, target); - targetIndex = _mesa_tex_target_to_index(ctx, texObj->Target); - assert(targetIndex < NUM_TEXTURE_TARGETS); - texObj->TargetIndex = targetIndex; - - return texObj; -} /** * Helper function for glCreateTextures and glGenTextures. Need this because * glCreateTextures should throw errors if target = 0. This is not exposed to * the rest of Mesa to encourage Mesa internals to use nameless textures, * which do not require expensive hash lookups. + * \param target either 0 or a a valid / error-checked texture target enum */ static void create_textures(struct gl_context *ctx, GLenum target, - GLsizei n, GLuint *textures, bool dsa) + GLsizei n, GLuint *textures, const char *caller) { GLuint first; GLint i; - const char *func = dsa ? 
"Create" : "Gen"; if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) - _mesa_debug(ctx, "gl%sTextures %d\n", func, n); + _mesa_debug(ctx, "%s %d\n", caller, n); if (n < 0) { - _mesa_error( ctx, GL_INVALID_VALUE, "gl%sTextures(n < 0)", func ); + _mesa_error(ctx, GL_INVALID_VALUE, "%s(n < 0)", caller); return; } @@ -1241,28 +1231,14 @@ create_textures(struct gl_context *ctx, GLenum target, /* Allocate new, empty texture objects */ for (i = 0; i < n; i++) { struct gl_texture_object *texObj; - GLint targetIndex; GLuint name = first + i; texObj = ctx->Driver.NewTextureObject(ctx, name, target); if (!texObj) { mtx_unlock(&ctx->Shared->Mutex); - _mesa_error(ctx, GL_OUT_OF_MEMORY, "gl%sTextures", func); + _mesa_error(ctx, GL_OUT_OF_MEMORY, "gl%sTextures", caller); return; } - /* Initialize the target index if target is non-zero. */ - if (target != 0) { - targetIndex = _mesa_tex_target_to_index(ctx, texObj->Target); - if (targetIndex < 0) { /* Bad Target */ - mtx_unlock(&ctx->Shared->Mutex); - _mesa_error(ctx, GL_INVALID_ENUM, "gl%sTextures(target = %s)", - func, _mesa_enum_to_string(texObj->Target)); - return; - } - assert(targetIndex < NUM_TEXTURE_TARGETS); - texObj->TargetIndex = targetIndex; - } - /* insert into hash table */ _mesa_HashInsert(ctx->Shared->TexObjects, texObj->Name, texObj); @@ -1296,7 +1272,7 @@ void GLAPIENTRY _mesa_GenTextures(GLsizei n, GLuint *textures) { GET_CURRENT_CONTEXT(ctx); - create_textures(ctx, 0, n, textures, false); + create_textures(ctx, 0, n, textures, "glGenTextures"); } /** @@ -1329,7 +1305,7 @@ _mesa_CreateTextures(GLenum target, GLsizei n, GLuint *textures) return; } - create_textures(ctx, target, n, textures, true); + create_textures(ctx, target, n, textures, "glCreateTextures"); } /** @@ -1383,8 +1359,12 @@ unbind_texobj_from_texunits(struct gl_context *ctx, const gl_texture_index index = texObj->TargetIndex; GLuint u; - if (texObj->Target == 0) + if (texObj->Target == 0) { + /* texture was never bound */ return; + } + + 
assert(index < NUM_TEXTURE_TARGETS); for (u = 0; u < ctx->Texture.NumCurrentTexUsed; u++) { struct gl_texture_unit *unit = &ctx->Texture.Unit[u]; @@ -1752,10 +1732,11 @@ _mesa_BindTexture( GLenum target, GLuint texName ) _mesa_HashInsert(ctx->Shared->TexObjects, texName, newTexObj); mtx_unlock(&ctx->Shared->Mutex); } - newTexObj->Target = target; - newTexObj->TargetIndex = targetIndex; } + assert(newTexObj->Target == target); + assert(newTexObj->TargetIndex == targetIndex); + bind_texture(ctx, ctx->Texture.CurrentUnit, newTexObj); } @@ -1778,19 +1759,12 @@ _mesa_BindTextureUnit(GLuint unit, GLuint texture) { GET_CURRENT_CONTEXT(ctx); struct gl_texture_object *texObj; - struct gl_texture_unit *texUnit; if (unit >= _mesa_max_tex_unit(ctx)) { _mesa_error(ctx, GL_INVALID_VALUE, "glBindTextureUnit(unit=%u)", unit); return; } - texUnit = _mesa_get_tex_unit(ctx, unit); - assert(texUnit); - if (!texUnit) { - return; - } - if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) _mesa_debug(ctx, "glBindTextureUnit %s %d\n", _mesa_enum_to_string(GL_TEXTURE0+unit), (GLint) texture); @@ -1812,7 +1786,7 @@ _mesa_BindTextureUnit(GLuint unit, GLuint texture) /* Error checking */ if (!texObj) { _mesa_error(ctx, GL_INVALID_OPERATION, - "glBindTextureUnit(non-gen name)"); + "glBindTextureUnit(non-gen name)"); return; } if (texObj->Target == 0) { diff --git a/src/mesa/main/texobj.h b/src/mesa/main/texobj.h index 690878c85fc..8421337de4d 100644 --- a/src/mesa/main/texobj.h +++ b/src/mesa/main/texobj.h @@ -202,9 +202,6 @@ _mesa_unlock_context_textures( struct gl_context *ctx ); extern void _mesa_lock_context_textures( struct gl_context *ctx ); -extern struct gl_texture_object * -_mesa_create_nameless_texture(struct gl_context *ctx, GLenum target); - extern void _mesa_delete_nameless_texture(struct gl_context *ctx, struct gl_texture_object *texObj); diff --git a/src/mesa/main/texstate.c b/src/mesa/main/texstate.c index 9b5928c4306..cb147fac476 100644 --- a/src/mesa/main/texstate.c +++ 
b/src/mesa/main/texstate.c @@ -34,7 +34,6 @@ #include "context.h" #include "enums.h" #include "macros.h" -#include "shaderimage.h" #include "texobj.h" #include "teximage.h" #include "texstate.h" @@ -741,8 +740,6 @@ update_texture_state( struct gl_context *ctx ) if (!prog[MESA_SHADER_FRAGMENT] || !prog[MESA_SHADER_VERTEX]) update_texgen(ctx); - - _mesa_validate_image_units(ctx); } diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c index 5a3282a40c1..04b7d73da5c 100644 --- a/src/mesa/main/textureview.c +++ b/src/mesa/main/textureview.c @@ -681,6 +681,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture, texObj->Immutable = GL_TRUE; texObj->ImmutableLevels = origTexObj->ImmutableLevels; texObj->Target = target; + texObj->TargetIndex = _mesa_tex_target_to_index(ctx, target); + assert(texObj->TargetIndex < NUM_TEXTURE_TARGETS); if (ctx->Driver.TextureView != NULL && !ctx->Driver.TextureView(ctx, texObj, origTexObj)) { diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp index d48729778ae..083087d6baa 100644 --- a/src/mesa/main/uniform_query.cpp +++ b/src/mesa/main/uniform_query.cpp @@ -318,19 +318,12 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location, return; } - if ((uni->type->base_type == GLSL_TYPE_DOUBLE && - returnType != GLSL_TYPE_DOUBLE) || - (uni->type->base_type != GLSL_TYPE_DOUBLE && - returnType == GLSL_TYPE_DOUBLE)) { - _mesa_error( ctx, GL_INVALID_OPERATION, - "glGetnUniform*vARB(incompatible uniform types)"); - return; - } { unsigned elements = (uni->type->is_sampler()) ? 1 : uni->type->components(); const int dmul = uni->type->base_type == GLSL_TYPE_DOUBLE ? 2 : 1; + const int rmul = returnType == GLSL_TYPE_DOUBLE ? 2 : 1; /* Calculate the source base address *BEFORE* modifying elements to * account for the size of the user's buffer. 
@@ -342,7 +335,7 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location, returnType == GLSL_TYPE_UINT || returnType == GLSL_TYPE_DOUBLE); /* doubles have a different size than the other 3 types */ - unsigned bytes = sizeof(src[0]) * elements * dmul; + unsigned bytes = sizeof(src[0]) * elements * rmul; if (bufSize < 0 || bytes > (unsigned) bufSize) { _mesa_error( ctx, GL_INVALID_OPERATION, "glGetnUniform*vARB(out of bounds: bufSize is %d," @@ -366,32 +359,57 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location, } else { union gl_constant_value *const dst = (union gl_constant_value *) paramsOut; - /* This code could be optimized by putting the loop inside the switch * statements. However, this is not expected to be * performance-critical code. */ for (unsigned i = 0; i < elements; i++) { + int sidx = i * dmul; + int didx = i * rmul; + switch (returnType) { case GLSL_TYPE_FLOAT: switch (uni->type->base_type) { case GLSL_TYPE_UINT: - dst[i].f = (float) src[i].u; + dst[didx].f = (float) src[sidx].u; break; case GLSL_TYPE_INT: case GLSL_TYPE_SAMPLER: case GLSL_TYPE_IMAGE: - dst[i].f = (float) src[i].i; + dst[didx].f = (float) src[sidx].i; break; case GLSL_TYPE_BOOL: - dst[i].f = src[i].i ? 1.0f : 0.0f; + dst[didx].f = src[sidx].i ? 1.0f : 0.0f; + break; + case GLSL_TYPE_DOUBLE: + dst[didx].f = *(double *)&src[sidx].f; + break; + default: + assert(!"Should not get here."); + break; + } + break; + case GLSL_TYPE_DOUBLE: + switch (uni->type->base_type) { + case GLSL_TYPE_UINT: + *(double *)&dst[didx].f = (double) src[sidx].u; + break; + case GLSL_TYPE_INT: + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_IMAGE: + *(double *)&dst[didx].f = (double) src[sidx].i; + break; + case GLSL_TYPE_BOOL: + *(double *)&dst[didx].f = src[sidx].i ? 
1.0f : 0.0f; + break; + case GLSL_TYPE_FLOAT: + *(double *)&dst[didx].f = (double) src[sidx].f; break; default: assert(!"Should not get here."); break; } break; - case GLSL_TYPE_INT: case GLSL_TYPE_UINT: switch (uni->type->base_type) { @@ -413,10 +431,13 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location, * a floating-point value is rounded to the * nearest integer..." */ - dst[i].i = IROUND(src[i].f); + dst[didx].i = IROUND(src[sidx].f); break; case GLSL_TYPE_BOOL: - dst[i].i = src[i].i ? 1 : 0; + dst[didx].i = src[sidx].i ? 1 : 0; + break; + case GLSL_TYPE_DOUBLE: + dst[didx].i = *(double *)&src[sidx].f; break; default: assert(!"Should not get here."); diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c index 04cc81f9809..bc235380d97 100644 --- a/src/mesa/main/uniforms.c +++ b/src/mesa/main/uniforms.c @@ -1016,21 +1016,21 @@ _mesa_UniformBlockBinding(GLuint program, return; } - if (shProg->UniformBlocks[uniformBlockIndex].Binding != + if (shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding != uniformBlockBinding) { int i; FLUSH_VERTICES(ctx, 0); ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer; - shProg->UniformBlocks[uniformBlockIndex].Binding = uniformBlockBinding; + shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding = uniformBlockBinding; for (i = 0; i < MESA_SHADER_STAGES; i++) { int stage_index = shProg->UniformBlockStageIndex[i][uniformBlockIndex]; if (stage_index != -1) { struct gl_shader *sh = shProg->_LinkedShaders[i]; - sh->UniformBlocks[stage_index].Binding = uniformBlockBinding; + sh->BufferInterfaceBlocks[stage_index].Binding = uniformBlockBinding; } } } @@ -1069,21 +1069,21 @@ _mesa_ShaderStorageBlockBinding(GLuint program, return; } - if (shProg->UniformBlocks[shaderStorageBlockIndex].Binding != + if (shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding != shaderStorageBlockBinding) { int i; FLUSH_VERTICES(ctx, 0); ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer; - 
shProg->UniformBlocks[shaderStorageBlockIndex].Binding = shaderStorageBlockBinding; + shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding = shaderStorageBlockBinding; for (i = 0; i < MESA_SHADER_STAGES; i++) { int stage_index = shProg->UniformBlockStageIndex[i][shaderStorageBlockIndex]; if (stage_index != -1) { struct gl_shader *sh = shProg->_LinkedShaders[i]; - sh->UniformBlocks[stage_index].Binding = shaderStorageBlockBinding; + sh->BufferInterfaceBlocks[stage_index].Binding = shaderStorageBlockBinding; } } } diff --git a/src/mesa/main/uniforms.h b/src/mesa/main/uniforms.h index bec035cdc97..2f88b65043d 100644 --- a/src/mesa/main/uniforms.h +++ b/src/mesa/main/uniforms.h @@ -27,7 +27,7 @@ #define UNIFORMS_H #include "main/glheader.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir_uniform.h" #include "program/prog_parameter.h" diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c index 498b2f867d0..5635a643200 100644 --- a/src/mesa/main/version.c +++ b/src/mesa/main/version.c @@ -24,6 +24,7 @@ #include <stdio.h> +#include "context.h" #include "imports.h" #include "mtypes.h" #include "version.h" @@ -181,7 +182,23 @@ _mesa_override_gl_version(struct gl_context *ctx) { if (_mesa_override_gl_version_contextless(&ctx->Const, &ctx->API, &ctx->Version)) { - create_version_string(ctx, ""); + /* We need to include API in version string for OpenGL ES, otherwise + * application can not detect GLES via glGetString(GL_VERSION) query. + * + * From OpenGL ES 3.2 spec, Page 436: + * + * "The VERSION string is laid out as follows: + * + * OpenGL ES N.M vendor-specific information" + * + * From OpenGL 4.5 spec, Page 538: + * + * "The VERSION and SHADING_LANGUAGE_VERSION strings are laid out as + * follows: + * + * <version number><space><vendor-specific information>" + */ + create_version_string(ctx, _mesa_is_gles(ctx) ? 
"OpenGL ES " : ""); } } diff --git a/src/mesa/program/Android.mk b/src/mesa/program/Android.mk index ccb0fa5f32b..cc67f8aeadd 100644 --- a/src/mesa/program/Android.mk +++ b/src/mesa/program/Android.mk @@ -75,6 +75,7 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/mapi \ $(MESA_TOP)/src/mesa \ $(MESA_TOP)/src/glsl \ + $(MESA_TOP)/src/glsl/nir \ $(MESA_TOP)/src/gallium/auxiliary \ $(MESA_TOP)/src/gallium/include diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp index 0214b8e684c..1099d79d834 100644 --- a/src/mesa/program/ir_to_mesa.cpp +++ b/src/mesa/program/ir_to_mesa.cpp @@ -42,7 +42,7 @@ #include "glsl/ir_optimization.h" #include "glsl/ir_uniform.h" #include "glsl/glsl_parser_extras.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/linker.h" #include "glsl/program.h" #include "program/hash_table.h" diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c index fc00534028f..539e3c05312 100644 --- a/src/mesa/program/prog_to_nir.c +++ b/src/mesa/program/prog_to_nir.c @@ -923,7 +923,7 @@ ptn_add_output_stores(struct ptn_compile *c) { nir_builder *b = &c->build; - foreach_list_typed(nir_variable, var, node, &b->shader->outputs) { + nir_foreach_variable(var, &b->shader->outputs) { nir_intrinsic_instr *store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var); store->num_components = glsl_get_vector_elements(var->type); @@ -958,11 +958,10 @@ setup_registers_and_variables(struct ptn_compile *c) for (int i = 0; i < num_inputs; i++) { if (!(c->prog->InputsRead & BITFIELD64_BIT(i))) continue; - nir_variable *var = rzalloc(shader, nir_variable); - var->type = glsl_vec4_type(); - var->data.read_only = true; - var->data.mode = nir_var_shader_in; - var->name = ralloc_asprintf(var, "in_%d", i); + + nir_variable *var = + nir_variable_create(shader, nir_var_shader_in, glsl_vec4_type(), + ralloc_asprintf(shader, "in_%d", i)); var->data.location = i; var->data.index = 0; @@ -992,12 +991,9 @@ 
setup_registers_and_variables(struct ptn_compile *c) nir_ssa_def *f001 = nir_vec4(b, &load_x->dest.ssa, nir_imm_float(b, 0.0), nir_imm_float(b, 0.0), nir_imm_float(b, 1.0)); - nir_variable *fullvar = rzalloc(shader, nir_variable); - fullvar->type = glsl_vec4_type(); - fullvar->data.mode = nir_var_local; - fullvar->name = "fogcoord_tmp"; - exec_list_push_tail(&b->impl->locals, &fullvar->node); - + nir_variable *fullvar = + nir_local_variable_create(b->impl, glsl_vec4_type(), + "fogcoord_tmp"); nir_intrinsic_instr *store = nir_intrinsic_instr_create(shader, nir_intrinsic_store_var); store->num_components = 4; @@ -1005,17 +1001,15 @@ setup_registers_and_variables(struct ptn_compile *c) store->src[0] = nir_src_for_ssa(f001); nir_builder_instr_insert(b, &store->instr); - /* Insert the real input into the list so the driver has real - * inputs, but set c->input_vars[i] to the temporary so we use + /* We inserted the real input into the list so the driver has real + * inputs, but we set c->input_vars[i] to the temporary so we use * the splatted value. */ - exec_list_push_tail(&shader->inputs, &var->node); c->input_vars[i] = fullvar; continue; } } - exec_list_push_tail(&shader->inputs, &var->node); c->input_vars[i] = var; } @@ -1135,6 +1129,12 @@ prog_to_nir(const struct gl_program *prog, s->info.uses_clip_distance_out = false; s->info.separate_shader = false; + if (stage == MESA_SHADER_FRAGMENT) { + struct gl_fragment_program *fp = (struct gl_fragment_program *)prog; + + s->info.fs.uses_discard = fp->UsesKill; + } + fail: if (c->error) { ralloc_free(s); diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c index e94c1021258..0e78e6ab25d 100644 --- a/src/mesa/program/program.c +++ b/src/mesa/program/program.c @@ -173,57 +173,15 @@ _mesa_set_program_error(struct gl_context *ctx, GLint pos, const char *string) /** - * Find the line number and column for 'pos' within 'string'. - * Return a copy of the line which contains 'pos'. Free the line with - * free(). 
- * \param string the program string - * \param pos the position within the string - * \param line returns the line number corresponding to 'pos'. - * \param col returns the column number corresponding to 'pos'. - * \return copy of the line containing 'pos'. - */ -const GLubyte * -_mesa_find_line_column(const GLubyte *string, const GLubyte *pos, - GLint *line, GLint *col) -{ - const GLubyte *lineStart = string; - const GLubyte *p = string; - GLubyte *s; - int len; - - *line = 1; - - while (p != pos) { - if (*p == (GLubyte) '\n') { - (*line)++; - lineStart = p + 1; - } - p++; - } - - *col = (pos - lineStart) + 1; - - /* return copy of this line */ - while (*p != 0 && *p != '\n') - p++; - len = p - lineStart; - s = malloc(len + 1); - memcpy(s, lineStart, len); - s[len] = 0; - - return s; -} - - -/** * Initialize a new gl_program object. */ -static void -init_program_struct(struct gl_program *prog, GLenum target, GLuint id) +struct gl_program * +_mesa_init_gl_program(struct gl_program *prog, GLenum target, GLuint id) { GLuint i; - assert(prog); + if (!prog) + return NULL; memset(prog, 0, sizeof(*prog)); mtx_init(&prog->Mutex, mtx_plain); @@ -235,102 +193,8 @@ init_program_struct(struct gl_program *prog, GLenum target, GLuint id) /* default mapping from samplers to texture units */ for (i = 0; i < MAX_SAMPLERS; i++) prog->SamplerUnits[i] = i; -} - - -/** - * Initialize a new fragment program object. - */ -struct gl_program * -_mesa_init_fragment_program(struct gl_context *ctx, - struct gl_fragment_program *prog, - GLenum target, GLuint id) -{ - if (prog) { - init_program_struct(&prog->Base, target, id); - return &prog->Base; - } - return NULL; -} - - -/** - * Initialize a new vertex program object. 
- */ -struct gl_program * -_mesa_init_vertex_program(struct gl_context *ctx, - struct gl_vertex_program *prog, - GLenum target, GLuint id) -{ - if (prog) { - init_program_struct(&prog->Base, target, id); - return &prog->Base; - } - return NULL; -} - - -/** - * Initialize a new compute program object. - */ -struct gl_program * -_mesa_init_compute_program(struct gl_context *ctx, - struct gl_compute_program *prog, - GLenum target, GLuint id) -{ - if (prog) { - init_program_struct(&prog->Base, target, id); - return &prog->Base; - } - return NULL; -} - - -/** - * Initialize a new tessellation control program object. - */ -struct gl_program * -_mesa_init_tess_ctrl_program(struct gl_context *ctx, - struct gl_tess_ctrl_program *prog, - GLenum target, GLuint id) -{ - if (prog) { - init_program_struct(&prog->Base, target, id); - return &prog->Base; - } - return NULL; -} - -/** - * Initialize a new tessellation evaluation program object. - */ -struct gl_program * -_mesa_init_tess_eval_program(struct gl_context *ctx, - struct gl_tess_eval_program *prog, - GLenum target, GLuint id) -{ - if (prog) { - init_program_struct(&prog->Base, target, id); - return &prog->Base; - } - return NULL; -} - - -/** - * Initialize a new geometry program object. 
- */ -struct gl_program * -_mesa_init_geometry_program(struct gl_context *ctx, - struct gl_geometry_program *prog, - GLenum target, GLuint id) -{ - if (prog) { - init_program_struct(&prog->Base, target, id); - return &prog->Base; - } - return NULL; + return prog; } @@ -349,43 +213,36 @@ _mesa_init_geometry_program(struct gl_context *ctx, struct gl_program * _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id) { - struct gl_program *prog; switch (target) { - case GL_VERTEX_PROGRAM_ARB: /* == GL_VERTEX_PROGRAM_NV */ - prog = _mesa_init_vertex_program(ctx, CALLOC_STRUCT(gl_vertex_program), - target, id ); - break; + case GL_VERTEX_PROGRAM_ARB: { /* == GL_VERTEX_PROGRAM_NV */ + struct gl_vertex_program *prog = CALLOC_STRUCT(gl_vertex_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } case GL_FRAGMENT_PROGRAM_NV: - case GL_FRAGMENT_PROGRAM_ARB: - prog =_mesa_init_fragment_program(ctx, - CALLOC_STRUCT(gl_fragment_program), - target, id ); - break; - case GL_GEOMETRY_PROGRAM_NV: - prog = _mesa_init_geometry_program(ctx, - CALLOC_STRUCT(gl_geometry_program), - target, id); - break; - case GL_TESS_CONTROL_PROGRAM_NV: - prog = _mesa_init_tess_ctrl_program(ctx, - CALLOC_STRUCT(gl_tess_ctrl_program), - target, id); - break; - case GL_TESS_EVALUATION_PROGRAM_NV: - prog = _mesa_init_tess_eval_program(ctx, - CALLOC_STRUCT(gl_tess_eval_program), - target, id); - break; - case GL_COMPUTE_PROGRAM_NV: - prog = _mesa_init_compute_program(ctx, - CALLOC_STRUCT(gl_compute_program), - target, id); - break; + case GL_FRAGMENT_PROGRAM_ARB: { + struct gl_fragment_program *prog = CALLOC_STRUCT(gl_fragment_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } + case GL_GEOMETRY_PROGRAM_NV: { + struct gl_geometry_program *prog = CALLOC_STRUCT(gl_geometry_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } + case GL_TESS_CONTROL_PROGRAM_NV: { + struct gl_tess_ctrl_program *prog = CALLOC_STRUCT(gl_tess_ctrl_program); + return 
_mesa_init_gl_program(&prog->Base, target, id); + } + case GL_TESS_EVALUATION_PROGRAM_NV: { + struct gl_tess_eval_program *prog = CALLOC_STRUCT(gl_tess_eval_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } + case GL_COMPUTE_PROGRAM_NV: { + struct gl_compute_program *prog = CALLOC_STRUCT(gl_compute_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } default: _mesa_problem(ctx, "bad target in _mesa_new_program"); - prog = NULL; + return NULL; } - return prog; } @@ -494,123 +351,6 @@ _mesa_reference_program_(struct gl_context *ctx, /** - * Return a copy of a program. - * XXX Problem here if the program object is actually OO-derivation - * made by a device driver. - */ -struct gl_program * -_mesa_clone_program(struct gl_context *ctx, const struct gl_program *prog) -{ - struct gl_program *clone; - - clone = ctx->Driver.NewProgram(ctx, prog->Target, prog->Id); - if (!clone) - return NULL; - - assert(clone->Target == prog->Target); - assert(clone->RefCount == 1); - - clone->String = (GLubyte *) strdup((char *) prog->String); - clone->Format = prog->Format; - clone->Instructions = _mesa_alloc_instructions(prog->NumInstructions); - if (!clone->Instructions) { - _mesa_reference_program(ctx, &clone, NULL); - return NULL; - } - _mesa_copy_instructions(clone->Instructions, prog->Instructions, - prog->NumInstructions); - clone->InputsRead = prog->InputsRead; - clone->OutputsWritten = prog->OutputsWritten; - clone->SamplersUsed = prog->SamplersUsed; - clone->ShadowSamplers = prog->ShadowSamplers; - memcpy(clone->TexturesUsed, prog->TexturesUsed, sizeof(prog->TexturesUsed)); - - if (prog->Parameters) - clone->Parameters = _mesa_clone_parameter_list(prog->Parameters); - if (prog->LocalParams) { - clone->LocalParams = malloc(MAX_PROGRAM_LOCAL_PARAMS * - sizeof(float[4])); - if (!clone->LocalParams) { - _mesa_reference_program(ctx, &clone, NULL); - return NULL; - } - memcpy(clone->LocalParams, prog->LocalParams, - MAX_PROGRAM_LOCAL_PARAMS * 
sizeof(float[4])); - } - clone->IndirectRegisterFiles = prog->IndirectRegisterFiles; - clone->NumInstructions = prog->NumInstructions; - clone->NumTemporaries = prog->NumTemporaries; - clone->NumParameters = prog->NumParameters; - clone->NumAttributes = prog->NumAttributes; - clone->NumAddressRegs = prog->NumAddressRegs; - clone->NumNativeInstructions = prog->NumNativeInstructions; - clone->NumNativeTemporaries = prog->NumNativeTemporaries; - clone->NumNativeParameters = prog->NumNativeParameters; - clone->NumNativeAttributes = prog->NumNativeAttributes; - clone->NumNativeAddressRegs = prog->NumNativeAddressRegs; - clone->NumAluInstructions = prog->NumAluInstructions; - clone->NumTexInstructions = prog->NumTexInstructions; - clone->NumTexIndirections = prog->NumTexIndirections; - clone->NumNativeAluInstructions = prog->NumNativeAluInstructions; - clone->NumNativeTexInstructions = prog->NumNativeTexInstructions; - clone->NumNativeTexIndirections = prog->NumNativeTexIndirections; - - switch (prog->Target) { - case GL_VERTEX_PROGRAM_ARB: - { - const struct gl_vertex_program *vp = gl_vertex_program_const(prog); - struct gl_vertex_program *vpc = gl_vertex_program(clone); - vpc->IsPositionInvariant = vp->IsPositionInvariant; - } - break; - case GL_FRAGMENT_PROGRAM_ARB: - { - const struct gl_fragment_program *fp = gl_fragment_program_const(prog); - struct gl_fragment_program *fpc = gl_fragment_program(clone); - fpc->UsesKill = fp->UsesKill; - fpc->UsesDFdy = fp->UsesDFdy; - fpc->OriginUpperLeft = fp->OriginUpperLeft; - fpc->PixelCenterInteger = fp->PixelCenterInteger; - } - break; - case GL_GEOMETRY_PROGRAM_NV: - { - const struct gl_geometry_program *gp = gl_geometry_program_const(prog); - struct gl_geometry_program *gpc = gl_geometry_program(clone); - gpc->VerticesOut = gp->VerticesOut; - gpc->InputType = gp->InputType; - gpc->Invocations = gp->Invocations; - gpc->OutputType = gp->OutputType; - gpc->UsesEndPrimitive = gp->UsesEndPrimitive; - gpc->UsesStreams = 
gp->UsesStreams; - } - break; - case GL_TESS_CONTROL_PROGRAM_NV: - { - const struct gl_tess_ctrl_program *tcp = gl_tess_ctrl_program_const(prog); - struct gl_tess_ctrl_program *tcpc = gl_tess_ctrl_program(clone); - tcpc->VerticesOut = tcp->VerticesOut; - } - break; - case GL_TESS_EVALUATION_PROGRAM_NV: - { - const struct gl_tess_eval_program *tep = gl_tess_eval_program_const(prog); - struct gl_tess_eval_program *tepc = gl_tess_eval_program(clone); - tepc->PrimitiveMode = tep->PrimitiveMode; - tepc->Spacing = tep->Spacing; - tepc->VertexOrder = tep->VertexOrder; - tepc->PointMode = tep->PointMode; - } - break; - default: - _mesa_problem(NULL, "Unexpected target in _mesa_clone_program"); - } - - return clone; -} - - -/** * Insert 'count' NOP instructions at 'start' in the given program. * Adjust branch targets accordingly. */ @@ -707,190 +447,6 @@ _mesa_delete_instructions(struct gl_program *prog, GLuint start, GLuint count) /** - * Search instructions for registers that match (oldFile, oldIndex), - * replacing them with (newFile, newIndex). - */ -static void -replace_registers(struct prog_instruction *inst, GLuint numInst, - GLuint oldFile, GLuint oldIndex, - GLuint newFile, GLuint newIndex) -{ - GLuint i, j; - for (i = 0; i < numInst; i++) { - /* src regs */ - for (j = 0; j < _mesa_num_inst_src_regs(inst[i].Opcode); j++) { - if (inst[i].SrcReg[j].File == oldFile && - inst[i].SrcReg[j].Index == oldIndex) { - inst[i].SrcReg[j].File = newFile; - inst[i].SrcReg[j].Index = newIndex; - } - } - /* dst reg */ - if (inst[i].DstReg.File == oldFile && inst[i].DstReg.Index == oldIndex) { - inst[i].DstReg.File = newFile; - inst[i].DstReg.Index = newIndex; - } - } -} - - -/** - * Search instructions for references to program parameters. When found, - * increment the parameter index by 'offset'. - * Used when combining programs. 
- */ -static void -adjust_param_indexes(struct prog_instruction *inst, GLuint numInst, - GLuint offset) -{ - GLuint i, j; - for (i = 0; i < numInst; i++) { - for (j = 0; j < _mesa_num_inst_src_regs(inst[i].Opcode); j++) { - GLuint f = inst[i].SrcReg[j].File; - if (f == PROGRAM_CONSTANT || - f == PROGRAM_UNIFORM || - f == PROGRAM_STATE_VAR) { - inst[i].SrcReg[j].Index += offset; - } - } - } -} - - -/** - * Combine two programs into one. Fix instructions so the outputs of - * the first program go to the inputs of the second program. - */ -struct gl_program * -_mesa_combine_programs(struct gl_context *ctx, - const struct gl_program *progA, - const struct gl_program *progB) -{ - struct prog_instruction *newInst; - struct gl_program *newProg; - const GLuint lenA = progA->NumInstructions - 1; /* omit END instr */ - const GLuint lenB = progB->NumInstructions; - const GLuint numParamsA = _mesa_num_parameters(progA->Parameters); - const GLuint newLength = lenA + lenB; - GLboolean usedTemps[MAX_PROGRAM_TEMPS]; - GLuint firstTemp = 0; - GLbitfield64 inputsB; - GLuint i; - - assert(progA->Target == progB->Target); - - newInst = _mesa_alloc_instructions(newLength); - if (!newInst) - return GL_FALSE; - - _mesa_copy_instructions(newInst, progA->Instructions, lenA); - _mesa_copy_instructions(newInst + lenA, progB->Instructions, lenB); - - /* adjust branch / instruction addresses for B's instructions */ - for (i = 0; i < lenB; i++) { - newInst[lenA + i].BranchTarget += lenA; - } - - newProg = ctx->Driver.NewProgram(ctx, progA->Target, 0); - newProg->Instructions = newInst; - newProg->NumInstructions = newLength; - - /* find used temp regs (we may need new temps below) */ - _mesa_find_used_registers(newProg, PROGRAM_TEMPORARY, - usedTemps, MAX_PROGRAM_TEMPS); - - if (newProg->Target == GL_FRAGMENT_PROGRAM_ARB) { - const struct gl_fragment_program *fprogA, *fprogB; - struct gl_fragment_program *newFprog; - GLbitfield64 progB_inputsRead = progB->InputsRead; - GLint progB_colorFile, 
progB_colorIndex; - - fprogA = gl_fragment_program_const(progA); - fprogB = gl_fragment_program_const(progB); - newFprog = gl_fragment_program(newProg); - - newFprog->UsesKill = fprogA->UsesKill || fprogB->UsesKill; - newFprog->UsesDFdy = fprogA->UsesDFdy || fprogB->UsesDFdy; - - /* We'll do a search and replace for instances - * of progB_colorFile/progB_colorIndex below... - */ - progB_colorFile = PROGRAM_INPUT; - progB_colorIndex = VARYING_SLOT_COL0; - - /* - * The fragment program may get color from a state var rather than - * a fragment input (vertex output) if it's constant. - * See the texenvprogram.c code. - * So, search the program's parameter list now to see if the program - * gets color from a state var instead of a conventional fragment - * input register. - */ - for (i = 0; i < progB->Parameters->NumParameters; i++) { - struct gl_program_parameter *p = &progB->Parameters->Parameters[i]; - if (p->Type == PROGRAM_STATE_VAR && - p->StateIndexes[0] == STATE_INTERNAL && - p->StateIndexes[1] == STATE_CURRENT_ATTRIB && - (int) p->StateIndexes[2] == (int) VERT_ATTRIB_COLOR0) { - progB_inputsRead |= VARYING_BIT_COL0; - progB_colorFile = PROGRAM_STATE_VAR; - progB_colorIndex = i; - break; - } - } - - /* Connect color outputs of fprogA to color inputs of fprogB, via a - * new temporary register. 
- */ - if ((progA->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_COLOR)) && - (progB_inputsRead & VARYING_BIT_COL0)) { - GLint tempReg = _mesa_find_free_register(usedTemps, MAX_PROGRAM_TEMPS, - firstTemp); - if (tempReg < 0) { - _mesa_problem(ctx, "No free temp regs found in " - "_mesa_combine_programs(), using 31"); - tempReg = 31; - } - firstTemp = tempReg + 1; - - /* replace writes to result.color[0] with tempReg */ - replace_registers(newInst, lenA, - PROGRAM_OUTPUT, FRAG_RESULT_COLOR, - PROGRAM_TEMPORARY, tempReg); - /* replace reads from the input color with tempReg */ - replace_registers(newInst + lenA, lenB, - progB_colorFile, progB_colorIndex, /* search for */ - PROGRAM_TEMPORARY, tempReg /* replace with */ ); - } - - /* compute combined program's InputsRead */ - inputsB = progB_inputsRead; - if (progA->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_COLOR)) { - inputsB &= ~(1 << VARYING_SLOT_COL0); - } - newProg->InputsRead = progA->InputsRead | inputsB; - newProg->OutputsWritten = progB->OutputsWritten; - newProg->SamplersUsed = progA->SamplersUsed | progB->SamplersUsed; - } - else { - /* vertex program */ - assert(0); /* XXX todo */ - } - - /* - * Merge parameters (uniforms, constants, etc) - */ - newProg->Parameters = _mesa_combine_parameter_lists(progA->Parameters, - progB->Parameters); - - adjust_param_indexes(newInst + lenA, lenB, numParamsA); - - - return newProg; -} - - -/** * Populate the 'used' array with flags indicating which registers (TEMPs, * INPUTs, OUTPUTs, etc, are used by the given program. * \param file type of register to scan for @@ -952,140 +508,6 @@ _mesa_find_free_register(const GLboolean used[], } - -/** - * Check if the given register index is valid (doesn't exceed implementation- - * dependent limits). 
- * \return GL_TRUE if OK, GL_FALSE if bad index - */ -GLboolean -_mesa_valid_register_index(const struct gl_context *ctx, - gl_shader_stage shaderType, - gl_register_file file, GLint index) -{ - const struct gl_program_constants *c; - - assert(0 <= shaderType && shaderType < MESA_SHADER_STAGES); - c = &ctx->Const.Program[shaderType]; - - switch (file) { - case PROGRAM_UNDEFINED: - return GL_TRUE; /* XXX or maybe false? */ - - case PROGRAM_TEMPORARY: - return index >= 0 && index < (GLint) c->MaxTemps; - - case PROGRAM_UNIFORM: - case PROGRAM_STATE_VAR: - /* aka constant buffer */ - return index >= 0 && index < (GLint) c->MaxUniformComponents / 4; - - case PROGRAM_CONSTANT: - /* constant buffer w/ possible relative negative addressing */ - return (index > (int) c->MaxUniformComponents / -4 && - index < (int) c->MaxUniformComponents / 4); - - case PROGRAM_INPUT: - if (index < 0) - return GL_FALSE; - - switch (shaderType) { - case MESA_SHADER_VERTEX: - return index < VERT_ATTRIB_GENERIC0 + (GLint) c->MaxAttribs; - case MESA_SHADER_FRAGMENT: - return index < VARYING_SLOT_VAR0 + (GLint) ctx->Const.MaxVarying; - case MESA_SHADER_GEOMETRY: - return index < VARYING_SLOT_VAR0 + (GLint) ctx->Const.MaxVarying; - default: - return GL_FALSE; - } - - case PROGRAM_OUTPUT: - if (index < 0) - return GL_FALSE; - - switch (shaderType) { - case MESA_SHADER_VERTEX: - return index < VARYING_SLOT_VAR0 + (GLint) ctx->Const.MaxVarying; - case MESA_SHADER_FRAGMENT: - return index < FRAG_RESULT_DATA0 + (GLint) ctx->Const.MaxDrawBuffers; - case MESA_SHADER_GEOMETRY: - return index < VARYING_SLOT_VAR0 + (GLint) ctx->Const.MaxVarying; - default: - return GL_FALSE; - } - - case PROGRAM_ADDRESS: - return index >= 0 && index < (GLint) c->MaxAddressRegs; - - default: - _mesa_problem(ctx, - "unexpected register file in _mesa_valid_register_index()"); - return GL_FALSE; - } -} - - - -/** - * "Post-process" a GPU program. This is intended to be used for debugging. 
- * Example actions include no-op'ing instructions or changing instruction - * behaviour. - */ -void -_mesa_postprocess_program(struct gl_context *ctx, struct gl_program *prog) -{ - static const GLfloat white[4] = { 0.5, 0.5, 0.5, 0.5 }; - GLuint i; - GLuint whiteSwizzle; - GLint whiteIndex = _mesa_add_unnamed_constant(prog->Parameters, - (gl_constant_value *) white, - 4, &whiteSwizzle); - - (void) whiteIndex; - - for (i = 0; i < prog->NumInstructions; i++) { - struct prog_instruction *inst = prog->Instructions + i; - const GLuint n = _mesa_num_inst_src_regs(inst->Opcode); - - (void) n; - - if (_mesa_is_tex_instruction(inst->Opcode)) { -#if 0 - /* replace TEX/TXP/TXB with MOV */ - inst->Opcode = OPCODE_MOV; - inst->DstReg.WriteMask = WRITEMASK_XYZW; - inst->SrcReg[0].Swizzle = SWIZZLE_XYZW; - inst->SrcReg[0].Negate = NEGATE_NONE; -#endif - -#if 0 - /* disable shadow texture mode */ - inst->TexShadow = 0; -#endif - } - - if (inst->Opcode == OPCODE_TXP) { -#if 0 - inst->Opcode = OPCODE_MOV; - inst->DstReg.WriteMask = WRITEMASK_XYZW; - inst->SrcReg[0].File = PROGRAM_CONSTANT; - inst->SrcReg[0].Index = whiteIndex; - inst->SrcReg[0].Swizzle = SWIZZLE_XYZW; - inst->SrcReg[0].Negate = NEGATE_NONE; -#endif -#if 0 - inst->TexShadow = 0; -#endif -#if 0 - inst->Opcode = OPCODE_TEX; - inst->TexShadow = 0; -#endif - } - - } -} - /* Gets the minimum number of shader invocations per fragment. * This function is useful to determine if we need to do per * sample shading or per fragment shading. 
diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h index a894147cafd..24e05974dc3 100644 --- a/src/mesa/program/program.h +++ b/src/mesa/program/program.h @@ -63,40 +63,8 @@ _mesa_update_default_objects_program(struct gl_context *ctx); extern void _mesa_set_program_error(struct gl_context *ctx, GLint pos, const char *string); -extern const GLubyte * -_mesa_find_line_column(const GLubyte *string, const GLubyte *pos, - GLint *line, GLint *col); - - -extern struct gl_program * -_mesa_init_vertex_program(struct gl_context *ctx, - struct gl_vertex_program *prog, - GLenum target, GLuint id); - -extern struct gl_program * -_mesa_init_fragment_program(struct gl_context *ctx, - struct gl_fragment_program *prog, - GLenum target, GLuint id); - extern struct gl_program * -_mesa_init_tess_ctrl_program(struct gl_context *ctx, - struct gl_tess_ctrl_program *prog, - GLenum target, GLuint id); - -extern struct gl_program * -_mesa_init_tess_eval_program(struct gl_context *ctx, - struct gl_tess_eval_program *prog, - GLenum target, GLuint id); - -extern struct gl_program * -_mesa_init_geometry_program(struct gl_context *ctx, - struct gl_geometry_program *prog, - GLenum target, GLuint id); - -extern struct gl_program * -_mesa_init_compute_program(struct gl_context *ctx, - struct gl_compute_program *prog, - GLenum target, GLuint id); +_mesa_init_gl_program(struct gl_program *prog, GLenum target, GLuint id); extern struct gl_program * _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id); @@ -176,56 +144,12 @@ _mesa_reference_tesseprog(struct gl_context *ctx, (struct gl_program *) prog); } -extern struct gl_program * -_mesa_clone_program(struct gl_context *ctx, const struct gl_program *prog); - -static inline struct gl_vertex_program * -_mesa_clone_vertex_program(struct gl_context *ctx, - const struct gl_vertex_program *prog) -{ - return (struct gl_vertex_program *) _mesa_clone_program(ctx, &prog->Base); -} - -static inline struct gl_tess_ctrl_program * 
-_mesa_clone_tess_ctrl_program(struct gl_context *ctx, - const struct gl_tess_ctrl_program *prog) -{ - return (struct gl_tess_ctrl_program *) _mesa_clone_program(ctx, &prog->Base); -} - -static inline struct gl_tess_eval_program * -_mesa_clone_tess_eval_program(struct gl_context *ctx, - const struct gl_tess_eval_program *prog) -{ - return (struct gl_tess_eval_program *) _mesa_clone_program(ctx, &prog->Base); -} - -static inline struct gl_geometry_program * -_mesa_clone_geometry_program(struct gl_context *ctx, - const struct gl_geometry_program *prog) -{ - return (struct gl_geometry_program *) _mesa_clone_program(ctx, &prog->Base); -} - -static inline struct gl_fragment_program * -_mesa_clone_fragment_program(struct gl_context *ctx, - const struct gl_fragment_program *prog) -{ - return (struct gl_fragment_program *) _mesa_clone_program(ctx, &prog->Base); -} - - extern GLboolean _mesa_insert_instructions(struct gl_program *prog, GLuint start, GLuint count); extern GLboolean _mesa_delete_instructions(struct gl_program *prog, GLuint start, GLuint count); -extern struct gl_program * -_mesa_combine_programs(struct gl_context *ctx, - const struct gl_program *progA, - const struct gl_program *progB); - extern void _mesa_find_used_registers(const struct gl_program *prog, gl_register_file file, @@ -235,15 +159,6 @@ extern GLint _mesa_find_free_register(const GLboolean used[], GLuint maxRegs, GLuint firstReg); - -extern GLboolean -_mesa_valid_register_index(const struct gl_context *ctx, - gl_shader_stage shaderType, - gl_register_file file, GLint index); - -extern void -_mesa_postprocess_program(struct gl_context *ctx, struct gl_program *prog); - extern GLint _mesa_get_min_invocations_per_fragment(struct gl_context *ctx, const struct gl_fragment_program *prog, diff --git a/src/mesa/program/sampler.cpp b/src/mesa/program/sampler.cpp index 1198a3c45f1..84e2504baba 100644 --- a/src/mesa/program/sampler.cpp +++ b/src/mesa/program/sampler.cpp @@ -24,7 +24,7 @@ */ #include 
"main/mtypes.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir.h" #include "glsl/ir_uniform.h" #include "glsl/ir_visitor.h" diff --git a/src/mesa/state_tracker/st_atom_clip.c b/src/mesa/state_tracker/st_atom_clip.c index 506a770499f..b820d843385 100644 --- a/src/mesa/state_tracker/st_atom_clip.c +++ b/src/mesa/state_tracker/st_atom_clip.c @@ -56,6 +56,9 @@ static void update_clip( struct st_context *st ) use_eye = TRUE; } + /* _ClipUserPlane = _NEW_TRANSFORM | _NEW_PROJECTION + * EyeUserPlane = _NEW_TRANSFORM + */ memcpy(clip.ucp, use_eye ? ctx->Transform.EyeUserPlane : ctx->Transform._ClipUserPlane, sizeof(clip.ucp)); @@ -70,7 +73,7 @@ static void update_clip( struct st_context *st ) const struct st_tracked_state st_update_clip = { "st_update_clip", /* name */ { /* dirty */ - _NEW_TRANSFORM, /* mesa */ + _NEW_TRANSFORM | _NEW_PROJECTION, /* mesa */ ST_NEW_VERTEX_PROGRAM, /* st */ }, update_clip /* update */ diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c index 6affb4d84d5..acaa85d9356 100644 --- a/src/mesa/state_tracker/st_atom_constbuf.c +++ b/src/mesa/state_tracker/st_atom_constbuf.c @@ -238,7 +238,7 @@ static void st_bind_ubos(struct st_context *st, struct gl_uniform_buffer_binding *binding; struct st_buffer_object *st_obj; - binding = &st->ctx->UniformBufferBindings[shader->UniformBlocks[i].Binding]; + binding = &st->ctx->UniformBufferBindings[shader->UniformBlocks[i]->Binding]; st_obj = st_buffer_object(binding->BufferObject); cb.buffer = st_obj->buffer; diff --git a/src/mesa/state_tracker/st_atom_pixeltransfer.c b/src/mesa/state_tracker/st_atom_pixeltransfer.c index a04163cc137..f94c358afba 100644 --- a/src/mesa/state_tracker/st_atom_pixeltransfer.c +++ b/src/mesa/state_tracker/st_atom_pixeltransfer.c @@ -25,65 +25,17 @@ * **************************************************************************/ -/* - * Generate fragment programs to implement pixel transfer ops, such as - 
* scale/bias, colortable, convolution... - * - * Authors: +/* Authors: * Brian Paul */ -#include "main/imports.h" -#include "main/image.h" -#include "main/macros.h" -#include "program/program.h" -#include "program/prog_cache.h" -#include "program/prog_instruction.h" -#include "program/prog_parameter.h" -#include "program/prog_print.h" - #include "st_context.h" -#include "st_format.h" #include "st_texture.h" -#include "pipe/p_screen.h" -#include "pipe/p_context.h" #include "util/u_inlines.h" #include "util/u_pack_color.h" -struct state_key -{ - GLuint scaleAndBias:1; - GLuint pixelMaps:1; - -#if 0 - GLfloat Maps[3][256][4]; - int NumMaps; - GLint NumStages; - pipeline_stage Stages[STAGE_MAX]; - GLboolean StagesUsed[STAGE_MAX]; - GLfloat Scale1[4], Bias1[4]; - GLfloat Scale2[4], Bias2[4]; -#endif -}; - -static void -make_state_key(struct gl_context *ctx, struct state_key *key) -{ - memset(key, 0, sizeof(*key)); - - if (ctx->Pixel.RedBias != 0.0 || ctx->Pixel.RedScale != 1.0 || - ctx->Pixel.GreenBias != 0.0 || ctx->Pixel.GreenScale != 1.0 || - ctx->Pixel.BlueBias != 0.0 || ctx->Pixel.BlueScale != 1.0 || - ctx->Pixel.AlphaBias != 0.0 || ctx->Pixel.AlphaScale != 1.0) { - key->scaleAndBias = 1; - } - - key->pixelMaps = ctx->Pixel.MapColorFlag; -} - - /** * Update the pixelmap texture with the contents of the R/G/B/A pixel maps. */ @@ -128,74 +80,15 @@ load_color_map_texture(struct gl_context *ctx, struct pipe_resource *pt) pipe_transfer_unmap(pipe, transfer); } - - -#define MAX_INST 100 - /** - * Returns a fragment program which implements the current pixel transfer ops. + * Upload the pixel transfer color map texture. 
*/ -static struct gl_fragment_program * -get_pixel_transfer_program(struct gl_context *ctx, const struct state_key *key) +static void +update_pixel_transfer(struct st_context *st) { - struct st_context *st = st_context(ctx); - struct prog_instruction inst[MAX_INST]; - struct gl_program_parameter_list *params; - struct gl_fragment_program *fp; - GLuint ic = 0; - const GLuint colorTemp = 0; - - fp = (struct gl_fragment_program *) - ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0); - if (!fp) - return NULL; - - params = _mesa_new_parameter_list(); - - /* - * Get initial pixel color from the texture. - * TEX colorTemp, fragment.texcoord[0], texture[0], 2D; - */ - _mesa_init_instructions(inst + ic, 1); - inst[ic].Opcode = OPCODE_TEX; - inst[ic].DstReg.File = PROGRAM_TEMPORARY; - inst[ic].DstReg.Index = colorTemp; - inst[ic].SrcReg[0].File = PROGRAM_INPUT; - inst[ic].SrcReg[0].Index = VARYING_SLOT_TEX0; - inst[ic].TexSrcUnit = 0; - inst[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; - fp->Base.InputsRead = BITFIELD64_BIT(VARYING_SLOT_TEX0); - fp->Base.OutputsWritten = BITFIELD64_BIT(FRAG_RESULT_COLOR); - fp->Base.SamplersUsed = 0x1; /* sampler 0 (bit 0) is used */ - - if (key->scaleAndBias) { - static const gl_state_index scale_state[STATE_LENGTH] = - { STATE_INTERNAL, STATE_PT_SCALE, 0, 0, 0 }; - static const gl_state_index bias_state[STATE_LENGTH] = - { STATE_INTERNAL, STATE_PT_BIAS, 0, 0, 0 }; - GLint scale_p, bias_p; - - scale_p = _mesa_add_state_reference(params, scale_state); - bias_p = _mesa_add_state_reference(params, bias_state); - - /* MAD colorTemp, colorTemp, scale, bias; */ - _mesa_init_instructions(inst + ic, 1); - inst[ic].Opcode = OPCODE_MAD; - inst[ic].DstReg.File = PROGRAM_TEMPORARY; - inst[ic].DstReg.Index = colorTemp; - inst[ic].SrcReg[0].File = PROGRAM_TEMPORARY; - inst[ic].SrcReg[0].Index = colorTemp; - inst[ic].SrcReg[1].File = PROGRAM_STATE_VAR; - inst[ic].SrcReg[1].Index = scale_p; - inst[ic].SrcReg[2].File = PROGRAM_STATE_VAR; - 
inst[ic].SrcReg[2].Index = bias_p; - ic++; - } - - if (key->pixelMaps) { - const GLuint temp = 1; + struct gl_context *ctx = st->ctx; + if (ctx->Pixel.MapColorFlag) { /* create the colormap/texture now if not already done */ if (!st->pixel_xfer.pixelmap_texture) { st->pixel_xfer.pixelmap_texture = st_create_color_map_texture(ctx); @@ -203,117 +96,11 @@ get_pixel_transfer_program(struct gl_context *ctx, const struct state_key *key) st_create_texture_sampler_view(st->pipe, st->pixel_xfer.pixelmap_texture); } - - /* with a little effort, we can do four pixel map look-ups with - * two TEX instructions: - */ - - /* TEX temp.rg, colorTemp.rgba, texture[1], 2D; */ - _mesa_init_instructions(inst + ic, 1); - inst[ic].Opcode = OPCODE_TEX; - inst[ic].DstReg.File = PROGRAM_TEMPORARY; - inst[ic].DstReg.Index = temp; - inst[ic].DstReg.WriteMask = WRITEMASK_XY; /* write R,G */ - inst[ic].SrcReg[0].File = PROGRAM_TEMPORARY; - inst[ic].SrcReg[0].Index = colorTemp; - inst[ic].TexSrcUnit = 1; - inst[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; - - /* TEX temp.ba, colorTemp.baba, texture[1], 2D; */ - _mesa_init_instructions(inst + ic, 1); - inst[ic].Opcode = OPCODE_TEX; - inst[ic].DstReg.File = PROGRAM_TEMPORARY; - inst[ic].DstReg.Index = temp; - inst[ic].DstReg.WriteMask = WRITEMASK_ZW; /* write B,A */ - inst[ic].SrcReg[0].File = PROGRAM_TEMPORARY; - inst[ic].SrcReg[0].Index = colorTemp; - inst[ic].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W, - SWIZZLE_Z, SWIZZLE_W); - inst[ic].TexSrcUnit = 1; - inst[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; - - /* MOV colorTemp, temp; */ - _mesa_init_instructions(inst + ic, 1); - inst[ic].Opcode = OPCODE_MOV; - inst[ic].DstReg.File = PROGRAM_TEMPORARY; - inst[ic].DstReg.Index = colorTemp; - inst[ic].SrcReg[0].File = PROGRAM_TEMPORARY; - inst[ic].SrcReg[0].Index = temp; - ic++; - - fp->Base.SamplersUsed |= (1 << 1); /* sampler 1 is used */ - } - - /* Modify last instruction's dst reg to write to result.color */ - { - struct 
prog_instruction *last = &inst[ic - 1]; - last->DstReg.File = PROGRAM_OUTPUT; - last->DstReg.Index = FRAG_RESULT_COLOR; - } - - /* END; */ - _mesa_init_instructions(inst + ic, 1); - inst[ic].Opcode = OPCODE_END; - ic++; - - assert(ic <= MAX_INST); - - - fp->Base.Instructions = _mesa_alloc_instructions(ic); - if (!fp->Base.Instructions) { - _mesa_error(ctx, GL_OUT_OF_MEMORY, - "generating pixel transfer program"); - _mesa_free_parameter_list(params); - return NULL; - } - - _mesa_copy_instructions(fp->Base.Instructions, inst, ic); - fp->Base.NumInstructions = ic; - fp->Base.Parameters = params; - -#if 0 - printf("========= pixel transfer prog\n"); - _mesa_print_program(&fp->Base); - _mesa_print_parameter_list(fp->Base.Parameters); -#endif - - return fp; -} - - - -/** - * Update st->pixel_xfer.program in response to new pixel-transfer state. - */ -static void -update_pixel_transfer(struct st_context *st) -{ - struct gl_context *ctx = st->ctx; - struct state_key key; - struct gl_fragment_program *fp; - - make_state_key(st->ctx, &key); - - fp = (struct gl_fragment_program *) - _mesa_search_program_cache(st->pixel_xfer.cache, &key, sizeof(key)); - if (!fp) { - fp = get_pixel_transfer_program(st->ctx, &key); - _mesa_program_cache_insert(st->ctx, st->pixel_xfer.cache, - &key, sizeof(key), &fp->Base); - } - - if (ctx->Pixel.MapColorFlag) { load_color_map_texture(ctx, st->pixel_xfer.pixelmap_texture); } - st->pixel_xfer.pixelmap_enabled = ctx->Pixel.MapColorFlag; - - st->pixel_xfer.program = (struct st_fragment_program *) fp; } - const struct st_tracked_state st_update_pixel_transfer = { "st_update_pixel_transfer", /* name */ { /* dirty */ diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c index 230eba8c4a5..bb6dfe85644 100644 --- a/src/mesa/state_tracker/st_cb_bitmap.c +++ b/src/mesa/state_tracker/st_cb_bitmap.c @@ -108,151 +108,6 @@ struct bitmap_cache /** - * Make fragment program for glBitmap: - * Sample the texture and kill the 
fragment if the bit is 0. - * This program will be combined with the user's fragment program. - */ -static struct st_fragment_program * -make_bitmap_fragment_program(struct gl_context *ctx, GLuint samplerIndex) -{ - struct st_context *st = st_context(ctx); - struct st_fragment_program *stfp; - struct gl_program *p; - GLuint ic = 0; - - p = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0); - if (!p) - return NULL; - - p->NumInstructions = 3; - - p->Instructions = _mesa_alloc_instructions(p->NumInstructions); - if (!p->Instructions) { - ctx->Driver.DeleteProgram(ctx, p); - return NULL; - } - _mesa_init_instructions(p->Instructions, p->NumInstructions); - - /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */ - p->Instructions[ic].Opcode = OPCODE_TEX; - p->Instructions[ic].DstReg.File = PROGRAM_TEMPORARY; - p->Instructions[ic].DstReg.Index = 0; - p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT; - p->Instructions[ic].SrcReg[0].Index = VARYING_SLOT_TEX0; - p->Instructions[ic].TexSrcUnit = samplerIndex; - p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; - - /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */ - p->Instructions[ic].Opcode = OPCODE_KIL; - p->Instructions[ic].SrcReg[0].File = PROGRAM_TEMPORARY; - - if (st->bitmap.tex_format == PIPE_FORMAT_L8_UNORM) - p->Instructions[ic].SrcReg[0].Swizzle = SWIZZLE_XXXX; - - p->Instructions[ic].SrcReg[0].Index = 0; - p->Instructions[ic].SrcReg[0].Negate = NEGATE_XYZW; - ic++; - - /* END; */ - p->Instructions[ic++].Opcode = OPCODE_END; - - assert(ic == p->NumInstructions); - - p->InputsRead = VARYING_BIT_TEX0; - p->OutputsWritten = 0x0; - p->SamplersUsed = (1 << samplerIndex); - - stfp = (struct st_fragment_program *) p; - stfp->Base.UsesKill = GL_TRUE; - - return stfp; -} - - -static struct gl_program * -make_bitmap_fragment_program_glsl(struct st_context *st, - struct st_fragment_program *orig, - GLuint samplerIndex) -{ - struct gl_context *ctx = st->ctx; - struct st_fragment_program *fp = 
(struct st_fragment_program *) - ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0); - - if (!fp) - return NULL; - - get_bitmap_visitor(fp, orig->glsl_to_tgsi, samplerIndex); - return &fp->Base.Base; -} - - -static int -find_free_bit(uint bitfield) -{ - int i; - for (i = 0; i < 32; i++) { - if ((bitfield & (1 << i)) == 0) { - return i; - } - } - return -1; -} - - -/** - * Combine basic bitmap fragment program with the user-defined program. - * \param st current context - * \param fpIn the incoming fragment program - * \param fpOut the new fragment program which does fragment culling - * \param bitmap_sampler sampler number for the bitmap texture - */ -void -st_make_bitmap_fragment_program(struct st_context *st, - struct gl_fragment_program *fpIn, - struct gl_fragment_program **fpOut, - GLuint *bitmap_sampler) -{ - struct st_fragment_program *bitmap_prog; - struct st_fragment_program *stfpIn = (struct st_fragment_program *) fpIn; - struct gl_program *newProg; - uint sampler; - - /* - * Generate new program which is the user-defined program prefixed - * with the bitmap sampler/kill instructions. - */ - sampler = find_free_bit(fpIn->Base.SamplersUsed); - - if (stfpIn->glsl_to_tgsi) - newProg = make_bitmap_fragment_program_glsl(st, stfpIn, sampler); - else { - bitmap_prog = make_bitmap_fragment_program(st->ctx, sampler); - - newProg = _mesa_combine_programs(st->ctx, - &bitmap_prog->Base.Base, - &fpIn->Base); - /* done with this after combining */ - st_reference_fragprog(st, &bitmap_prog, NULL); - } - -#if 0 - { - printf("Combined bitmap program:\n"); - _mesa_print_program(newProg); - printf("InputsRead: 0x%x\n", newProg->InputsRead); - printf("OutputsWritten: 0x%x\n", newProg->OutputsWritten); - _mesa_print_parameter_list(newProg->Parameters); - } -#endif - - /* return results */ - *fpOut = (struct gl_fragment_program *) newProg; - *bitmap_sampler = sampler; -} - - -/** * Copy user-provide bitmap bits into texture buffer, expanding * bits into texels. 
* "On" bits will set texels to 0x0. diff --git a/src/mesa/state_tracker/st_cb_bitmap.h b/src/mesa/state_tracker/st_cb_bitmap.h index b4254ca1eeb..dc7e5cb5c9e 100644 --- a/src/mesa/state_tracker/st_cb_bitmap.h +++ b/src/mesa/state_tracker/st_cb_bitmap.h @@ -31,6 +31,7 @@ #include "main/compiler.h" +#include <stdbool.h> struct dd_function_table; struct st_context; @@ -47,13 +48,11 @@ extern void st_destroy_bitmap(struct st_context *st); extern void -st_make_bitmap_fragment_program(struct st_context *st, - struct gl_fragment_program *fpIn, - struct gl_fragment_program **fpOut, - GLuint *bitmap_sampler); - -extern void st_flush_bitmap_cache(struct st_context *st); +extern const struct tgsi_token * +st_get_bitmap_shader(const struct tgsi_token *tokens, + unsigned sampler_index, + bool use_texcoord, bool swizzle_xxxx); #endif /* ST_CB_BITMAP_H */ diff --git a/src/mesa/state_tracker/st_cb_bitmap_shader.c b/src/mesa/state_tracker/st_cb_bitmap_shader.c new file mode 100644 index 00000000000..cddea36d4f6 --- /dev/null +++ b/src/mesa/state_tracker/st_cb_bitmap_shader.c @@ -0,0 +1,174 @@ +/************************************************************************** + * + * Copyright (C) 2015 Advanced Micro Devices, Inc. + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "st_cb_bitmap.h" +#include "tgsi/tgsi_transform.h" +#include "tgsi/tgsi_scan.h" +#include "tgsi/tgsi_dump.h" +#include "util/u_debug.h" + +struct tgsi_bitmap_transform { + struct tgsi_transform_context base; + struct tgsi_shader_info info; + unsigned sampler_index; + bool use_texcoord; + bool swizzle_xxxx; + bool first_instruction_emitted; +}; + +static inline struct tgsi_bitmap_transform * +tgsi_bitmap_transform(struct tgsi_transform_context *tctx) +{ + return (struct tgsi_bitmap_transform *)tctx; +} + +static void +transform_instr(struct tgsi_transform_context *tctx, + struct tgsi_full_instruction *current_inst) +{ + struct tgsi_bitmap_transform *ctx = tgsi_bitmap_transform(tctx); + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + unsigned i, semantic; + int texcoord_index = -1; + + if (ctx->first_instruction_emitted) { + tctx->emit_instruction(tctx, current_inst); + return; + } + + ctx->first_instruction_emitted = true; + + /* Add TEMP[0] if it's missing. */ + if (ctx->info.file_max[TGSI_FILE_TEMPORARY] == -1) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_TEMPORARY; + tctx->emit_declaration(tctx, &decl); + } + + /* Add TEXCOORD[0] if it's missing. */ + semantic = ctx->use_texcoord ? 
TGSI_SEMANTIC_TEXCOORD : + TGSI_SEMANTIC_GENERIC; + for (i = 0; i < ctx->info.num_inputs; i++) { + if (ctx->info.input_semantic_name[i] == semantic && + ctx->info.input_semantic_index[i] == 0) { + texcoord_index = i; + break; + } + } + + if (texcoord_index == -1) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_INPUT; + decl.Declaration.Semantic = 1; + decl.Semantic.Name = semantic; + decl.Declaration.Interpolate = 1; + decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE; + decl.Range.First = decl.Range.Last = ctx->info.num_inputs; + texcoord_index = ctx->info.num_inputs; + tctx->emit_declaration(tctx, &decl); + } + + /* Declare the sampler. */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_SAMPLER; + decl.Range.First = decl.Range.Last = ctx->sampler_index; + tctx->emit_declaration(tctx, &decl); + + /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_TEX; + inst.Instruction.Texture = 1; + inst.Texture.Texture = TGSI_TEXTURE_2D; + + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + inst.Dst[0].Register.Index = 0; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + + inst.Instruction.NumSrcRegs = 2; + inst.Src[0].Register.File = TGSI_FILE_INPUT; + inst.Src[0].Register.Index = texcoord_index; + inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X; + inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_Y; + inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_Z; + inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_W; + inst.Src[1].Register.File = TGSI_FILE_SAMPLER; + inst.Src[1].Register.Index = ctx->sampler_index; + + tctx->emit_instruction(tctx, &inst); + + /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_KILL_IF; + inst.Instruction.NumDstRegs = 0; + inst.Instruction.NumSrcRegs = 1; + + inst.Src[0].Register.File = 
TGSI_FILE_TEMPORARY; + inst.Src[0].Register.Index = 0; + inst.Src[0].Register.Negate = 1; + inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X; + if (ctx->swizzle_xxxx) { + inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_X; + inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_X; + inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_X; + } else { + inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_Y; + inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_Z; + inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_W; + } + tctx->emit_instruction(tctx, &inst); + + /* And emit the instruction we got. */ + tctx->emit_instruction(tctx, current_inst); +} + +const struct tgsi_token * +st_get_bitmap_shader(const struct tgsi_token *tokens, + unsigned sampler_index, + bool use_texcoord, bool swizzle_xxxx) +{ + struct tgsi_bitmap_transform ctx; + struct tgsi_token *newtoks; + int newlen; + + memset(&ctx, 0, sizeof(ctx)); + ctx.base.transform_instruction = transform_instr; + ctx.sampler_index = sampler_index; + ctx.use_texcoord = use_texcoord; + ctx.swizzle_xxxx = swizzle_xxxx; + tgsi_scan_shader(tokens, &ctx.info); + + newlen = tgsi_num_tokens(tokens) + 20; + newtoks = tgsi_alloc_tokens(newlen); + if (!newtoks) + return NULL; + + tgsi_transform_shader(tokens, newtoks, newlen, &ctx.base); + return newtoks; +} diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c index 152160e1dd2..7e8633edc1a 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels.c +++ b/src/mesa/state_tracker/st_cb_drawpixels.c @@ -72,217 +72,74 @@ /** - * Check if the given program is: - * 0: MOVE result.color, fragment.color; - * 1: END; - */ -static GLboolean -is_passthrough_program(const struct gl_fragment_program *prog) -{ - if (prog->Base.NumInstructions == 2) { - const struct prog_instruction *inst = prog->Base.Instructions; - if (inst[0].Opcode == OPCODE_MOV && - inst[1].Opcode == OPCODE_END && - inst[0].DstReg.File == PROGRAM_OUTPUT && - inst[0].DstReg.Index == FRAG_RESULT_COLOR && - 
inst[0].DstReg.WriteMask == WRITEMASK_XYZW && - inst[0].SrcReg[0].File == PROGRAM_INPUT && - inst[0].SrcReg[0].Index == VARYING_SLOT_COL0 && - inst[0].SrcReg[0].Swizzle == SWIZZLE_XYZW) { - return GL_TRUE; - } - } - return GL_FALSE; -} - - -/** - * Returns a fragment program which implements the current pixel transfer ops. - */ -static struct gl_fragment_program * -get_glsl_pixel_transfer_program(struct st_context *st, - struct st_fragment_program *orig) -{ - int pixelMaps = 0, scaleAndBias = 0; - struct gl_context *ctx = st->ctx; - struct st_fragment_program *fp = (struct st_fragment_program *) - ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0); - - if (!fp) - return NULL; - - if (ctx->Pixel.RedBias != 0.0 || ctx->Pixel.RedScale != 1.0 || - ctx->Pixel.GreenBias != 0.0 || ctx->Pixel.GreenScale != 1.0 || - ctx->Pixel.BlueBias != 0.0 || ctx->Pixel.BlueScale != 1.0 || - ctx->Pixel.AlphaBias != 0.0 || ctx->Pixel.AlphaScale != 1.0) { - scaleAndBias = 1; - } - - pixelMaps = ctx->Pixel.MapColorFlag; - - if (pixelMaps) { - /* create the colormap/texture now if not already done */ - if (!st->pixel_xfer.pixelmap_texture) { - st->pixel_xfer.pixelmap_texture = st_create_color_map_texture(ctx); - st->pixel_xfer.pixelmap_sampler_view = - st_create_texture_sampler_view(st->pipe, - st->pixel_xfer.pixelmap_texture); - } - } - - get_pixel_transfer_visitor(fp, orig->glsl_to_tgsi, - scaleAndBias, pixelMaps); - - return &fp->Base; -} - - -/** - * Make fragment shader for glDraw/CopyPixels. This shader is made - * by combining the pixel transfer shader with the user-defined shader. 
- * \param fpIn the current/incoming fragment program - * \param fpOut returns the combined fragment program - */ -void -st_make_drawpix_fragment_program(struct st_context *st, - struct gl_fragment_program *fpIn, - struct gl_fragment_program **fpOut) -{ - struct gl_program *newProg; - struct st_fragment_program *stfp = (struct st_fragment_program *) fpIn; - - if (is_passthrough_program(fpIn)) { - newProg = (struct gl_program *) _mesa_clone_fragment_program(st->ctx, - &st->pixel_xfer.program->Base); - } - else if (stfp->glsl_to_tgsi != NULL) { - newProg = (struct gl_program *) get_glsl_pixel_transfer_program(st, stfp); - } - else { -#if 0 - /* debug */ - printf("Base program:\n"); - _mesa_print_program(&fpIn->Base); - printf("DrawPix program:\n"); - _mesa_print_program(&st->pixel_xfer.program->Base.Base); -#endif - newProg = _mesa_combine_programs(st->ctx, - &st->pixel_xfer.program->Base.Base, - &fpIn->Base); - } - -#if 0 - /* debug */ - printf("Combined DrawPixels program:\n"); - _mesa_print_program(newProg); - printf("InputsRead: 0x%x\n", newProg->InputsRead); - printf("OutputsWritten: 0x%x\n", newProg->OutputsWritten); - _mesa_print_parameter_list(newProg->Parameters); -#endif - - *fpOut = (struct gl_fragment_program *) newProg; -} - - -/** * Create fragment program that does a TEX() instruction to get a Z and/or * stencil value value, then writes to FRAG_RESULT_DEPTH/FRAG_RESULT_STENCIL. * Used for glDrawPixels(GL_DEPTH_COMPONENT / GL_STENCIL_INDEX). * Pass fragment color through as-is. - * \return pointer to the gl_fragment program + * + * \return CSO of the fragment shader. 
*/ -struct gl_fragment_program * -st_make_drawpix_z_stencil_program(struct st_context *st, - GLboolean write_depth, - GLboolean write_stencil) +static void * +get_drawpix_z_stencil_program(struct st_context *st, + GLboolean write_depth, + GLboolean write_stencil) { - struct gl_context *ctx = st->ctx; - struct gl_program *p; - struct gl_fragment_program *fp; - GLuint ic = 0; + struct ureg_program *ureg; + struct ureg_src depth_sampler, stencil_sampler; + struct ureg_src texcoord, color; + struct ureg_dst out_color, out_depth, out_stencil; const GLuint shaderIndex = write_depth * 2 + write_stencil; + void *cso; - assert(shaderIndex < ARRAY_SIZE(st->drawpix.shaders)); + assert(shaderIndex < ARRAY_SIZE(st->drawpix.zs_shaders)); - if (st->drawpix.shaders[shaderIndex]) { + if (st->drawpix.zs_shaders[shaderIndex]) { /* already have the proper shader */ - return st->drawpix.shaders[shaderIndex]; + return st->drawpix.zs_shaders[shaderIndex]; } - /* - * Create shader now - */ - p = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0); - if (!p) + ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT); + if (ureg == NULL) return NULL; - p->NumInstructions = write_depth ? 3 : 1; - p->NumInstructions += write_stencil ? 
1 : 0; - - p->Instructions = _mesa_alloc_instructions(p->NumInstructions); - if (!p->Instructions) { - ctx->Driver.DeleteProgram(ctx, p); - return NULL; - } - _mesa_init_instructions(p->Instructions, p->NumInstructions); + ureg_property(ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, TRUE); if (write_depth) { - /* TEX result.depth, fragment.texcoord[0], texture[0], 2D; */ - p->Instructions[ic].Opcode = OPCODE_TEX; - p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT; - p->Instructions[ic].DstReg.Index = FRAG_RESULT_DEPTH; - p->Instructions[ic].DstReg.WriteMask = WRITEMASK_Z; - p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT; - p->Instructions[ic].SrcReg[0].Index = VARYING_SLOT_TEX0; - p->Instructions[ic].TexSrcUnit = 0; - p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; - /* MOV result.color, fragment.color; */ - p->Instructions[ic].Opcode = OPCODE_MOV; - p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT; - p->Instructions[ic].DstReg.Index = FRAG_RESULT_COLOR; - p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT; - p->Instructions[ic].SrcReg[0].Index = VARYING_SLOT_COL0; - ic++; + color = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 0, + TGSI_INTERPOLATE_COLOR); + out_color = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0); + + depth_sampler = ureg_DECL_sampler(ureg, 0); + out_depth = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0); } if (write_stencil) { - /* TEX result.stencil, fragment.texcoord[0], texture[0], 2D; */ - p->Instructions[ic].Opcode = OPCODE_TEX; - p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT; - p->Instructions[ic].DstReg.Index = FRAG_RESULT_STENCIL; - p->Instructions[ic].DstReg.WriteMask = WRITEMASK_Y; - p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT; - p->Instructions[ic].SrcReg[0].Index = VARYING_SLOT_TEX0; - p->Instructions[ic].TexSrcUnit = 1; - p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; + stencil_sampler = ureg_DECL_sampler(ureg, 1); + out_stencil = ureg_DECL_output(ureg, TGSI_SEMANTIC_STENCIL, 0); } - /* 
END; */ - p->Instructions[ic++].Opcode = OPCODE_END; - - assert(ic == p->NumInstructions); + texcoord = ureg_DECL_fs_input(ureg, + st->needs_texcoord_semantic ? + TGSI_SEMANTIC_TEXCOORD : + TGSI_SEMANTIC_GENERIC, + 0, TGSI_INTERPOLATE_LINEAR); - p->InputsRead = VARYING_BIT_TEX0 | VARYING_BIT_COL0; - p->OutputsWritten = 0; if (write_depth) { - p->OutputsWritten |= BITFIELD64_BIT(FRAG_RESULT_DEPTH); - p->OutputsWritten |= BITFIELD64_BIT(FRAG_RESULT_COLOR); + ureg_TEX(ureg, ureg_writemask(out_depth, TGSI_WRITEMASK_Z), + TGSI_TEXTURE_2D, texcoord, depth_sampler); + ureg_MOV(ureg, out_color, color); } - if (write_stencil) - p->OutputsWritten |= BITFIELD64_BIT(FRAG_RESULT_STENCIL); - p->SamplersUsed = 0x1; /* sampler 0 (bit 0) is used */ if (write_stencil) - p->SamplersUsed |= 1 << 1; + ureg_TEX(ureg, ureg_writemask(out_stencil, TGSI_WRITEMASK_Y), + TGSI_TEXTURE_2D, texcoord, stencil_sampler); - fp = (struct gl_fragment_program *) p; + ureg_END(ureg); + cso = ureg_create_shader_and_destroy(ureg, st->pipe); /* save the new shader */ - st->drawpix.shaders[shaderIndex] = fp; - - return fp; + st->drawpix.zs_shaders[shaderIndex] = cso; + return cso; } @@ -668,6 +525,7 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, int num_sampler_view, void *driver_vp, void *driver_fp, + struct st_fp_variant *fpv, const GLfloat *color, GLboolean invertTex, GLboolean write_depth, GLboolean write_stencil) @@ -755,10 +613,9 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, cso_set_tesseval_shader_handle(cso, NULL); cso_set_geometry_shader_handle(cso, NULL); - /* texture sampling state: */ + /* user samplers, plus the drawpix samplers */ { struct pipe_sampler_state sampler; - const struct pipe_sampler_state *states[2] = {&sampler, &sampler}; memset(&sampler, 0, sizeof(sampler)); sampler.wrap_s = PIPE_TEX_WRAP_CLAMP; @@ -769,8 +626,25 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, sampler.mag_img_filter = 
PIPE_TEX_FILTER_NEAREST; sampler.normalized_coords = normalized; - cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, - num_sampler_view > 1 ? 2 : 1, states); + if (fpv) { + const struct pipe_sampler_state *samplers[PIPE_MAX_SAMPLERS]; + uint num = MAX2(MAX2(fpv->drawpix_sampler, fpv->pixelmap_sampler) + 1, + st->state.num_samplers[PIPE_SHADER_FRAGMENT]); + uint i; + + for (i = 0; i < st->state.num_samplers[PIPE_SHADER_FRAGMENT]; i++) + samplers[i] = &st->state.samplers[PIPE_SHADER_FRAGMENT][i]; + + samplers[fpv->drawpix_sampler] = &sampler; + if (sv[1]) + samplers[fpv->pixelmap_sampler] = &sampler; + + cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, num, samplers); + } else { + const struct pipe_sampler_state *samplers[2] = {&sampler, &sampler}; + + cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, num_sampler_view, samplers); + } } /* viewport state: viewport matching window dims */ @@ -790,8 +664,21 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, cso_set_vertex_elements(cso, 3, st->velems_util_draw); cso_set_stream_outputs(st->cso_context, 0, NULL, NULL); - /* texture state: */ - cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, num_sampler_view, sv); + /* user textures, plus the drawpix textures */ + if (fpv) { + struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS]; + uint num = MAX2(MAX2(fpv->drawpix_sampler, fpv->pixelmap_sampler) + 1, + st->state.num_sampler_views[PIPE_SHADER_FRAGMENT]); + + memcpy(sampler_views, st->state.sampler_views[PIPE_SHADER_FRAGMENT], + sizeof(sampler_views)); + + sampler_views[fpv->drawpix_sampler] = sv[0]; + if (sv[1]) + sampler_views[fpv->pixelmap_sampler] = sv[1]; + cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, num, sampler_views); + } else + cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, num_sampler_view, sv); /* Compute Gallium window coords (y=0=top) with pixel zoom. 
* Recall that these coords are transformed by the current @@ -1048,30 +935,6 @@ get_color_fp_variant(struct st_context *st) /** - * Get fragment program variant for a glDrawPixels or glCopyPixels - * command for depth/stencil data. - */ -static struct st_fp_variant * -get_depth_stencil_fp_variant(struct st_context *st, GLboolean write_depth, - GLboolean write_stencil) -{ - struct st_fp_variant_key key; - struct st_fp_variant *fpv; - - memset(&key, 0, sizeof(key)); - - key.st = st; - key.drawpixels = 1; - key.drawpixels_z = write_depth; - key.drawpixels_stencil = write_stencil; - - fpv = st_get_fp_variant(st, st->fp, &key); - - return fpv; -} - - -/** * Clamp glDrawPixels width and height to the maximum texture size. */ static void @@ -1109,8 +972,8 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y, GLboolean write_stencil = GL_FALSE, write_depth = GL_FALSE; struct pipe_sampler_view *sv[2] = { NULL }; int num_sampler_view = 1; - struct st_fp_variant *fpv; struct gl_pixelstore_attrib clippedUnpack; + struct st_fp_variant *fpv = NULL; /* Mesa state should be up to date by now */ assert(ctx->NewState == 0x0); @@ -1144,31 +1007,27 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y, * Get vertex/fragment shaders */ if (write_depth || write_stencil) { - fpv = get_depth_stencil_fp_variant(st, write_depth, write_stencil); - - driver_fp = fpv->driver_shader; - + driver_fp = get_drawpix_z_stencil_program(st, write_depth, + write_stencil); driver_vp = make_passthrough_vertex_shader(st, GL_TRUE); - color = ctx->Current.RasterColor; } else { fpv = get_color_fp_variant(st); driver_fp = fpv->driver_shader; - driver_vp = make_passthrough_vertex_shader(st, GL_FALSE); color = NULL; - if (st->pixel_xfer.pixelmap_enabled) { + if (ctx->Pixel.MapColorFlag) { pipe_sampler_view_reference(&sv[1], st->pixel_xfer.pixelmap_sampler_view); num_sampler_view++; } - } - /* update fragment program constants */ - st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT); + /* 
update fragment program constants */ + st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT); + } /* draw with textured quad */ { @@ -1197,7 +1056,7 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y, sv, num_sampler_view, driver_vp, - driver_fp, + driver_fp, fpv, color, GL_FALSE, write_depth, write_stencil); pipe_sampler_view_reference(&sv[0], NULL); if (num_sampler_view > 1) @@ -1452,6 +1311,7 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, void *driver_vp, *driver_fp; struct pipe_resource *pt; struct pipe_sampler_view *sv[2] = { NULL }; + struct st_fp_variant *fpv = NULL; int num_sampler_view = 1; GLfloat *color; enum pipe_format srcFormat; @@ -1459,7 +1319,6 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, GLboolean invertTex = GL_FALSE; GLint readX, readY, readW, readH; struct gl_pixelstore_attrib pack = ctx->DefaultPacking; - struct st_fp_variant *fpv; st_validate_state(st); @@ -1491,19 +1350,22 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, * Get vertex/fragment shaders */ if (type == GL_COLOR) { + fpv = get_color_fp_variant(st); + rbRead = st_get_color_read_renderbuffer(ctx); color = NULL; - fpv = get_color_fp_variant(st); driver_fp = fpv->driver_shader; - driver_vp = make_passthrough_vertex_shader(st, GL_FALSE); - if (st->pixel_xfer.pixelmap_enabled) { + if (ctx->Pixel.MapColorFlag) { pipe_sampler_view_reference(&sv[1], st->pixel_xfer.pixelmap_sampler_view); num_sampler_view++; } + + /* update fragment program constants */ + st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT); } else { assert(type == GL_DEPTH); @@ -1511,15 +1373,10 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, Attachment[BUFFER_DEPTH].Renderbuffer); color = ctx->Current.Attrib[VERT_ATTRIB_COLOR0]; - fpv = get_depth_stencil_fp_variant(st, GL_TRUE, GL_FALSE); - driver_fp = fpv->driver_shader; - + driver_fp = get_drawpix_z_stencil_program(st, GL_TRUE, GL_FALSE); driver_vp = 
make_passthrough_vertex_shader(st, GL_TRUE); } - /* update fragment program constants */ - st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT); - /* Choose the format for the temporary texture. */ srcFormat = rbRead->texture->format; srcBind = PIPE_BIND_SAMPLER_VIEW | @@ -1645,7 +1502,7 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, sv, num_sampler_view, driver_vp, - driver_fp, + driver_fp, fpv, color, invertTex, GL_FALSE, GL_FALSE); pipe_resource_reference(&pt, NULL); @@ -1666,12 +1523,12 @@ st_destroy_drawpix(struct st_context *st) { GLuint i; - for (i = 0; i < ARRAY_SIZE(st->drawpix.shaders); i++) { - if (st->drawpix.shaders[i]) - _mesa_reference_fragprog(st->ctx, &st->drawpix.shaders[i], NULL); + for (i = 0; i < ARRAY_SIZE(st->drawpix.zs_shaders); i++) { + if (st->drawpix.zs_shaders[i]) + cso_delete_fragment_shader(st->cso_context, + st->drawpix.zs_shaders[i]); } - st_reference_fragprog(st, &st->pixel_xfer.combined_prog, NULL); if (st->drawpix.vert_shaders[0]) cso_delete_vertex_shader(st->cso_context, st->drawpix.vert_shaders[0]); if (st->drawpix.vert_shaders[1]) diff --git a/src/mesa/state_tracker/st_cb_drawpixels.h b/src/mesa/state_tracker/st_cb_drawpixels.h index c707ace2f9f..f1fb32dd6cf 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels.h +++ b/src/mesa/state_tracker/st_cb_drawpixels.h @@ -31,6 +31,7 @@ #include "main/compiler.h" +#include <stdbool.h> struct dd_function_table; struct st_context; @@ -40,15 +41,11 @@ extern void st_init_drawpixels_functions(struct dd_function_table *functions); extern void st_destroy_drawpix(struct st_context *st); -extern void -st_make_drawpix_fragment_program(struct st_context *st, - struct gl_fragment_program *fpIn, - struct gl_fragment_program **fpOut); - -extern struct gl_fragment_program * -st_make_drawpix_z_stencil_program(struct st_context *st, - GLboolean write_depth, - GLboolean write_stencil); - +extern const struct tgsi_token * +st_get_drawpix_shader(const struct tgsi_token *tokens, 
bool use_texcoord, + bool scale_and_bias, unsigned scale_const, + unsigned bias_const, bool pixel_maps, + unsigned drawpix_sampler, unsigned pixelmap_sampler, + unsigned texcoord_const); #endif /* ST_CB_DRAWPIXELS_H */ diff --git a/src/mesa/state_tracker/st_cb_drawpixels_shader.c b/src/mesa/state_tracker/st_cb_drawpixels_shader.c new file mode 100644 index 00000000000..749b46cfbf7 --- /dev/null +++ b/src/mesa/state_tracker/st_cb_drawpixels_shader.c @@ -0,0 +1,278 @@ +/************************************************************************** + * + * Copyright (C) 2015 Advanced Micro Devices, Inc. + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "st_cb_drawpixels.h" +#include "tgsi/tgsi_transform.h" +#include "tgsi/tgsi_scan.h" + +struct tgsi_drawpix_transform { + struct tgsi_transform_context base; + struct tgsi_shader_info info; + bool use_texcoord; + bool scale_and_bias; + bool pixel_maps; + bool first_instruction_emitted; + unsigned scale_const; + unsigned bias_const; + unsigned color_temp; + unsigned drawpix_sampler; + unsigned pixelmap_sampler; + unsigned texcoord_const; +}; + +static inline struct tgsi_drawpix_transform * +tgsi_drawpix_transform(struct tgsi_transform_context *tctx) +{ + return (struct tgsi_drawpix_transform *)tctx; +} + +static void +set_src(struct tgsi_full_instruction *inst, unsigned i, unsigned file, unsigned index, + unsigned x, unsigned y, unsigned z, unsigned w) +{ + inst->Src[i].Register.File = file; + inst->Src[i].Register.Index = index; + inst->Src[i].Register.SwizzleX = x; + inst->Src[i].Register.SwizzleY = y; + inst->Src[i].Register.SwizzleZ = z; + inst->Src[i].Register.SwizzleW = w; +} + +#define SET_SRC(inst, i, file, index, x, y, z, w) \ + set_src(inst, i, file, index, TGSI_SWIZZLE_##x, TGSI_SWIZZLE_##y, \ + TGSI_SWIZZLE_##z, TGSI_SWIZZLE_##w) + +static void +transform_instr(struct tgsi_transform_context *tctx, + struct tgsi_full_instruction *current_inst) +{ + struct tgsi_drawpix_transform *ctx = tgsi_drawpix_transform(tctx); + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + unsigned i, sem_texcoord = ctx->use_texcoord ? TGSI_SEMANTIC_TEXCOORD : + TGSI_SEMANTIC_GENERIC; + int texcoord_index = -1; + + if (ctx->first_instruction_emitted) + goto transform_inst; + + ctx->first_instruction_emitted = true; + + /* Add scale and bias constants. 
*/ + if (ctx->scale_and_bias) { + if (ctx->info.const_file_max[0] < (int)ctx->scale_const) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_CONSTANT; + decl.Range.First = decl.Range.Last = ctx->scale_const; + tctx->emit_declaration(tctx, &decl); + } + + if (ctx->info.const_file_max[0] < (int)ctx->bias_const) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_CONSTANT; + decl.Range.First = decl.Range.Last = ctx->bias_const; + tctx->emit_declaration(tctx, &decl); + } + } + + if (ctx->info.const_file_max[0] < (int)ctx->texcoord_const) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_CONSTANT; + decl.Range.First = decl.Range.Last = ctx->texcoord_const; + tctx->emit_declaration(tctx, &decl); + } + + /* Add a new temp. */ + ctx->color_temp = ctx->info.file_max[TGSI_FILE_TEMPORARY] + 1; + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_TEMPORARY; + decl.Range.First = decl.Range.Last = ctx->color_temp; + tctx->emit_declaration(tctx, &decl); + + /* Add TEXCOORD[texcoord_slot] if it's missing. */ + for (i = 0; i < ctx->info.num_inputs; i++) { + if (ctx->info.input_semantic_name[i] == sem_texcoord && + ctx->info.input_semantic_index[i] == 0) { + texcoord_index = i; + break; + } + } + + if (texcoord_index == -1) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_INPUT; + decl.Declaration.Semantic = 1; + decl.Semantic.Name = sem_texcoord; + decl.Declaration.Interpolate = 1; + decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE; + decl.Range.First = decl.Range.Last = ctx->info.num_inputs; + texcoord_index = ctx->info.num_inputs; + tctx->emit_declaration(tctx, &decl); + } + + /* Declare the drawpix sampler if it's missing. 
*/ + if (!(ctx->info.samplers_declared & (1 << ctx->drawpix_sampler))) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_SAMPLER; + decl.Range.First = decl.Range.Last = ctx->drawpix_sampler; + tctx->emit_declaration(tctx, &decl); + } + + /* Declare the pixel map sampler if it's missing. */ + if (ctx->pixel_maps && + !(ctx->info.samplers_declared & (1 << ctx->pixelmap_sampler))) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_SAMPLER; + decl.Range.First = decl.Range.Last = ctx->pixelmap_sampler; + tctx->emit_declaration(tctx, &decl); + } + + /* Get initial pixel color from the texture. + * TEX temp, fragment.texcoord[0], texture[0], 2D; + */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_TEX; + inst.Instruction.Texture = 1; + inst.Texture.Texture = TGSI_TEXTURE_2D; + + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + inst.Dst[0].Register.Index = ctx->color_temp; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + + inst.Instruction.NumSrcRegs = 2; + SET_SRC(&inst, 0, TGSI_FILE_INPUT, texcoord_index, X, Y, Z, W); + inst.Src[1].Register.File = TGSI_FILE_SAMPLER; + inst.Src[1].Register.Index = ctx->drawpix_sampler; + + tctx->emit_instruction(tctx, &inst); + + /* Apply the scale and bias. 
*/ + if (ctx->scale_and_bias) { + /* MAD temp, temp, scale, bias; */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_MAD; + + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + inst.Dst[0].Register.Index = ctx->color_temp; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + + inst.Instruction.NumSrcRegs = 3; + SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->color_temp, X, Y, Z, W); + SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, ctx->scale_const, X, Y, Z, W); + SET_SRC(&inst, 2, TGSI_FILE_CONSTANT, ctx->bias_const, X, Y, Z, W); + + tctx->emit_instruction(tctx, &inst); + } + + if (ctx->pixel_maps) { + /* do four pixel map look-ups with two TEX instructions: */ + + /* TEX temp.xy, temp.xyyy, texture[1], 2D; */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_TEX; + inst.Instruction.Texture = 1; + inst.Texture.Texture = TGSI_TEXTURE_2D; + + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + inst.Dst[0].Register.Index = ctx->color_temp; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XY; + + inst.Instruction.NumSrcRegs = 2; + SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->color_temp, X, Y, Y, Y); + inst.Src[1].Register.File = TGSI_FILE_SAMPLER; + inst.Src[1].Register.Index = ctx->pixelmap_sampler; + + tctx->emit_instruction(tctx, &inst); + + /* TEX temp.zw, temp.zwww, texture[1], 2D; */ + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_ZW; + SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->color_temp, Z, W, W, W); + tctx->emit_instruction(tctx, &inst); + } + + /* Now, "color_temp" should be used in place of IN:COLOR0, + * and CONST[texcoord_slot] should be used in place of IN:TEXCOORD0. 
+ */ + +transform_inst: + + for (i = 0; i < current_inst->Instruction.NumSrcRegs; i++) { + struct tgsi_full_src_register *src = &current_inst->Src[i]; + unsigned reg = src->Register.Index; + + if (src->Register.File != TGSI_FILE_INPUT || src->Register.Indirect) + continue; + + if (ctx->info.input_semantic_name[reg] == TGSI_SEMANTIC_COLOR && + ctx->info.input_semantic_index[reg] == 0) { + src->Register.File = TGSI_FILE_TEMPORARY; + src->Register.Index = ctx->color_temp; + } else if (ctx->info.input_semantic_name[reg] == sem_texcoord && + ctx->info.input_semantic_index[reg] == 0) { + src->Register.File = TGSI_FILE_CONSTANT; + src->Register.Index = ctx->texcoord_const; + } + } + + tctx->emit_instruction(tctx, current_inst); +} + +const struct tgsi_token * +st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord, + bool scale_and_bias, unsigned scale_const, + unsigned bias_const, bool pixel_maps, + unsigned drawpix_sampler, unsigned pixelmap_sampler, + unsigned texcoord_const) +{ + struct tgsi_drawpix_transform ctx; + struct tgsi_token *newtoks; + int newlen; + + memset(&ctx, 0, sizeof(ctx)); + ctx.base.transform_instruction = transform_instr; + ctx.use_texcoord = use_texcoord; + ctx.scale_and_bias = scale_and_bias; + ctx.scale_const = scale_const; + ctx.bias_const = bias_const; + ctx.pixel_maps = pixel_maps; + ctx.drawpix_sampler = drawpix_sampler; + ctx.pixelmap_sampler = pixelmap_sampler; + ctx.texcoord_const = texcoord_const; + tgsi_scan_shader(tokens, &ctx.info); + + newlen = tgsi_num_tokens(tokens) + 30; + newtoks = tgsi_alloc_tokens(newlen); + if (!newtoks) + return NULL; + + tgsi_transform_shader(tokens, newtoks, newlen, &ctx.base); + return newtoks; +} diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c index ff703fa41cb..2a2eb0992c8 100644 --- a/src/mesa/state_tracker/st_cb_fbo.c +++ b/src/mesa/state_tracker/st_cb_fbo.c @@ -456,7 +456,7 @@ st_update_renderbuffer_surface(struct st_context *st,
surf_tmpl.u.tex.first_layer = first_layer; surf_tmpl.u.tex.last_layer = last_layer; - pipe_surface_reference(&strb->surface, NULL); + pipe_surface_release(pipe, &strb->surface); strb->surface = pipe->create_surface(pipe, resource, &surf_tmpl); } diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c index 3029909d12d..708bdf5011e 100644 --- a/src/mesa/state_tracker/st_cb_program.c +++ b/src/mesa/state_tracker/st_cb_program.c @@ -105,29 +105,24 @@ st_new_program(struct gl_context *ctx, GLenum target, GLuint id) switch (target) { case GL_VERTEX_PROGRAM_ARB: { struct st_vertex_program *prog = ST_CALLOC_STRUCT(st_vertex_program); - return _mesa_init_vertex_program(ctx, &prog->Base, target, id); + return _mesa_init_gl_program(&prog->Base.Base, target, id); } - case GL_FRAGMENT_PROGRAM_ARB: { struct st_fragment_program *prog = ST_CALLOC_STRUCT(st_fragment_program); - return _mesa_init_fragment_program(ctx, &prog->Base, target, id); + return _mesa_init_gl_program(&prog->Base.Base, target, id); } - case GL_GEOMETRY_PROGRAM_NV: { struct st_geometry_program *prog = ST_CALLOC_STRUCT(st_geometry_program); - return _mesa_init_geometry_program(ctx, &prog->Base, target, id); + return _mesa_init_gl_program(&prog->Base.Base, target, id); } - case GL_TESS_CONTROL_PROGRAM_NV: { struct st_tessctrl_program *prog = ST_CALLOC_STRUCT(st_tessctrl_program); - return _mesa_init_tess_ctrl_program(ctx, &prog->Base, target, id); + return _mesa_init_gl_program(&prog->Base.Base, target, id); } - case GL_TESS_EVALUATION_PROGRAM_NV: { struct st_tesseval_program *prog = ST_CALLOC_STRUCT(st_tesseval_program); - return _mesa_init_tess_eval_program(ctx, &prog->Base, target, id); + return _mesa_init_gl_program(&prog->Base.Base, target, id); } - default: assert(0); return NULL; @@ -234,6 +229,8 @@ st_program_string_notify( struct gl_context *ctx, struct st_fragment_program *stfp = (struct st_fragment_program *) prog; st_release_fp_variants(st, stfp); + if 
(!st_translate_fragment_program(st, stfp)) + return false; if (st->fp == stfp) st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM; @@ -242,6 +239,8 @@ st_program_string_notify( struct gl_context *ctx, struct st_geometry_program *stgp = (struct st_geometry_program *) prog; st_release_gp_variants(st, stgp); + if (!st_translate_geometry_program(st, stgp)) + return false; if (st->gp == stgp) st->dirty.st |= ST_NEW_GEOMETRY_PROGRAM; @@ -249,7 +248,9 @@ st_program_string_notify( struct gl_context *ctx, else if (target == GL_VERTEX_PROGRAM_ARB) { struct st_vertex_program *stvp = (struct st_vertex_program *) prog; - st_release_vp_variants( st, stvp ); + st_release_vp_variants(st, stvp); + if (!st_translate_vertex_program(st, stvp)) + return false; if (st->vp == stvp) st->dirty.st |= ST_NEW_VERTEX_PROGRAM; @@ -259,6 +260,8 @@ st_program_string_notify( struct gl_context *ctx, (struct st_tessctrl_program *) prog; st_release_tcp_variants(st, sttcp); + if (!st_translate_tessctrl_program(st, sttcp)) + return false; if (st->tcp == sttcp) st->dirty.st |= ST_NEW_TESSCTRL_PROGRAM; @@ -268,6 +271,8 @@ st_program_string_notify( struct gl_context *ctx, (struct st_tesseval_program *) prog; st_release_tep_variants(st, sttep); + if (!st_translate_tesseval_program(st, sttep)) + return false; if (st->tep == sttep) st->dirty.st |= ST_NEW_TESSEVAL_PROGRAM; diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c index a9ab5edcf49..bef7307bb27 100644 --- a/src/mesa/state_tracker/st_context.c +++ b/src/mesa/state_tracker/st_context.c @@ -224,8 +224,6 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe, st->ctx->VertexProgram._MaintainTnlProgram = GL_TRUE; - st->pixel_xfer.cache = _mesa_new_program_cache(); - st->has_stencil_export = screen->get_param(screen, PIPE_CAP_SHADER_STENCIL_EXPORT); st->has_shader_model3 = screen->get_param(screen, PIPE_CAP_SM3); @@ -386,8 +384,8 @@ void st_destroy_context( struct st_context *st ) 
pipe_surface_reference(&st->state.framebuffer.cbufs[i], NULL); } pipe_surface_reference(&st->state.framebuffer.zsbuf, NULL); - - _mesa_delete_program_cache(st->ctx, st->pixel_xfer.cache); + pipe_sampler_view_reference(&st->pixel_xfer.pixelmap_sampler_view, NULL); + pipe_resource_reference(&st->pixel_xfer.pixelmap_texture, NULL); _vbo_DestroyContext(st->ctx); diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h index a4cda29059d..f187d82449b 100644 --- a/src/mesa/state_tracker/st_context.h +++ b/src/mesa/state_tracker/st_context.h @@ -162,15 +162,8 @@ struct st_context struct gl_texture_object *default_texture; struct { - struct gl_program_cache *cache; - struct st_fragment_program *program; /**< cur pixel transfer prog */ - GLuint xfer_prog_sn; /**< pixel xfer program serial no. */ - GLuint user_prog_sn; /**< user fragment program serial no. */ - struct st_fragment_program *combined_prog; - GLuint combined_prog_sn; struct pipe_resource *pixelmap_texture; struct pipe_sampler_view *pixelmap_sampler_view; - boolean pixelmap_enabled; /**< use the pixelmap texture? 
*/ } pixel_xfer; /** for glBitmap */ @@ -184,7 +177,7 @@ struct st_context /** for glDraw/CopyPixels */ struct { - struct gl_fragment_program *shaders[4]; + void *zs_shaders[4]; void *vert_shaders[2]; /**< ureg shaders */ } drawpix; diff --git a/src/mesa/state_tracker/st_debug.c b/src/mesa/state_tracker/st_debug.c index 50891c112cb..6d859c6ab5b 100644 --- a/src/mesa/state_tracker/st_debug.c +++ b/src/mesa/state_tracker/st_debug.c @@ -98,7 +98,7 @@ st_print_current(void) if (st->vp->Base.Base.Parameters) _mesa_print_parameter_list(st->vp->Base.Base.Parameters); - tgsi_dump( st->fp->variants[0].tgsi.tokens, 0 ); + tgsi_dump(st->fp->tgsi.tokens, 0); if (st->fp->Base.Base.Parameters) _mesa_print_parameter_list(st->fp->Base.Base.Parameters); } diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index 633e90ffa38..f481e8902d8 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -4334,216 +4334,6 @@ glsl_to_tgsi_visitor::renumber_registers(void) ralloc_free(first_reads); } -/** - * Returns a fragment program which implements the current pixel transfer ops. - * Based on get_pixel_transfer_program in st_atom_pixeltransfer.c. - */ -extern "C" void -get_pixel_transfer_visitor(struct st_fragment_program *fp, - glsl_to_tgsi_visitor *original, - int scale_and_bias, int pixel_maps) -{ - glsl_to_tgsi_visitor *v = new glsl_to_tgsi_visitor(); - struct st_context *st = st_context(original->ctx); - struct gl_program *prog = &fp->Base.Base; - struct gl_program_parameter_list *params = _mesa_new_parameter_list(); - st_src_reg coord, src0; - st_dst_reg dst0; - glsl_to_tgsi_instruction *inst; - - /* Copy attributes of the glsl_to_tgsi_visitor in the original shader. 
*/ - v->ctx = original->ctx; - v->prog = prog; - v->shader_program = NULL; - v->shader = NULL; - v->glsl_version = original->glsl_version; - v->native_integers = original->native_integers; - v->options = original->options; - v->next_temp = original->next_temp; - v->num_address_regs = original->num_address_regs; - v->samplers_used = prog->SamplersUsed = original->samplers_used; - v->indirect_addr_consts = original->indirect_addr_consts; - memcpy(&v->immediates, &original->immediates, sizeof(v->immediates)); - v->num_immediates = original->num_immediates; - - /* - * Get initial pixel color from the texture. - * TEX colorTemp, fragment.texcoord[0], texture[0], 2D; - */ - coord = st_src_reg(PROGRAM_INPUT, VARYING_SLOT_TEX0, glsl_type::vec2_type); - src0 = v->get_temp(glsl_type::vec4_type); - dst0 = st_dst_reg(src0); - inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, dst0, coord); - inst->sampler_array_size = 1; - inst->tex_target = TEXTURE_2D_INDEX; - - prog->InputsRead |= VARYING_BIT_TEX0; - prog->SamplersUsed |= (1 << 0); /* mark sampler 0 as used */ - v->samplers_used |= (1 << 0); - - if (scale_and_bias) { - static const gl_state_index scale_state[STATE_LENGTH] = - { STATE_INTERNAL, STATE_PT_SCALE, - (gl_state_index) 0, (gl_state_index) 0, (gl_state_index) 0 }; - static const gl_state_index bias_state[STATE_LENGTH] = - { STATE_INTERNAL, STATE_PT_BIAS, - (gl_state_index) 0, (gl_state_index) 0, (gl_state_index) 0 }; - GLint scale_p, bias_p; - st_src_reg scale, bias; - - scale_p = _mesa_add_state_reference(params, scale_state); - bias_p = _mesa_add_state_reference(params, bias_state); - - /* MAD colorTemp, colorTemp, scale, bias; */ - scale = st_src_reg(PROGRAM_STATE_VAR, scale_p, GLSL_TYPE_FLOAT); - bias = st_src_reg(PROGRAM_STATE_VAR, bias_p, GLSL_TYPE_FLOAT); - inst = v->emit_asm(NULL, TGSI_OPCODE_MAD, dst0, src0, scale, bias); - } - - if (pixel_maps) { - st_src_reg temp = v->get_temp(glsl_type::vec4_type); - st_dst_reg temp_dst = st_dst_reg(temp); - - 
assert(st->pixel_xfer.pixelmap_texture); - (void) st; - - /* With a little effort, we can do four pixel map look-ups with - * two TEX instructions: - */ - - /* TEX temp.rg, colorTemp.rgba, texture[1], 2D; */ - temp_dst.writemask = WRITEMASK_XY; /* write R,G */ - inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, temp_dst, src0); - inst->sampler.index = 1; - inst->sampler_array_size = 1; - inst->tex_target = TEXTURE_2D_INDEX; - - /* TEX temp.ba, colorTemp.baba, texture[1], 2D; */ - src0.swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W); - temp_dst.writemask = WRITEMASK_ZW; /* write B,A */ - inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, temp_dst, src0); - inst->sampler.index = 1; - inst->sampler_array_size = 1; - inst->tex_target = TEXTURE_2D_INDEX; - - prog->SamplersUsed |= (1 << 1); /* mark sampler 1 as used */ - v->samplers_used |= (1 << 1); - - /* MOV colorTemp, temp; */ - inst = v->emit_asm(NULL, TGSI_OPCODE_MOV, dst0, temp); - } - - /* Now copy the instructions from the original glsl_to_tgsi_visitor into the - * new visitor. */ - foreach_in_list(glsl_to_tgsi_instruction, inst, &original->instructions) { - glsl_to_tgsi_instruction *newinst; - st_src_reg src_regs[4]; - - if (inst->dst[0].file == PROGRAM_OUTPUT) - prog->OutputsWritten |= BITFIELD64_BIT(inst->dst[0].index); - - for (int i = 0; i < 4; i++) { - src_regs[i] = inst->src[i]; - if (src_regs[i].file == PROGRAM_INPUT && - src_regs[i].index == VARYING_SLOT_COL0) { - src_regs[i].file = PROGRAM_TEMPORARY; - src_regs[i].index = src0.index; - } - else if (src_regs[i].file == PROGRAM_INPUT) - prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index); - } - - newinst = v->emit_asm(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2], src_regs[3]); - newinst->tex_target = inst->tex_target; - newinst->sampler_array_size = inst->sampler_array_size; - } - - /* Make modifications to fragment program info. 
*/ - prog->Parameters = _mesa_combine_parameter_lists(params, - original->prog->Parameters); - _mesa_free_parameter_list(params); - count_resources(v, prog); - fp->glsl_to_tgsi = v; -} - -/** - * Make fragment program for glBitmap: - * Sample the texture and kill the fragment if the bit is 0. - * This program will be combined with the user's fragment program. - * - * Based on make_bitmap_fragment_program in st_cb_bitmap.c. - */ -extern "C" void -get_bitmap_visitor(struct st_fragment_program *fp, - glsl_to_tgsi_visitor *original, int samplerIndex) -{ - glsl_to_tgsi_visitor *v = new glsl_to_tgsi_visitor(); - struct st_context *st = st_context(original->ctx); - struct gl_program *prog = &fp->Base.Base; - st_src_reg coord, src0; - st_dst_reg dst0; - glsl_to_tgsi_instruction *inst; - - /* Copy attributes of the glsl_to_tgsi_visitor in the original shader. */ - v->ctx = original->ctx; - v->prog = prog; - v->shader_program = NULL; - v->shader = NULL; - v->glsl_version = original->glsl_version; - v->native_integers = original->native_integers; - v->options = original->options; - v->next_temp = original->next_temp; - v->num_address_regs = original->num_address_regs; - v->samplers_used = prog->SamplersUsed = original->samplers_used; - v->indirect_addr_consts = original->indirect_addr_consts; - memcpy(&v->immediates, &original->immediates, sizeof(v->immediates)); - v->num_immediates = original->num_immediates; - - /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */ - coord = st_src_reg(PROGRAM_INPUT, VARYING_SLOT_TEX0, glsl_type::vec2_type); - src0 = v->get_temp(glsl_type::vec4_type); - dst0 = st_dst_reg(src0); - inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, dst0, coord); - inst->sampler.index = samplerIndex; - inst->sampler_array_size = 1; - inst->tex_target = TEXTURE_2D_INDEX; - - prog->InputsRead |= VARYING_BIT_TEX0; - prog->SamplersUsed |= (1 << samplerIndex); /* mark sampler as used */ - v->samplers_used |= (1 << samplerIndex); - - /* KIL if -tmp0 < 0 # texel=0 -> keep / 
texel=0 -> discard */ - src0.negate = NEGATE_XYZW; - if (st->bitmap.tex_format == PIPE_FORMAT_L8_UNORM) - src0.swizzle = SWIZZLE_XXXX; - inst = v->emit_asm(NULL, TGSI_OPCODE_KILL_IF, undef_dst, src0); - - /* Now copy the instructions from the original glsl_to_tgsi_visitor into the - * new visitor. */ - foreach_in_list(glsl_to_tgsi_instruction, inst, &original->instructions) { - glsl_to_tgsi_instruction *newinst; - st_src_reg src_regs[4]; - - if (inst->dst[0].file == PROGRAM_OUTPUT) - prog->OutputsWritten |= BITFIELD64_BIT(inst->dst[0].index); - - for (int i = 0; i < 4; i++) { - src_regs[i] = inst->src[i]; - if (src_regs[i].file == PROGRAM_INPUT) - prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index); - } - - newinst = v->emit_asm(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2], src_regs[3]); - newinst->tex_target = inst->tex_target; - newinst->sampler_array_size = inst->sampler_array_size; - } - - /* Make modifications to fragment program info. */ - prog->Parameters = _mesa_clone_parameter_list(original->prog->Parameters); - count_resources(v, prog); - fp->glsl_to_tgsi = v; -} - /* ------------------------- TGSI conversion stuff -------------------------- */ struct label { unsigned branch_target; @@ -4852,7 +4642,7 @@ src_register(struct st_translate *t, const st_src_reg *reg) static struct ureg_dst translate_dst(struct st_translate *t, const st_dst_reg *dst_reg, - bool saturate, bool clamp_color) + bool saturate) { struct ureg_dst dst = dst_register(t, dst_reg->file, dst_reg->index, dst_reg->array_id); @@ -4864,28 +4654,6 @@ translate_dst(struct st_translate *t, if (saturate) dst = ureg_saturate(dst); - else if (clamp_color && dst_reg->file == PROGRAM_OUTPUT) { - /* Clamp colors for ARB_color_buffer_float. */ - switch (t->procType) { - case TGSI_PROCESSOR_VERTEX: - /* This can only occur with a compatibility profile, which doesn't - * support geometry shaders. 
*/ - if (dst_reg->index == VARYING_SLOT_COL0 || - dst_reg->index == VARYING_SLOT_COL1 || - dst_reg->index == VARYING_SLOT_BFC0 || - dst_reg->index == VARYING_SLOT_BFC1) { - dst = ureg_saturate(dst); - } - break; - - case TGSI_PROCESSOR_FRAGMENT: - if (dst_reg->index == FRAG_RESULT_COLOR || - dst_reg->index >= FRAG_RESULT_DATA0) { - dst = ureg_saturate(dst); - } - break; - } - } if (dst_reg->reladdr != NULL) { assert(dst_reg->file != PROGRAM_TEMPORARY); @@ -4991,8 +4759,7 @@ translate_tex_offset(struct st_translate *t, static void compile_tgsi_instruction(struct st_translate *t, - const glsl_to_tgsi_instruction *inst, - bool clamp_dst_color_output) + const glsl_to_tgsi_instruction *inst) { struct ureg_program *ureg = t->ureg; GLuint i; @@ -5010,8 +4777,7 @@ compile_tgsi_instruction(struct st_translate *t, for (i = 0; i < num_dst; i++) dst[i] = translate_dst(t, &inst->dst[i], - inst->saturate, - clamp_dst_color_output); + inst->saturate); for (i = 0; i < num_src; i++) src[i] = translate_src(t, &inst->src[i]); @@ -5286,16 +5052,6 @@ emit_face_var(struct gl_context *ctx, struct st_translate *t) t->inputs[t->inputMapping[VARYING_SLOT_FACE]] = ureg_src(face_temp); } -static void -emit_edgeflags(struct st_translate *t) -{ - struct ureg_program *ureg = t->ureg; - struct ureg_dst edge_dst = t->outputs[t->outputMapping[VARYING_SLOT_EDGE]]; - struct ureg_src edge_src = t->inputs[t->inputMapping[VERT_ATTRIB_EDGEFLAG]]; - - ureg_MOV(ureg, edge_dst, edge_src); -} - static bool find_array(unsigned attr, struct array_decl *arrays, unsigned count, unsigned *array_id, unsigned *array_size) @@ -5353,9 +5109,7 @@ st_translate_program( const GLuint outputMapping[], const GLuint outputSlotToAttr[], const ubyte outputSemanticName[], - const ubyte outputSemanticIndex[], - boolean passthrough_edgeflags, - boolean clamp_color) + const ubyte outputSemanticIndex[]) { struct st_translate *t; unsigned i; @@ -5544,8 +5298,6 @@ st_translate_program( t->outputs[i] = ureg_writemask(t->outputs[i], 
TGSI_WRITEMASK_X); } } - if (passthrough_edgeflags) - emit_edgeflags(t); } /* Declare address register. @@ -5639,7 +5391,7 @@ st_translate_program( unsigned num_ubos = program->shader->NumUniformBlocks; for (i = 0; i < num_ubos; i++) { - unsigned size = program->shader->UniformBlocks[i].UniformBufferSize; + unsigned size = program->shader->UniformBlocks[i]->UniformBufferSize; unsigned num_const_vecs = (size + 15) / 16; unsigned first, last; assert(num_const_vecs > 0); @@ -5696,7 +5448,7 @@ st_translate_program( */ foreach_in_list(glsl_to_tgsi_instruction, inst, &program->instructions) { set_insn_start(t, ureg_get_instruction_number(ureg)); - compile_tgsi_instruction(t, inst, clamp_color); + compile_tgsi_instruction(t, inst); } /* Fix up all emitted labels: diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.h b/src/mesa/state_tracker/st_glsl_to_tgsi.h index 4af747fa9de..729295bcb52 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.h +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.h @@ -52,17 +52,9 @@ enum pipe_error st_translate_program( const GLuint outputMapping[], const GLuint outputSlotToAttr[], const ubyte outputSemanticName[], - const ubyte outputSemanticIndex[], - boolean passthrough_edgeflags, - boolean clamp_color); + const ubyte outputSemanticIndex[]); void free_glsl_to_tgsi_visitor(struct glsl_to_tgsi_visitor *v); -void get_pixel_transfer_visitor(struct st_fragment_program *fp, - struct glsl_to_tgsi_visitor *original, - int scale_and_bias, int pixel_maps); -void get_bitmap_visitor(struct st_fragment_program *fp, - struct glsl_to_tgsi_visitor *original, - int samplerIndex); GLboolean st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog); diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c index 896e239ee68..4b9dc994ea5 100644 --- a/src/mesa/state_tracker/st_mesa_to_tgsi.c +++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c @@ -283,8 +283,7 @@ st_translate_texture_target( GLuint textarget, static 
struct ureg_dst translate_dst( struct st_translate *t, const struct prog_dst_register *DstReg, - boolean saturate, - boolean clamp_color) + boolean saturate) { struct ureg_dst dst = dst_register( t, DstReg->File, @@ -295,27 +294,6 @@ translate_dst( struct st_translate *t, if (saturate) dst = ureg_saturate( dst ); - else if (clamp_color && DstReg->File == PROGRAM_OUTPUT) { - /* Clamp colors for ARB_color_buffer_float. */ - switch (t->procType) { - case TGSI_PROCESSOR_VERTEX: - /* This can only occur with a compatibility profile, which doesn't - * support geometry shaders. */ - if (DstReg->Index == VARYING_SLOT_COL0 || - DstReg->Index == VARYING_SLOT_COL1 || - DstReg->Index == VARYING_SLOT_BFC0 || - DstReg->Index == VARYING_SLOT_BFC1) { - dst = ureg_saturate(dst); - } - break; - - case TGSI_PROCESSOR_FRAGMENT: - if (DstReg->Index >= FRAG_RESULT_COLOR) { - dst = ureg_saturate(dst); - } - break; - } - } if (DstReg->RelAddr) dst = ureg_dst_indirect( dst, ureg_src(t->address[0]) ); @@ -649,8 +627,7 @@ static void compile_instruction( struct gl_context *ctx, struct st_translate *t, - const struct prog_instruction *inst, - boolean clamp_dst_color_output) + const struct prog_instruction *inst) { struct ureg_program *ureg = t->ureg; GLuint i; @@ -665,8 +642,7 @@ compile_instruction( if (num_dst) dst[0] = translate_dst( t, &inst->DstReg, - inst->Saturate, - clamp_dst_color_output); + inst->Saturate); for (i = 0; i < num_src; i++) src[i] = translate_src( t, &inst->SrcReg[i] ); @@ -974,18 +950,6 @@ emit_face_var( struct st_translate *t, } -static void -emit_edgeflags( struct st_translate *t, - const struct gl_program *program ) -{ - struct ureg_program *ureg = t->ureg; - struct ureg_dst edge_dst = t->outputs[t->outputMapping[VARYING_SLOT_EDGE]]; - struct ureg_src edge_src = t->inputs[t->inputMapping[VERT_ATTRIB_EDGEFLAG]]; - - ureg_MOV( ureg, edge_dst, edge_src ); -} - - /** * Translate Mesa program to TGSI format. 
* \param program the program to translate @@ -1019,9 +983,7 @@ st_translate_mesa_program( GLuint numOutputs, const GLuint outputMapping[], const ubyte outputSemanticName[], - const ubyte outputSemanticIndex[], - boolean passthrough_edgeflags, - boolean clamp_color) + const ubyte outputSemanticIndex[]) { struct st_translate translate, *t; unsigned i; @@ -1125,8 +1087,6 @@ st_translate_mesa_program( t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_X); } } - if (passthrough_edgeflags) - emit_edgeflags( t, program ); } /* Declare address register. @@ -1231,7 +1191,7 @@ st_translate_mesa_program( */ for (i = 0; i < program->NumInstructions; i++) { set_insn_start( t, ureg_get_instruction_number( ureg )); - compile_instruction( ctx, t, &program->Instructions[i], clamp_color ); + compile_instruction(ctx, t, &program->Instructions[i]); } /* Fix up all emitted labels: diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.h b/src/mesa/state_tracker/st_mesa_to_tgsi.h index 62bb654e95a..ed7a3adfe1a 100644 --- a/src/mesa/state_tracker/st_mesa_to_tgsi.h +++ b/src/mesa/state_tracker/st_mesa_to_tgsi.h @@ -58,9 +58,7 @@ st_translate_mesa_program( GLuint numOutputs, const GLuint outputMapping[], const ubyte outputSemanticName[], - const ubyte outputSemanticIndex[], - boolean passthrough_edgeflags, - boolean clamp_color); + const ubyte outputSemanticIndex[]); unsigned st_translate_texture_target(GLuint textarget, GLboolean shadow); diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index a07f8fec309..6a69ba7aa26 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -43,6 +43,8 @@ #include "pipe/p_shader_tokens.h" #include "draw/draw_context.h" #include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_emulate.h" +#include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_ureg.h" #include "st_debug.h" @@ -92,6 +94,11 @@ st_release_vp_variants( struct st_context *st, } stvp->variants = NULL; + + if (stvp->tgsi.tokens) { + 
tgsi_free_tokens(stvp->tgsi.tokens); + stvp->tgsi.tokens = NULL; + } } @@ -107,8 +114,6 @@ delete_fp_variant(struct st_context *st, struct st_fp_variant *fpv) cso_delete_fragment_shader(st->cso_context, fpv->driver_shader); if (fpv->parameters) _mesa_free_parameter_list(fpv->parameters); - if (fpv->tgsi.tokens) - ureg_free_tokens(fpv->tgsi.tokens); free(fpv); } @@ -128,6 +133,11 @@ st_release_fp_variants(struct st_context *st, struct st_fragment_program *stfp) } stfp->variants = NULL; + + if (stfp->tgsi.tokens) { + ureg_free_tokens(stfp->tgsi.tokens); + stfp->tgsi.tokens = NULL; + } } @@ -160,6 +170,11 @@ st_release_gp_variants(struct st_context *st, struct st_geometry_program *stgp) } stgp->variants = NULL; + + if (stgp->tgsi.tokens) { + ureg_free_tokens(stgp->tgsi.tokens); + stgp->tgsi.tokens = NULL; + } } @@ -192,6 +207,11 @@ st_release_tcp_variants(struct st_context *st, struct st_tessctrl_program *sttcp } sttcp->variants = NULL; + + if (sttcp->tgsi.tokens) { + ureg_free_tokens(sttcp->tgsi.tokens); + sttcp->tgsi.tokens = NULL; + } } @@ -224,28 +244,34 @@ st_release_tep_variants(struct st_context *st, struct st_tesseval_program *sttep } sttep->variants = NULL; + + if (sttep->tgsi.tokens) { + ureg_free_tokens(sttep->tgsi.tokens); + sttep->tgsi.tokens = NULL; + } } /** - * Translate a Mesa vertex shader into a TGSI shader. - * \param outputMapping to map vertex program output registers (VARYING_SLOT_x) - * to TGSI output slots - * \param tokensOut destination for TGSI tokens - * \return pointer to cached pipe_shader object. + * Translate a vertex program. 
*/ -void -st_prepare_vertex_program(struct gl_context *ctx, +bool +st_translate_vertex_program(struct st_context *st, struct st_vertex_program *stvp) { - struct st_context *st = st_context(ctx); - GLuint attr; + struct ureg_program *ureg; + enum pipe_error error; + unsigned num_outputs = 0; + unsigned attr; + unsigned input_to_index[VERT_ATTRIB_MAX] = {0}; + unsigned output_slot_to_attr[VARYING_SLOT_MAX] = {0}; + ubyte output_semantic_name[VARYING_SLOT_MAX] = {0}; + ubyte output_semantic_index[VARYING_SLOT_MAX] = {0}; stvp->num_inputs = 0; - stvp->num_outputs = 0; if (stvp->Base.IsPositionInvariant) - _mesa_insert_mvp_code(ctx, &stvp->Base); + _mesa_insert_mvp_code(st->ctx, &stvp->Base); /* * Determine number of inputs, the mappings between VERT_ATTRIB_x @@ -253,7 +279,7 @@ st_prepare_vertex_program(struct gl_context *ctx, */ for (attr = 0; attr < VERT_ATTRIB_MAX; attr++) { if ((stvp->Base.Base.InputsRead & BITFIELD64_BIT(attr)) != 0) { - stvp->input_to_index[attr] = stvp->num_inputs; + input_to_index[attr] = stvp->num_inputs; stvp->index_to_input[stvp->num_inputs] = attr; stvp->num_inputs++; if ((stvp->Base.Base.DoubleInputsRead & BITFIELD64_BIT(attr)) != 0) { @@ -264,7 +290,7 @@ st_prepare_vertex_program(struct gl_context *ctx, } } /* bit of a hack, presetup potentially unused edgeflag input */ - stvp->input_to_index[VERT_ATTRIB_EDGEFLAG] = stvp->num_inputs; + input_to_index[VERT_ATTRIB_EDGEFLAG] = stvp->num_inputs; stvp->index_to_input[stvp->num_inputs] = VERT_ATTRIB_EDGEFLAG; /* Compute mapping of vertex program outputs to slots. 
@@ -274,62 +300,62 @@ st_prepare_vertex_program(struct gl_context *ctx, stvp->result_to_output[attr] = ~0; } else { - unsigned slot = stvp->num_outputs++; + unsigned slot = num_outputs++; stvp->result_to_output[attr] = slot; - stvp->output_slot_to_attr[slot] = attr; + output_slot_to_attr[slot] = attr; switch (attr) { case VARYING_SLOT_POS: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_POSITION; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_POSITION; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_COL0: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_COLOR; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_COLOR; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_COL1: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_COLOR; - stvp->output_semantic_index[slot] = 1; + output_semantic_name[slot] = TGSI_SEMANTIC_COLOR; + output_semantic_index[slot] = 1; break; case VARYING_SLOT_BFC0: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_BFC1: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR; - stvp->output_semantic_index[slot] = 1; + output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR; + output_semantic_index[slot] = 1; break; case VARYING_SLOT_FOGC: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_FOG; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_FOG; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_PSIZ: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_PSIZE; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_PSIZE; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_CLIP_DIST0: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST; - stvp->output_semantic_index[slot] = 0; + 
output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_CLIP_DIST1: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST; - stvp->output_semantic_index[slot] = 1; + output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST; + output_semantic_index[slot] = 1; break; case VARYING_SLOT_EDGE: assert(0); break; case VARYING_SLOT_CLIP_VERTEX: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_CLIPVERTEX; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_CLIPVERTEX; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_LAYER: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_LAYER; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_LAYER; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_VIEWPORT: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_VIEWPORT_INDEX; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_VIEWPORT_INDEX; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_TEX0: @@ -341,8 +367,8 @@ st_prepare_vertex_program(struct gl_context *ctx, case VARYING_SLOT_TEX6: case VARYING_SLOT_TEX7: if (st->needs_texcoord_semantic) { - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_TEXCOORD; - stvp->output_semantic_index[slot] = attr - VARYING_SLOT_TEX0; + output_semantic_name[slot] = TGSI_SEMANTIC_TEXCOORD; + output_semantic_index[slot] = attr - VARYING_SLOT_TEX0; break; } /* fall through */ @@ -350,55 +376,24 @@ st_prepare_vertex_program(struct gl_context *ctx, default: assert(attr >= VARYING_SLOT_VAR0 || (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7)); - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC; - stvp->output_semantic_index[slot] = + output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC; + output_semantic_index[slot] = st_get_generic_varying_index(st, attr); break; } } } /* similar hack to above, presetup potentially unused edgeflag output */ - 
stvp->result_to_output[VARYING_SLOT_EDGE] = stvp->num_outputs; - stvp->output_semantic_name[stvp->num_outputs] = TGSI_SEMANTIC_EDGEFLAG; - stvp->output_semantic_index[stvp->num_outputs] = 0; -} - - -/** - * Translate a vertex program to create a new variant. - */ -static struct st_vp_variant * -st_translate_vertex_program(struct st_context *st, - struct st_vertex_program *stvp, - const struct st_vp_variant_key *key) -{ - struct st_vp_variant *vpv = CALLOC_STRUCT(st_vp_variant); - struct pipe_context *pipe = st->pipe; - struct ureg_program *ureg; - enum pipe_error error; - unsigned num_outputs; - - st_prepare_vertex_program(st->ctx, stvp); + stvp->result_to_output[VARYING_SLOT_EDGE] = num_outputs; + output_semantic_name[num_outputs] = TGSI_SEMANTIC_EDGEFLAG; + output_semantic_index[num_outputs] = 0; if (!stvp->glsl_to_tgsi) - { _mesa_remove_output_reads(&stvp->Base.Base, PROGRAM_OUTPUT); - } ureg = ureg_create_with_screen(TGSI_PROCESSOR_VERTEX, st->pipe->screen); - if (ureg == NULL) { - free(vpv); - return NULL; - } - - vpv->key = *key; - - vpv->num_inputs = stvp->num_inputs; - num_outputs = stvp->num_outputs; - if (key->passthrough_edgeflags) { - vpv->num_inputs++; - num_outputs++; - } + if (ureg == NULL) + return false; if (ST_DEBUG & DEBUG_MESA) { _mesa_print_program(&stvp->Base.Base); @@ -406,15 +401,15 @@ st_translate_vertex_program(struct st_context *st, debug_printf("\n"); } - if (stvp->glsl_to_tgsi) + if (stvp->glsl_to_tgsi) { error = st_translate_program(st->ctx, TGSI_PROCESSOR_VERTEX, ureg, stvp->glsl_to_tgsi, &stvp->Base.Base, /* inputs */ - vpv->num_inputs, - stvp->input_to_index, + stvp->num_inputs, + input_to_index, NULL, /* inputSlotToAttr */ NULL, /* input semantic name */ NULL, /* input semantic index */ @@ -423,43 +418,75 @@ st_translate_vertex_program(struct st_context *st, /* outputs */ num_outputs, stvp->result_to_output, - stvp->output_slot_to_attr, - stvp->output_semantic_name, - stvp->output_semantic_index, - key->passthrough_edgeflags, - 
key->clamp_color); - else + output_slot_to_attr, + output_semantic_name, + output_semantic_index); + + st_translate_stream_output_info(stvp->glsl_to_tgsi, + stvp->result_to_output, + &stvp->tgsi.stream_output); + + free_glsl_to_tgsi_visitor(stvp->glsl_to_tgsi); + stvp->glsl_to_tgsi = NULL; + } else error = st_translate_mesa_program(st->ctx, TGSI_PROCESSOR_VERTEX, ureg, &stvp->Base.Base, /* inputs */ - vpv->num_inputs, - stvp->input_to_index, + stvp->num_inputs, + input_to_index, NULL, /* input semantic name */ NULL, /* input semantic index */ NULL, /* outputs */ num_outputs, stvp->result_to_output, - stvp->output_semantic_name, - stvp->output_semantic_index, - key->passthrough_edgeflags, - key->clamp_color); + output_semantic_name, + output_semantic_index); + + if (error) { + debug_printf("%s: failed to translate Mesa program:\n", __func__); + _mesa_print_program(&stvp->Base.Base); + debug_assert(0); + return false; + } + + stvp->tgsi.tokens = ureg_get_tokens(ureg, NULL); + ureg_destroy(ureg); + return stvp->tgsi.tokens != NULL; +} - if (error) - goto fail; +static struct st_vp_variant * +st_create_vp_variant(struct st_context *st, + struct st_vertex_program *stvp, + const struct st_vp_variant_key *key) +{ + struct st_vp_variant *vpv = CALLOC_STRUCT(st_vp_variant); + struct pipe_context *pipe = st->pipe; - vpv->tgsi.tokens = ureg_get_tokens( ureg, NULL ); - if (!vpv->tgsi.tokens) - goto fail; + vpv->key = *key; + vpv->tgsi.tokens = tgsi_dup_tokens(stvp->tgsi.tokens); + vpv->tgsi.stream_output = stvp->tgsi.stream_output; + vpv->num_inputs = stvp->num_inputs; - ureg_destroy( ureg ); + /* Emulate features. */ + if (key->clamp_color || key->passthrough_edgeflags) { + const struct tgsi_token *tokens; + unsigned flags = + (key->clamp_color ? TGSI_EMU_CLAMP_COLOR_OUTPUTS : 0) | + (key->passthrough_edgeflags ? 
TGSI_EMU_PASSTHROUGH_EDGEFLAG : 0); - if (stvp->glsl_to_tgsi) { - st_translate_stream_output_info(stvp->glsl_to_tgsi, - stvp->result_to_output, - &vpv->tgsi.stream_output); + tokens = tgsi_emulate(vpv->tgsi.tokens, flags); + + if (tokens) { + tgsi_free_tokens(vpv->tgsi.tokens); + vpv->tgsi.tokens = tokens; + + if (key->passthrough_edgeflags) + vpv->num_inputs++; + } else + fprintf(stderr, "mesa: cannot emulate deprecated features\n"); } if (ST_DEBUG & DEBUG_TGSI) { @@ -469,14 +496,6 @@ st_translate_vertex_program(struct st_context *st, vpv->driver_shader = pipe->create_vs_state(pipe, &vpv->tgsi); return vpv; - -fail: - debug_printf("%s: failed to translate Mesa program:\n", __func__); - _mesa_print_program(&stvp->Base.Base); - debug_assert(0); - - ureg_destroy( ureg ); - return NULL; } @@ -499,7 +518,7 @@ st_get_vp_variant(struct st_context *st, if (!vpv) { /* create now */ - vpv = st_translate_vertex_program(st, stvp, key); + vpv = st_create_vp_variant(st, stvp, key); if (vpv) { /* insert into list */ vpv->next = stvp->variants; @@ -533,19 +552,12 @@ st_translate_interp(enum glsl_interp_qualifier glsl_qual, bool is_color) /** - * Translate a Mesa fragment shader into a TGSI shader using extra info in - * the key. - * \return new fragment program variant + * Translate a Mesa fragment shader into a TGSI shader. 
*/ -static struct st_fp_variant * +bool st_translate_fragment_program(struct st_context *st, - struct st_fragment_program *stfp, - const struct st_fp_variant_key *key) + struct st_fragment_program *stfp) { - struct pipe_context *pipe = st->pipe; - struct st_fp_variant *variant = CALLOC_STRUCT(st_fp_variant); - GLboolean deleteFP = GL_FALSE; - GLuint outputMapping[FRAG_RESULT_MAX]; GLuint inputMapping[VARYING_SLOT_MAX]; GLuint inputSlotToAttr[VARYING_SLOT_MAX]; @@ -565,40 +577,8 @@ st_translate_fragment_program(struct st_context *st, ubyte fs_output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; uint fs_num_outputs = 0; - if (!variant) - return NULL; - - assert(!(key->bitmap && key->drawpixels)); memset(inputSlotToAttr, ~0, sizeof(inputSlotToAttr)); - if (key->bitmap) { - /* glBitmap drawing */ - struct gl_fragment_program *fp; /* we free this temp program below */ - - st_make_bitmap_fragment_program(st, &stfp->Base, - &fp, &variant->bitmap_sampler); - - variant->parameters = _mesa_clone_parameter_list(fp->Base.Parameters); - stfp = st_fragment_program(fp); - deleteFP = GL_TRUE; - } - else if (key->drawpixels) { - /* glDrawPixels drawing */ - struct gl_fragment_program *fp; /* we free this temp program below */ - - if (key->drawpixels_z || key->drawpixels_stencil) { - fp = st_make_drawpix_z_stencil_program(st, key->drawpixels_z, - key->drawpixels_stencil); - } - else { - /* RGBA */ - st_make_drawpix_fragment_program(st, &stfp->Base, &fp); - variant->parameters = _mesa_clone_parameter_list(fp->Base.Parameters); - deleteFP = GL_TRUE; - } - stfp = st_fragment_program(fp); - } - if (!stfp->glsl_to_tgsi) _mesa_remove_output_reads(&stfp->Base.Base, PROGRAM_OUTPUT); @@ -620,8 +600,7 @@ st_translate_fragment_program(struct st_context *st, interpLocation[slot] = TGSI_INTERPOLATE_LOC_CENTER; if (stfp->Base.Base.SystemValuesRead & (SYSTEM_BIT_SAMPLE_ID | - SYSTEM_BIT_SAMPLE_POS) || - key->persample_shading) + SYSTEM_BIT_SAMPLE_POS)) interpLocation[slot] = 
TGSI_INTERPOLATE_LOC_SAMPLE; switch (attr) { @@ -805,10 +784,8 @@ st_translate_fragment_program(struct st_context *st, } ureg = ureg_create_with_screen(TGSI_PROCESSOR_FRAGMENT, st->pipe->screen); - if (ureg == NULL) { - free(variant); - return NULL; - } + if (ureg == NULL) + return false; if (ST_DEBUG & DEBUG_MESA) { _mesa_print_program(&stfp->Base.Base); @@ -841,7 +818,7 @@ st_translate_fragment_program(struct st_context *st, } } - if (stfp->glsl_to_tgsi) + if (stfp->glsl_to_tgsi) { st_translate_program(st->ctx, TGSI_PROCESSOR_FRAGMENT, ureg, @@ -860,9 +837,11 @@ st_translate_fragment_program(struct st_context *st, outputMapping, NULL, fs_output_semantic_name, - fs_output_semantic_index, FALSE, - key->clamp_color ); - else + fs_output_semantic_index); + + free_glsl_to_tgsi_visitor(stfp->glsl_to_tgsi); + stfp->glsl_to_tgsi = NULL; + } else st_translate_mesa_program(st->ctx, TGSI_PROCESSOR_FRAGMENT, ureg, @@ -877,31 +856,134 @@ st_translate_fragment_program(struct st_context *st, fs_num_outputs, outputMapping, fs_output_semantic_name, - fs_output_semantic_index, FALSE, - key->clamp_color); + fs_output_semantic_index); + + stfp->tgsi.tokens = ureg_get_tokens(ureg, NULL); + ureg_destroy(ureg); + return stfp->tgsi.tokens != NULL; +} + +static struct st_fp_variant * +st_create_fp_variant(struct st_context *st, + struct st_fragment_program *stfp, + const struct st_fp_variant_key *key) +{ + struct pipe_context *pipe = st->pipe; + struct st_fp_variant *variant = CALLOC_STRUCT(st_fp_variant); + struct pipe_shader_state tgsi = {0}; + + if (!variant) + return NULL; + + tgsi.tokens = stfp->tgsi.tokens; - variant->tgsi.tokens = ureg_get_tokens( ureg, NULL ); - ureg_destroy( ureg ); + assert(!(key->bitmap && key->drawpixels)); + + /* Emulate features. */ + if (key->clamp_color || key->persample_shading) { + const struct tgsi_token *tokens; + unsigned flags = + (key->clamp_color ? TGSI_EMU_CLAMP_COLOR_OUTPUTS : 0) | + (key->persample_shading ? 
TGSI_EMU_FORCE_PERSAMPLE_INTERP : 0); + + tokens = tgsi_emulate(tgsi.tokens, flags); + + if (tokens) + tgsi.tokens = tokens; + else + fprintf(stderr, "mesa: cannot emulate deprecated features\n"); + } + + /* glBitmap */ + if (key->bitmap) { + const struct tgsi_token *tokens; + + variant->bitmap_sampler = ffs(~stfp->Base.Base.SamplersUsed) - 1; + + tokens = st_get_bitmap_shader(tgsi.tokens, + variant->bitmap_sampler, + st->needs_texcoord_semantic, + st->bitmap.tex_format == + PIPE_FORMAT_L8_UNORM); + + if (tokens) { + if (tgsi.tokens != stfp->tgsi.tokens) + tgsi_free_tokens(tgsi.tokens); + tgsi.tokens = tokens; + variant->parameters = + _mesa_clone_parameter_list(stfp->Base.Base.Parameters); + } else + fprintf(stderr, "mesa: cannot create a shader for glBitmap\n"); + } + + /* glDrawPixels (color only) */ + if (key->drawpixels) { + const struct tgsi_token *tokens; + unsigned scale_const = 0, bias_const = 0, texcoord_const = 0; + + /* Find the first unused slot. */ + variant->drawpix_sampler = ffs(~stfp->Base.Base.SamplersUsed) - 1; + + if (key->pixelMaps) { + unsigned samplers_used = stfp->Base.Base.SamplersUsed | + (1 << variant->drawpix_sampler); + + variant->pixelmap_sampler = ffs(~samplers_used) - 1; + } + + variant->parameters = + _mesa_clone_parameter_list(stfp->Base.Base.Parameters); + + if (key->scaleAndBias) { + static const gl_state_index scale_state[STATE_LENGTH] = + { STATE_INTERNAL, STATE_PT_SCALE }; + static const gl_state_index bias_state[STATE_LENGTH] = + { STATE_INTERNAL, STATE_PT_BIAS }; + + scale_const = _mesa_add_state_reference(variant->parameters, + scale_state); + bias_const = _mesa_add_state_reference(variant->parameters, + bias_state); + } + + { + static const gl_state_index state[STATE_LENGTH] = + { STATE_INTERNAL, STATE_CURRENT_ATTRIB, VERT_ATTRIB_TEX0 }; + + texcoord_const = _mesa_add_state_reference(variant->parameters, + state); + } + + tokens = st_get_drawpix_shader(tgsi.tokens, + st->needs_texcoord_semantic, + key->scaleAndBias, 
scale_const, + bias_const, key->pixelMaps, + variant->drawpix_sampler, + variant->pixelmap_sampler, + texcoord_const); + + if (tokens) { + if (tgsi.tokens != stfp->tgsi.tokens) + tgsi_free_tokens(tgsi.tokens); + tgsi.tokens = tokens; + } else + fprintf(stderr, "mesa: cannot create a shader for glDrawPixels\n"); + } if (ST_DEBUG & DEBUG_TGSI) { - tgsi_dump(variant->tgsi.tokens, 0/*TGSI_DUMP_VERBOSE*/); + tgsi_dump(tgsi.tokens, 0); debug_printf("\n"); } /* fill in variant */ - variant->driver_shader = pipe->create_fs_state(pipe, &variant->tgsi); + variant->driver_shader = pipe->create_fs_state(pipe, &tgsi); variant->key = *key; - if (deleteFP) { - /* Free the temporary program made above */ - struct gl_fragment_program *fp = &stfp->Base; - _mesa_reference_fragprog(st->ctx, &fp, NULL); - } - + if (tgsi.tokens != stfp->tgsi.tokens) + tgsi_free_tokens(tgsi.tokens); return variant; } - /** * Translate fragment program if needed. */ @@ -921,7 +1003,7 @@ st_get_fp_variant(struct st_context *st, if (!fpv) { /* create new */ - fpv = st_translate_fragment_program(st, stfp, key); + fpv = st_create_fp_variant(st, stfp, key); if (fpv) { /* insert into list */ fpv->next = stfp->variants; @@ -1191,9 +1273,7 @@ st_translate_program_common(struct st_context *st, outputMapping, outputSlotToAttr, output_semantic_name, - output_semantic_index, - FALSE, - FALSE); + output_semantic_index); out_state->tokens = ureg_get_tokens(ureg, NULL); ureg_destroy(ureg); @@ -1217,19 +1297,15 @@ st_translate_program_common(struct st_context *st, /** * Translate a geometry program to create a new variant. 
*/ -static struct st_gp_variant * +bool st_translate_geometry_program(struct st_context *st, - struct st_geometry_program *stgp, - const struct st_gp_variant_key *key) + struct st_geometry_program *stgp) { - struct pipe_context *pipe = st->pipe; struct ureg_program *ureg; - struct st_gp_variant *gpv; - struct pipe_shader_state state; ureg = ureg_create_with_screen(TGSI_PROCESSOR_GEOMETRY, st->pipe->screen); if (ureg == NULL) - return NULL; + return false; ureg_property(ureg, TGSI_PROPERTY_GS_INPUT_PRIM, stgp->Base.InputType); ureg_property(ureg, TGSI_PROPERTY_GS_OUTPUT_PRIM, stgp->Base.OutputType); @@ -1238,19 +1314,29 @@ st_translate_geometry_program(struct st_context *st, ureg_property(ureg, TGSI_PROPERTY_GS_INVOCATIONS, stgp->Base.Invocations); st_translate_program_common(st, &stgp->Base.Base, stgp->glsl_to_tgsi, ureg, - TGSI_PROCESSOR_GEOMETRY, &state); + TGSI_PROCESSOR_GEOMETRY, &stgp->tgsi); + + free_glsl_to_tgsi_visitor(stgp->glsl_to_tgsi); + stgp->glsl_to_tgsi = NULL; + return true; +} + + +static struct st_gp_variant * +st_create_gp_variant(struct st_context *st, + struct st_geometry_program *stgp, + const struct st_gp_variant_key *key) +{ + struct pipe_context *pipe = st->pipe; + struct st_gp_variant *gpv; gpv = CALLOC_STRUCT(st_gp_variant); - if (!gpv) { - ureg_free_tokens(state.tokens); + if (!gpv) return NULL; - } /* fill in new variant */ - gpv->driver_shader = pipe->create_gs_state(pipe, &state); + gpv->driver_shader = pipe->create_gs_state(pipe, &stgp->tgsi); gpv->key = *key; - - ureg_free_tokens(state.tokens); return gpv; } @@ -1274,7 +1360,7 @@ st_get_gp_variant(struct st_context *st, if (!gpv) { /* create new */ - gpv = st_translate_geometry_program(st, stgp, key); + gpv = st_create_gp_variant(st, stgp, key); if (gpv) { /* insert into list */ gpv->next = stgp->variants; @@ -1289,38 +1375,43 @@ st_get_gp_variant(struct st_context *st, /** * Translate a tessellation control program to create a new variant. 
*/ -static struct st_tcp_variant * +bool st_translate_tessctrl_program(struct st_context *st, - struct st_tessctrl_program *sttcp, - const struct st_tcp_variant_key *key) + struct st_tessctrl_program *sttcp) { - struct pipe_context *pipe = st->pipe; struct ureg_program *ureg; - struct st_tcp_variant *tcpv; - struct pipe_shader_state state; - ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_CTRL, pipe->screen); - if (ureg == NULL) { - return NULL; - } + ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_CTRL, st->pipe->screen); + if (ureg == NULL) + return false; ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT, sttcp->Base.VerticesOut); st_translate_program_common(st, &sttcp->Base.Base, sttcp->glsl_to_tgsi, - ureg, TGSI_PROCESSOR_TESS_CTRL, &state); + ureg, TGSI_PROCESSOR_TESS_CTRL, &sttcp->tgsi); + + free_glsl_to_tgsi_visitor(sttcp->glsl_to_tgsi); + sttcp->glsl_to_tgsi = NULL; + return true; +} + + +static struct st_tcp_variant * +st_create_tcp_variant(struct st_context *st, + struct st_tessctrl_program *sttcp, + const struct st_tcp_variant_key *key) +{ + struct pipe_context *pipe = st->pipe; + struct st_tcp_variant *tcpv; tcpv = CALLOC_STRUCT(st_tcp_variant); - if (!tcpv) { - ureg_free_tokens(state.tokens); + if (!tcpv) return NULL; - } /* fill in new variant */ - tcpv->driver_shader = pipe->create_tcs_state(pipe, &state); + tcpv->driver_shader = pipe->create_tcs_state(pipe, &sttcp->tgsi); tcpv->key = *key; - - ureg_free_tokens(state.tokens); return tcpv; } @@ -1344,7 +1435,7 @@ st_get_tcp_variant(struct st_context *st, if (!tcpv) { /* create new */ - tcpv = st_translate_tessctrl_program(st, sttcp, key); + tcpv = st_create_tcp_variant(st, sttcp, key); if (tcpv) { /* insert into list */ tcpv->next = sttcp->variants; @@ -1359,20 +1450,15 @@ st_get_tcp_variant(struct st_context *st, /** * Translate a tessellation evaluation program to create a new variant. 
*/ -static struct st_tep_variant * +bool st_translate_tesseval_program(struct st_context *st, - struct st_tesseval_program *sttep, - const struct st_tep_variant_key *key) + struct st_tesseval_program *sttep) { - struct pipe_context *pipe = st->pipe; struct ureg_program *ureg; - struct st_tep_variant *tepv; - struct pipe_shader_state state; - ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_EVAL, pipe->screen); - if (ureg == NULL) { - return NULL; - } + ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_EVAL, st->pipe->screen); + if (ureg == NULL) + return false; if (sttep->Base.PrimitiveMode == GL_ISOLINES) ureg_property(ureg, TGSI_PROPERTY_TES_PRIM_MODE, GL_LINES); @@ -1400,19 +1486,29 @@ st_translate_tesseval_program(struct st_context *st, ureg_property(ureg, TGSI_PROPERTY_TES_POINT_MODE, sttep->Base.PointMode); st_translate_program_common(st, &sttep->Base.Base, sttep->glsl_to_tgsi, - ureg, TGSI_PROCESSOR_TESS_EVAL, &state); + ureg, TGSI_PROCESSOR_TESS_EVAL, &sttep->tgsi); + + free_glsl_to_tgsi_visitor(sttep->glsl_to_tgsi); + sttep->glsl_to_tgsi = NULL; + return true; +} + + +static struct st_tep_variant * +st_create_tep_variant(struct st_context *st, + struct st_tesseval_program *sttep, + const struct st_tep_variant_key *key) +{ + struct pipe_context *pipe = st->pipe; + struct st_tep_variant *tepv; tepv = CALLOC_STRUCT(st_tep_variant); - if (!tepv) { - ureg_free_tokens(state.tokens); + if (!tepv) return NULL; - } /* fill in new variant */ - tepv->driver_shader = pipe->create_tes_state(pipe, &state); + tepv->driver_shader = pipe->create_tes_state(pipe, &sttep->tgsi); tepv->key = *key; - - ureg_free_tokens(state.tokens); return tepv; } @@ -1436,7 +1532,7 @@ st_get_tep_variant(struct st_context *st, if (!tepv) { /* create new */ - tepv = st_translate_tesseval_program(st, sttep, key); + tepv = st_create_tep_variant(st, sttep, key); if (tepv) { /* insert into list */ tepv->next = sttep->variants; diff --git a/src/mesa/state_tracker/st_program.h 
b/src/mesa/state_tracker/st_program.h index 7013993fe38..d9b53ac008c 100644 --- a/src/mesa/state_tracker/st_program.h +++ b/src/mesa/state_tracker/st_program.h @@ -59,8 +59,6 @@ struct st_fp_variant_key GLuint drawpixels:1; /**< glDrawPixels variant */ GLuint scaleAndBias:1; /**< glDrawPixels w/ scale and/or bias? */ GLuint pixelMaps:1; /**< glDrawPixels w/ pixel lookup map? */ - GLuint drawpixels_z:1; /**< glDrawPixels(GL_DEPTH) */ - GLuint drawpixels_stencil:1; /**< glDrawPixels(GL_STENCIL) */ /** for ARB_color_buffer_float */ GLuint clamp_color:1; @@ -78,8 +76,6 @@ struct st_fp_variant /** Parameters which generated this version of fragment program */ struct st_fp_variant_key key; - struct pipe_shader_state tgsi; - /** Driver's compiled shader */ void *driver_shader; @@ -87,6 +83,10 @@ struct st_fp_variant struct gl_program_parameter_list *parameters; uint bitmap_sampler; + /** For glDrawPixels variants */ + unsigned drawpix_sampler; + unsigned pixelmap_sampler; + /** next in linked list */ struct st_fp_variant *next; }; @@ -98,6 +98,7 @@ struct st_fp_variant struct st_fragment_program { struct gl_fragment_program Base; + struct pipe_shader_state tgsi; struct glsl_to_tgsi_visitor* glsl_to_tgsi; struct st_fp_variant *variants; @@ -153,20 +154,16 @@ struct st_vp_variant struct st_vertex_program { struct gl_vertex_program Base; /**< The Mesa vertex program */ + struct pipe_shader_state tgsi; struct glsl_to_tgsi_visitor* glsl_to_tgsi; /** maps a Mesa VERT_ATTRIB_x to a packed TGSI input index */ - GLuint input_to_index[VERT_ATTRIB_MAX]; /** maps a TGSI input index back to a Mesa VERT_ATTRIB_x */ GLuint index_to_input[PIPE_MAX_SHADER_INPUTS]; GLuint num_inputs; /** Maps VARYING_SLOT_x to slot */ GLuint result_to_output[VARYING_SLOT_MAX]; - GLuint output_slot_to_attr[VARYING_SLOT_MAX]; - ubyte output_semantic_name[VARYING_SLOT_MAX]; - ubyte output_semantic_index[VARYING_SLOT_MAX]; - GLuint num_outputs; /** List of translated variants of this vertex program. 
*/ @@ -203,6 +200,7 @@ struct st_gp_variant struct st_geometry_program { struct gl_geometry_program Base; /**< The Mesa geometry program */ + struct pipe_shader_state tgsi; struct glsl_to_tgsi_visitor* glsl_to_tgsi; struct st_gp_variant *variants; @@ -238,6 +236,7 @@ struct st_tcp_variant struct st_tessctrl_program { struct gl_tess_ctrl_program Base; /**< The Mesa tess ctrl program */ + struct pipe_shader_state tgsi; struct glsl_to_tgsi_visitor* glsl_to_tgsi; struct st_tcp_variant *variants; @@ -273,6 +272,7 @@ struct st_tep_variant struct st_tesseval_program { struct gl_tess_eval_program Base; /**< The Mesa tess eval program */ + struct pipe_shader_state tgsi; struct glsl_to_tgsi_visitor* glsl_to_tgsi; struct st_tep_variant *variants; @@ -414,16 +414,6 @@ st_get_tep_variant(struct st_context *st, struct st_tesseval_program *stgp, const struct st_tep_variant_key *key); - -extern void -st_prepare_vertex_program(struct gl_context *ctx, - struct st_vertex_program *stvp); - -extern GLboolean -st_prepare_fragment_program(struct gl_context *ctx, - struct st_fragment_program *stfp); - - extern void st_release_vp_variants( struct st_context *st, struct st_vertex_program *stvp ); @@ -447,6 +437,25 @@ st_release_tep_variants(struct st_context *st, extern void st_destroy_program_variants(struct st_context *st); +extern bool +st_translate_vertex_program(struct st_context *st, + struct st_vertex_program *stvp); + +extern bool +st_translate_fragment_program(struct st_context *st, + struct st_fragment_program *stfp); + +extern bool +st_translate_geometry_program(struct st_context *st, + struct st_geometry_program *stgp); + +extern bool +st_translate_tessctrl_program(struct st_context *st, + struct st_tessctrl_program *sttcp); + +extern bool +st_translate_tesseval_program(struct st_context *st, + struct st_tesseval_program *sttep); extern void st_print_current_vertex_program(void); diff --git a/src/mesa/tnl/t_draw.c b/src/mesa/tnl/t_draw.c index c130ab3f93d..6f29abbe1ba 100644 --- 
a/src/mesa/tnl/t_draw.c +++ b/src/mesa/tnl/t_draw.c @@ -35,6 +35,7 @@ #include "main/mtypes.h" #include "main/macros.h" #include "main/enums.h" +#include "util/half_float.h" #include "t_context.h" #include "tnl.h" diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c index e3eb286e482..5e1a760eb2c 100644 --- a/src/mesa/vbo/vbo_context.c +++ b/src/mesa/vbo/vbo_context.c @@ -33,7 +33,6 @@ #include "vbo.h" #include "vbo_context.h" -#define NR_MAT_ATTRIBS 12 static GLuint check_size( const GLfloat *attr ) { @@ -44,32 +43,47 @@ static GLuint check_size( const GLfloat *attr ) } +/** + * Helper for initializing a vertex array. + */ +static void +init_array(struct gl_context *ctx, struct gl_client_array *cl, + unsigned size, const void *pointer) +{ + memset(cl, 0, sizeof(*cl)); + + cl->Size = size; + cl->Type = GL_FLOAT; + cl->Format = GL_RGBA; + cl->Stride = 0; + cl->StrideB = 0; + cl->_ElementSize = cl->Size * sizeof(GLfloat); + cl->Ptr = pointer; + cl->Enabled = 1; + + _mesa_reference_buffer_object(ctx, &cl->BufferObj, + ctx->Shared->NullBufferObj); +} + + +/** + * Set up the vbo->currval arrays to point at the context's current + * vertex attributes (with strides = 0). 
+ */ static void init_legacy_currval(struct gl_context *ctx) { struct vbo_context *vbo = vbo_context(ctx); - struct gl_client_array *arrays = &vbo->currval[VBO_ATTRIB_POS]; GLuint i; - memset(arrays, 0, sizeof(*arrays) * VERT_ATTRIB_FF_MAX); - /* Set up a constant (StrideB == 0) array for each current * attribute: */ for (i = 0; i < VERT_ATTRIB_FF_MAX; i++) { - struct gl_client_array *cl = &arrays[i]; + struct gl_client_array *cl = &vbo->currval[VERT_ATTRIB_FF(i)]; - /* Size will have to be determined at runtime: - */ - cl->Size = check_size(ctx->Current.Attrib[i]); - cl->Stride = 0; - cl->StrideB = 0; - cl->Enabled = 1; - cl->Type = GL_FLOAT; - cl->Format = GL_RGBA; - cl->Ptr = (const void *)ctx->Current.Attrib[i]; - cl->_ElementSize = cl->Size * sizeof(GLfloat); - _mesa_reference_buffer_object(ctx, &cl->BufferObj, - ctx->Shared->NullBufferObj); + init_array(ctx, cl, + check_size(ctx->Current.Attrib[i]), + ctx->Current.Attrib[i]); } } @@ -77,26 +91,12 @@ static void init_legacy_currval(struct gl_context *ctx) static void init_generic_currval(struct gl_context *ctx) { struct vbo_context *vbo = vbo_context(ctx); - struct gl_client_array *arrays = &vbo->currval[VBO_ATTRIB_GENERIC0]; GLuint i; - memset(arrays, 0, sizeof(*arrays) * VERT_ATTRIB_GENERIC_MAX); - for (i = 0; i < VERT_ATTRIB_GENERIC_MAX; i++) { - struct gl_client_array *cl = &arrays[i]; + struct gl_client_array *cl = &vbo->currval[VBO_ATTRIB_GENERIC0 + i]; - /* This will have to be determined at runtime: - */ - cl->Size = 1; - cl->Type = GL_FLOAT; - cl->Format = GL_RGBA; - cl->Ptr = (const void *)ctx->Current.Attrib[VERT_ATTRIB_GENERIC0 + i]; - cl->Stride = 0; - cl->StrideB = 0; - cl->Enabled = 1; - cl->_ElementSize = cl->Size * sizeof(GLfloat); - _mesa_reference_buffer_object(ctx, &cl->BufferObj, - ctx->Shared->NullBufferObj); + init_array(ctx, cl, 1, ctx->Current.Attrib[VERT_ATTRIB_GENERIC0 + i]); } } @@ -104,46 +104,34 @@ static void init_generic_currval(struct gl_context *ctx) static void 
init_mat_currval(struct gl_context *ctx) { struct vbo_context *vbo = vbo_context(ctx); - struct gl_client_array *arrays = - &vbo->currval[VBO_ATTRIB_MAT_FRONT_AMBIENT]; GLuint i; - assert(NR_MAT_ATTRIBS == MAT_ATTRIB_MAX); - - memset(arrays, 0, sizeof(*arrays) * NR_MAT_ATTRIBS); - /* Set up a constant (StrideB == 0) array for each current * attribute: */ - for (i = 0; i < NR_MAT_ATTRIBS; i++) { - struct gl_client_array *cl = &arrays[i]; + for (i = 0; i < MAT_ATTRIB_MAX; i++) { + struct gl_client_array *cl = + &vbo->currval[VBO_ATTRIB_MAT_FRONT_AMBIENT + i]; + unsigned size; /* Size is fixed for the material attributes, for others will * be determined at runtime: */ - switch (i - VERT_ATTRIB_GENERIC0) { + switch (i) { case MAT_ATTRIB_FRONT_SHININESS: case MAT_ATTRIB_BACK_SHININESS: - cl->Size = 1; - break; + size = 1; + break; case MAT_ATTRIB_FRONT_INDEXES: case MAT_ATTRIB_BACK_INDEXES: - cl->Size = 3; - break; + size = 3; + break; default: - cl->Size = 4; - break; + size = 4; + break; } - cl->Ptr = (const void *)ctx->Light.Material.Attrib[i]; - cl->Type = GL_FLOAT; - cl->Format = GL_RGBA; - cl->Stride = 0; - cl->StrideB = 0; - cl->Enabled = 1; - cl->_ElementSize = cl->Size * sizeof(GLfloat); - _mesa_reference_buffer_object(ctx, &cl->BufferObj, - ctx->Shared->NullBufferObj); + init_array(ctx, cl, size, ctx->Light.Material.Attrib[i]); } } @@ -175,7 +163,7 @@ GLboolean _vbo_CreateContext( struct gl_context *ctx ) for (i = 0; i < ARRAY_SIZE(vbo->map_vp_none); i++) vbo->map_vp_none[i] = i; /* map material attribs to generic slots */ - for (i = 0; i < NR_MAT_ATTRIBS; i++) + for (i = 0; i < MAT_ATTRIB_MAX; i++) vbo->map_vp_none[VERT_ATTRIB_GENERIC(i)] = VBO_ATTRIB_MAT_FRONT_AMBIENT + i; diff --git a/src/mesa/vbo/vbo_exec.h b/src/mesa/vbo/vbo_exec.h index 80f3015925d..00378eb7984 100644 --- a/src/mesa/vbo/vbo_exec.h +++ b/src/mesa/vbo/vbo_exec.h @@ -79,7 +79,7 @@ struct vbo_exec_copied_vtx { struct vbo_exec_context { - struct gl_context *ctx; + struct gl_context *ctx; 
GLvertexformat vtxfmt; GLvertexformat vtxfmt_noop; GLboolean validating; /**< if we're in the middle of state validation */ @@ -97,15 +97,17 @@ struct vbo_exec_context GLuint buffer_used; /* in bytes */ fi_type vertex[VBO_ATTRIB_MAX*4]; /* current vertex */ - GLuint vert_count; - GLuint max_vert; + GLuint vert_count; /**< Number of vertices currently in buffer */ + GLuint max_vert; /**< Max number of vertices allowed in buffer */ struct vbo_exec_copied_vtx copied; - GLubyte attrsz[VBO_ATTRIB_MAX]; - GLenum attrtype[VBO_ATTRIB_MAX]; - GLubyte active_sz[VBO_ATTRIB_MAX]; + GLubyte attrsz[VBO_ATTRIB_MAX]; /**< nr. of attrib components (1..4) */ + GLenum attrtype[VBO_ATTRIB_MAX]; /**< GL_FLOAT, GL_DOUBLE, GL_INT, etc */ + GLubyte active_sz[VBO_ATTRIB_MAX]; /**< attrib size (nr. 32-bit words) */ + /** pointers into the current 'vertex' array, declared above */ fi_type *attrptr[VBO_ATTRIB_MAX]; + struct gl_client_array arrays[VERT_ATTRIB_MAX]; /* According to program mode, the values above plus current @@ -115,7 +117,6 @@ struct vbo_exec_context const struct gl_client_array *inputs[VERT_ATTRIB_MAX]; } vtx; - struct { GLboolean recalculate_maps; struct vbo_exec_eval1_map map1[VERT_ATTRIB_MAX]; @@ -131,7 +132,7 @@ struct vbo_exec_context GLboolean recalculate_inputs; } array; - /* Which flags to set in vbo_exec_BeginVertices() */ + /* Which flags to set in vbo_exec_begin_vertices() */ GLbitfield begin_vertices_flags; #ifdef DEBUG @@ -147,8 +148,6 @@ void vbo_exec_init( struct gl_context *ctx ); void vbo_exec_destroy( struct gl_context *ctx ); void vbo_exec_invalidate_state( struct gl_context *ctx, GLuint new_state ); -void vbo_exec_BeginVertices( struct gl_context *ctx ); - /* Internal functions: */ diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c index 583a2f9b79f..7ae08fe3062 100644 --- a/src/mesa/vbo/vbo_exec_api.c +++ b/src/mesa/vbo/vbo_exec_api.c @@ -375,13 +375,16 @@ vbo_exec_wrap_upgrade_vertex(struct vbo_exec_context *exec, * This is when a 
vertex attribute transitions to a different size. * For example, we saw a bunch of glTexCoord2f() calls and now we got a * glTexCoord4f() call. We promote the array from size=2 to size=4. + * \param newSize size of new vertex (number of 32-bit words). */ static void -vbo_exec_fixup_vertex(struct gl_context *ctx, GLuint attr, GLuint newSize, GLenum newType) +vbo_exec_fixup_vertex(struct gl_context *ctx, GLuint attr, + GLuint newSize, GLenum newType) { struct vbo_exec_context *exec = &vbo_context(ctx)->exec; - if (newSize > exec->vtx.attrsz[attr] || newType != exec->vtx.attrtype[attr]) { + if (newSize > exec->vtx.attrsz[attr] || + newType != exec->vtx.attrtype[attr]) { /* New size is larger. Need to flush existing vertices and get * an enlarged vertex format. */ @@ -411,20 +414,49 @@ vbo_exec_fixup_vertex(struct gl_context *ctx, GLuint attr, GLuint newSize, GLenu /** + * Called upon first glVertex, glColor, glTexCoord, etc. + */ +static void +vbo_exec_begin_vertices(struct gl_context *ctx) +{ + struct vbo_exec_context *exec = &vbo_context(ctx)->exec; + + vbo_exec_vtx_map( exec ); + + assert((ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT) == 0); + assert(exec->begin_vertices_flags); + + ctx->Driver.NeedFlush |= exec->begin_vertices_flags; +} + + +/** * This macro is used to implement all the glVertex, glColor, glTexCoord, * glVertexAttrib, etc functions. 
+ * \param A attribute index + * \param N attribute size (1..4) + * \param T type (GL_FLOAT, GL_DOUBLE, GL_INT, GL_UNSIGNED_INT) + * \param C cast type (fi_type or double) + * \param V0, V1, v2, V3 attribute value */ #define ATTR_UNION( A, N, T, C, V0, V1, V2, V3 ) \ do { \ struct vbo_exec_context *exec = &vbo_context(ctx)->exec; \ int sz = (sizeof(C) / sizeof(GLfloat)); \ - if (unlikely(!(ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT))) \ - vbo_exec_BeginVertices(ctx); \ \ + assert(sz == 1 || sz == 2); \ + \ + if (unlikely(!(ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT))) { \ + vbo_exec_begin_vertices(ctx); \ + } \ + \ + /* check if attribute size or type is changing */ \ if (unlikely(exec->vtx.active_sz[A] != N * sz) || \ - unlikely(exec->vtx.attrtype[A] != T)) \ + unlikely(exec->vtx.attrtype[A] != T)) { \ vbo_exec_fixup_vertex(ctx, A, N * sz, T); \ + } \ \ + /* store vertex attribute in vertex buffer */ \ { \ C *dest = (C *)exec->vtx.attrptr[A]; \ if (N>0) dest[0] = V0; \ @@ -438,6 +470,7 @@ do { \ /* This is a glVertex call */ \ GLuint i; \ \ + /* copy 32-bit words */ \ for (i = 0; i < exec->vtx.vertex_size; i++) \ exec->vtx.buffer_ptr[i] = exec->vtx.vertex[i]; \ \ @@ -1149,22 +1182,6 @@ void vbo_exec_vtx_destroy( struct vbo_exec_context *exec ) /** - * Called upon first glVertex, glColor, glTexCoord, etc. - */ -void vbo_exec_BeginVertices( struct gl_context *ctx ) -{ - struct vbo_exec_context *exec = &vbo_context(ctx)->exec; - - vbo_exec_vtx_map( exec ); - - assert((ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT) == 0); - assert(exec->begin_vertices_flags); - - ctx->Driver.NeedFlush |= exec->begin_vertices_flags; -} - - -/** * If inside glBegin()/glEnd(), it should assert(0). 
Otherwise, if * FLUSH_STORED_VERTICES bit in \p flags is set flushes any buffered * vertices, if FLUSH_UPDATE_CURRENT bit is set updates @@ -1197,7 +1214,7 @@ void vbo_exec_FlushVertices( struct gl_context *ctx, GLuint flags ) /* Flush (draw), and make sure VBO is left unmapped when done */ vbo_exec_FlushVertices_internal(exec, GL_TRUE); - /* Need to do this to ensure vbo_exec_BeginVertices gets called again: + /* Need to do this to ensure vbo_exec_begin_vertices gets called again: */ ctx->Driver.NeedFlush &= ~(FLUSH_UPDATE_CURRENT | flags); diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c index 2bfb0c32b73..174cbc37c26 100644 --- a/src/mesa/vbo/vbo_exec_draw.c +++ b/src/mesa/vbo/vbo_exec_draw.c @@ -53,10 +53,10 @@ vbo_exec_debug_verts( struct vbo_exec_context *exec ) for (i = 0 ; i < exec->vtx.prim_count ; i++) { struct _mesa_prim *prim = &exec->vtx.prim[i]; printf(" prim %d: %s%s %d..%d %s %s\n", - i, + i, _mesa_lookup_prim_by_nr(prim->mode), prim->weak ? " (weak)" : "", - prim->start, + prim->start, prim->start + prim->count, prim->begin ? "BEGIN" : "(wrap)", prim->end ? 
"END" : "(wrap)"); @@ -79,7 +79,6 @@ vbo_copy_vertices( struct vbo_exec_context *exec ) exec->vtx.prim[exec->vtx.prim_count-1].start * exec->vtx.vertex_size); - switch (exec->ctx->Driver.CurrentExecPrimitive) { case GL_POINTS: return 0; @@ -219,7 +218,7 @@ vbo_exec_bind_arrays( struct gl_context *ctx ) exec->vtx.inputs[attr] = &arrays[attr]; if (_mesa_is_bufferobj(exec->vtx.bufferobj)) { - /* a real buffer obj: Ptr is an offset, not a pointer*/ + /* a real buffer obj: Ptr is an offset, not a pointer */ assert(exec->vtx.bufferobj->Mappings[MAP_INTERNAL].Pointer); assert(offset >= 0); arrays[attr].Ptr = (GLubyte *) @@ -259,7 +258,7 @@ vbo_exec_vtx_unmap( struct vbo_exec_context *exec ) { if (_mesa_is_bufferobj(exec->vtx.bufferobj)) { struct gl_context *ctx = exec->ctx; - + if (ctx->Driver.FlushMappedBufferRange) { GLintptr offset = exec->vtx.buffer_used - exec->vtx.bufferobj->Mappings[MAP_INTERNAL].Offset; @@ -277,7 +276,7 @@ vbo_exec_vtx_unmap( struct vbo_exec_context *exec ) assert(exec->vtx.buffer_used <= VBO_VERT_BUFFER_SIZE); assert(exec->vtx.buffer_ptr != NULL); - + ctx->Driver.UnmapBuffer(ctx, exec->vtx.bufferobj, MAP_INTERNAL); exec->vtx.buffer_map = NULL; exec->vtx.buffer_ptr = NULL; @@ -299,7 +298,7 @@ vbo_exec_vtx_map( struct vbo_exec_context *exec ) GL_MAP_FLUSH_EXPLICIT_BIT | MESA_MAP_NOWAIT_BIT; const GLenum usage = GL_STREAM_DRAW_ARB; - + if (!_mesa_is_bufferobj(exec->vtx.bufferobj)) return; @@ -323,7 +322,7 @@ vbo_exec_vtx_map( struct vbo_exec_context *exec ) exec->vtx.buffer_ptr = exec->vtx.buffer_map = NULL; } } - + if (!exec->vtx.buffer_map) { /* Need to allocate a new VBO */ exec->vtx.buffer_used = 0; @@ -381,14 +380,14 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped) if (0) vbo_exec_debug_verts( exec ); - if (exec->vtx.prim_count && + if (exec->vtx.prim_count && exec->vtx.vert_count) { - exec->vtx.copied.nr = vbo_copy_vertices( exec ); + exec->vtx.copied.nr = vbo_copy_vertices( exec ); if (exec->vtx.copied.nr != 
exec->vtx.vert_count) { struct gl_context *ctx = exec->ctx; - + /* Before the update_state() as this may raise _NEW_VARYING_VP_INPUTS * from _mesa_set_varying_vp_inputs(). */ @@ -405,7 +404,7 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped) printf("%s %d %d\n", __func__, exec->vtx.prim_count, exec->vtx.vert_count); - vbo_context(ctx)->draw_prims( ctx, + vbo_context(ctx)->draw_prims( ctx, exec->vtx.prim, exec->vtx.prim_count, NULL, @@ -433,7 +432,7 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped) if (keepUnmapped || exec->vtx.vertex_size == 0) exec->vtx.max_vert = 0; else - exec->vtx.max_vert = ((VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) / + exec->vtx.max_vert = ((VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) / (exec->vtx.vertex_size * sizeof(GLfloat))); exec->vtx.buffer_ptr = exec->vtx.buffer_map; diff --git a/src/mesa/vbo/vbo_save_api.c b/src/mesa/vbo/vbo_save_api.c index 1a70d168c55..fdc677f9a07 100644 --- a/src/mesa/vbo/vbo_save_api.c +++ b/src/mesa/vbo/vbo_save_api.c @@ -648,7 +648,8 @@ _save_upgrade_vertex(struct gl_context *ctx, GLuint attr, GLuint newsz) /* Recalculate all the attrptr[] values: */ - for (i = 0, tmp = save->vertex; i < VBO_ATTRIB_MAX; i++) { + tmp = save->vertex; + for (i = 0; i < VBO_ATTRIB_MAX; i++) { if (save->attrsz[i]) { save->attrptr[i] = tmp; tmp += save->attrsz[i]; @@ -1543,7 +1544,7 @@ vbo_print_vertex_list(struct gl_context *ctx, void *data, FILE *f) node->vertex_store->bufferobj : NULL; (void) ctx; - fprintf(f, "VBO-VERTEX-LIST, %u vertices %d primitives, %d vertsize " + fprintf(f, "VBO-VERTEX-LIST, %u vertices, %d primitives, %d vertsize, " "buffer %p\n", node->count, node->prim_count, node->vertex_size, buffer); diff --git a/src/util/Makefile.sources b/src/util/Makefile.sources index e45431d1de8..a87114601c8 100644 --- a/src/util/Makefile.sources +++ b/src/util/Makefile.sources @@ -3,6 +3,8 @@ MESA_UTIL_FILES := \ debug.c \ debug.h \ format_srgb.h \ + half_float.c \ + 
half_float.h \ hash_table.c \ hash_table.h \ list.h \ diff --git a/src/util/half_float.c b/src/util/half_float.c new file mode 100644 index 00000000000..4df64c2ccf9 --- /dev/null +++ b/src/util/half_float.c @@ -0,0 +1,177 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <math.h> +#include <assert.h> +#include "half_float.h" +#include "rounding.h" + +typedef union { float f; int32_t i; uint32_t u; } fi_type; + +/** + * Convert a 4-byte float to a 2-byte half float. + * + * Not all float32 values can be represented exactly as a float16 value. We + * round such intermediate float32 values to the nearest float16. When the + * float32 lies exactly between to float16 values, we round to the one with + * an even mantissa. + * + * This rounding behavior has several benefits: + * - It has no sign bias. 
+ * + * - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's + * GPU ISA. + * + * - By reproducing the behavior of the GPU (at least on Intel hardware), + * compile-time evaluation of constant packHalf2x16 GLSL expressions will + * result in the same value as if the expression were executed on the GPU. + */ +uint16_t +_mesa_float_to_half(float val) +{ + const fi_type fi = {val}; + const int flt_m = fi.i & 0x7fffff; + const int flt_e = (fi.i >> 23) & 0xff; + const int flt_s = (fi.i >> 31) & 0x1; + int s, e, m = 0; + uint16_t result; + + /* sign bit */ + s = flt_s; + + /* handle special cases */ + if ((flt_e == 0) && (flt_m == 0)) { + /* zero */ + /* m = 0; - already set */ + e = 0; + } + else if ((flt_e == 0) && (flt_m != 0)) { + /* denorm -- denorm float maps to 0 half */ + /* m = 0; - already set */ + e = 0; + } + else if ((flt_e == 0xff) && (flt_m == 0)) { + /* infinity */ + /* m = 0; - already set */ + e = 31; + } + else if ((flt_e == 0xff) && (flt_m != 0)) { + /* NaN */ + m = 1; + e = 31; + } + else { + /* regular number */ + const int new_exp = flt_e - 127; + if (new_exp < -14) { + /* The float32 lies in the range (0.0, min_normal16) and is rounded + * to a nearby float16 value. The result will be either zero, subnormal, + * or normal. + */ + e = 0; + m = _mesa_lroundevenf((1 << 24) * fabsf(fi.f)); + } + else if (new_exp > 15) { + /* map this value to infinity */ + /* m = 0; - already set */ + e = 31; + } + else { + /* The float32 lies in the range + * [min_normal16, max_normal16 + max_step16) + * and is rounded to a nearby float16 value. The result will be + * either normal or infinite. + */ + e = new_exp + 15; + m = _mesa_lroundevenf(flt_m / (float) (1 << 13)); + } + } + + assert(0 <= m && m <= 1024); + if (m == 1024) { + /* The float32 was rounded upwards into the range of the next exponent, + * so bump the exponent. This correctly handles the case where f32 + * should be rounded up to float16 infinity. 
+ */ + ++e; + m = 0; + } + + result = (s << 15) | (e << 10) | m; + return result; +} + + +/** + * Convert a 2-byte half float to a 4-byte float. + * Based on code from: + * http://www.opengl.org/discussion_boards/ubb/Forum3/HTML/008786.html + */ +float +_mesa_half_to_float(uint16_t val) +{ + /* XXX could also use a 64K-entry lookup table */ + const int m = val & 0x3ff; + const int e = (val >> 10) & 0x1f; + const int s = (val >> 15) & 0x1; + int flt_m, flt_e, flt_s; + fi_type fi; + float result; + + /* sign bit */ + flt_s = s; + + /* handle special cases */ + if ((e == 0) && (m == 0)) { + /* zero */ + flt_m = 0; + flt_e = 0; + } + else if ((e == 0) && (m != 0)) { + /* denorm -- denorm half will fit in non-denorm single */ + const float half_denorm = 1.0f / 16384.0f; /* 2^-14 */ + float mantissa = ((float) (m)) / 1024.0f; + float sign = s ? -1.0f : 1.0f; + return sign * mantissa * half_denorm; + } + else if ((e == 31) && (m == 0)) { + /* infinity */ + flt_e = 0xff; + flt_m = 0; + } + else if ((e == 31) && (m != 0)) { + /* NaN */ + flt_e = 0xff; + flt_m = 1; + } + else { + /* regular */ + flt_e = e + 112; + flt_m = m << 13; + } + + fi.i = (flt_s << 31) | (flt_e << 23) | flt_m; + result = fi.f; + return result; +} diff --git a/src/util/half_float.h b/src/util/half_float.h new file mode 100644 index 00000000000..64f20421018 --- /dev/null +++ b/src/util/half_float.h @@ -0,0 +1,41 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef _HALF_FLOAT_H_ +#define _HALF_FLOAT_H_ + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +uint16_t _mesa_float_to_half(float val); +float _mesa_half_to_float(uint16_t val); + +#ifdef __cplusplus +} /* extern C */ +#endif + +#endif /* _HALF_FLOAT_H_ */ diff --git a/src/vulkan/Makefile.am b/src/vulkan/Makefile.am index 985864a87fe..5abbd379b54 100644 --- a/src/vulkan/Makefile.am +++ b/src/vulkan/Makefile.am @@ -42,6 +42,7 @@ AM_CPPFLAGS = \ $(DEFINES) \ -I$(top_srcdir)/include \ -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/glsl/nir \ -I$(top_srcdir)/src/mapi \ -I$(top_srcdir)/src/mesa \ -I$(top_srcdir)/src/mesa/drivers/dri/common \ diff --git a/src/vulkan/anv_compiler.cpp b/src/vulkan/anv_compiler.cpp index a3b8d1cc80c..2b8e7cee9aa 100644 --- a/src/vulkan/anv_compiler.cpp +++ b/src/vulkan/anv_compiler.cpp @@ -36,6 +36,7 @@ #include <brw_gs.h> #include <brw_cs.h> #include "brw_vec4_gs_visitor.h" +#include <brw_compiler.h> #include <mesa/main/shaderobj.h> #include <mesa/main/fbobject.h> @@ -307,8 +308,9 @@ really_do_vs_prog(struct brw_context *brw, /* Emit GEN4 code. 
*/ - program = brw_vs_emit(brw, mem_ctx, key, prog_data, &vp->program, - prog, -1, &program_size); + program = brw_compile_vs(brw->intelScreen->compiler, brw, mem_ctx, + key, prog_data, vs->Program->nir, NULL, false, -1, + &program_size, NULL); if (program == NULL) { ralloc_free(mem_ctx); return false; @@ -562,8 +564,9 @@ really_do_wm_prog(struct brw_context *brw, */ prog_data->binding_table.render_target_start = 0; - program = brw_wm_fs_emit(brw, mem_ctx, key, prog_data, - &fp->program, prog, -1, -1, &program_size); + program = brw_compile_fs(brw->intelScreen->compiler, brw, mem_ctx, key, + prog_data, fp->program.Base.nir, fs->Program, + -1, -1, brw->use_rep_send, &program_size, NULL); if (program == NULL) { ralloc_free(mem_ctx); return false; @@ -831,7 +834,8 @@ anv_codegen_gs_prog(struct brw_context *brw, void *mem_ctx = ralloc_context(NULL); unsigned program_size; const unsigned *program = - brw_gs_emit(brw, prog, &c, mem_ctx, -1, &program_size); + brw_compile_gs(brw->intelScreen->compiler, brw, &c, gp->program.Base.nir, + prog, mem_ctx, -1, &program_size, NULL); if (program == NULL) { ralloc_free(mem_ctx); return false; @@ -867,8 +871,9 @@ brw_codegen_cs_prog(struct brw_context *brw, anv_nir_apply_dynamic_offsets(pipeline, cs->Program->nir, &prog_data->base); anv_nir_apply_pipeline_layout(cs->Program->nir, pipeline->layout); - program = brw_cs_emit(brw, mem_ctx, key, prog_data, - &cp->program, prog, -1, &program_size); + program = brw_compile_cs(brw->intelScreen->compiler, brw, mem_ctx, key, + prog_data, cs->Program->nir, -1, + &program_size, NULL); if (program == NULL) { ralloc_free(mem_ctx); return false; @@ -1142,10 +1147,13 @@ setup_nir_io(struct gl_shader *mesa_shader, prog->OutputsWritten |= BITFIELD64_BIT(var->data.location); } + shader->info.system_values_read = 0; + foreach_list_typed(nir_variable, var, node, &shader->system_values) { + shader->info.system_values_read |= BITFIELD64_BIT(var->data.location); + } + shader->info.inputs_read = 
prog->InputsRead; shader->info.outputs_written = prog->OutputsWritten; - - mesa_shader->num_uniform_components = shader->num_uniforms; } static void @@ -1163,7 +1171,7 @@ anv_compile_shader_spirv(struct anv_compiler *compiler, "failed to create %s shader\n", stage_info[stage].name); #define CREATE_PROGRAM(stage) \ - _mesa_init_##stage##_program(&brw->ctx, &ralloc(mesa_shader, struct brw_##stage##_program)->program, 0, 0) + &ralloc(mesa_shader, struct brw_##stage##_program)->program.Base bool is_scalar; struct gl_program *prog; @@ -1187,6 +1195,7 @@ anv_compile_shader_spirv(struct anv_compiler *compiler, default: unreachable("Unsupported shader stage"); } + _mesa_init_gl_program(prog, 0, 0); _mesa_reference_program(&brw->ctx, &mesa_shader->Program, prog); mesa_shader->Program->Parameters = @@ -1215,11 +1224,14 @@ anv_compile_shader_spirv(struct anv_compiler *compiler, } nir_validate_shader(mesa_shader->Program->nir); + setup_nir_io(mesa_shader, mesa_shader->Program->nir); + brw_process_nir(mesa_shader->Program->nir, compiler->screen->devinfo, NULL, mesa_shader->Stage, is_scalar); - setup_nir_io(mesa_shader, mesa_shader->Program->nir); + mesa_shader->num_uniform_components = + mesa_shader->Program->nir->num_uniforms; fail_if(mesa_shader->Program->nir == NULL, "failed to translate SPIR-V to NIR\n"); diff --git a/src/vulkan/anv_meta.c b/src/vulkan/anv_meta.c index 8f6bc421194..76b8c4173e6 100644 --- a/src/vulkan/anv_meta.c +++ b/src/vulkan/anv_meta.c @@ -39,13 +39,11 @@ build_nir_vertex_shader(bool attr_flat) nir_builder_init_simple_shader(&b, MESA_SHADER_VERTEX); - nir_variable *pos_in = nir_variable_create(b.shader, "a_pos", - vertex_type, - nir_var_shader_in); + nir_variable *pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vertex_type, "a_pos"); pos_in->data.location = VERT_ATTRIB_GENERIC0; - nir_variable *pos_out = nir_variable_create(b.shader, "gl_Position", - vertex_type, - nir_var_shader_out); + nir_variable *pos_out = nir_variable_create(b.shader, 
nir_var_shader_out, + vertex_type, "gl_Position"); pos_in->data.location = VARYING_SLOT_POS; nir_copy_var(&b, pos_out, pos_in); @@ -53,11 +51,11 @@ build_nir_vertex_shader(bool attr_flat) * to store the color and for blit shaders it's the texture coordinate. */ const struct glsl_type *attr_type = glsl_vec4_type(); - nir_variable *attr_in = nir_variable_create(b.shader, "a_attr", attr_type, - nir_var_shader_in); + nir_variable *attr_in = nir_variable_create(b.shader, nir_var_shader_in, + attr_type, "a_attr"); attr_in->data.location = VERT_ATTRIB_GENERIC1; - nir_variable *attr_out = nir_variable_create(b.shader, "v_attr", attr_type, - nir_var_shader_out); + nir_variable *attr_out = nir_variable_create(b.shader, nir_var_shader_out, + attr_type, "v_attr"); attr_out->data.location = VARYING_SLOT_VAR0; attr_out->data.interpolation = attr_flat ? INTERP_QUALIFIER_FLAT : INTERP_QUALIFIER_SMOOTH; @@ -75,14 +73,12 @@ build_nir_clear_fragment_shader(void) nir_builder_init_simple_shader(&b, MESA_SHADER_FRAGMENT); - nir_variable *color_in = nir_variable_create(b.shader, "v_attr", - color_type, - nir_var_shader_in); + nir_variable *color_in = nir_variable_create(b.shader, nir_var_shader_in, + color_type, "v_attr"); color_in->data.location = VARYING_SLOT_VAR0; color_in->data.interpolation = INTERP_QUALIFIER_FLAT; - nir_variable *color_out = nir_variable_create(b.shader, "f_color", - color_type, - nir_var_shader_out); + nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out, + color_type, "f_color"); color_out->data.location = FRAG_RESULT_DATA0; nir_copy_var(&b, color_out, color_in); @@ -98,15 +94,14 @@ build_nir_copy_fragment_shader(enum glsl_sampler_dim tex_dim) const struct glsl_type *color_type = glsl_vec4_type(); - nir_variable *tex_pos_in = nir_variable_create(b.shader, "v_attr", - glsl_vec4_type(), - nir_var_shader_in); + nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in, + glsl_vec4_type(), "v_attr"); tex_pos_in->data.location = 
VARYING_SLOT_VAR0; const struct glsl_type *sampler_type = glsl_sampler_type(tex_dim, false, false, glsl_get_base_type(color_type)); - nir_variable *sampler = nir_variable_create(b.shader, "s_tex", sampler_type, - nir_var_uniform); + nir_variable *sampler = nir_variable_create(b.shader, nir_var_uniform, + sampler_type, "s_tex"); sampler->data.descriptor_set = 0; sampler->data.binding = 0; @@ -133,9 +128,8 @@ build_nir_copy_fragment_shader(enum glsl_sampler_dim tex_dim) nir_ssa_dest_init(&tex->instr, &tex->dest, 4, "tex"); nir_builder_instr_insert(&b, &tex->instr); - nir_variable *color_out = nir_variable_create(b.shader, "f_color", - color_type, - nir_var_shader_out); + nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out, + color_type, "f_color"); color_out->data.location = FRAG_RESULT_DATA0; nir_store_var(&b, color_out, &tex->dest.ssa); diff --git a/src/vulkan/anv_nir_builder.h b/src/vulkan/anv_nir_builder.h index 299c8c1aad0..f26cb046a6b 100644 --- a/src/vulkan/anv_nir_builder.h +++ b/src/vulkan/anv_nir_builder.h @@ -54,49 +54,3 @@ nir_copy_var(nir_builder *build, nir_variable *dest, nir_variable *src) copy->variables[1] = nir_deref_var_create(copy, src); nir_builder_instr_insert(build, &copy->instr); } - -static inline nir_variable * -nir_variable_create(nir_shader *shader, const char *name, - const struct glsl_type *type, nir_variable_mode mode) -{ - nir_variable *var = rzalloc(shader, nir_variable); - var->name = ralloc_strdup(var, name); - var->type = type; - var->data.mode = mode; - - if ((mode == nir_var_shader_in && shader->stage != MESA_SHADER_VERTEX) || - (mode == nir_var_shader_out && shader->stage != MESA_SHADER_FRAGMENT)) - var->data.interpolation = INTERP_QUALIFIER_SMOOTH; - - switch (var->data.mode) { - case nir_var_local: - assert(!"nir_variable_create cannot be used for local variables"); - break; - - case nir_var_global: - exec_list_push_tail(&shader->globals, &var->node); - break; - - case nir_var_shader_in: - 
exec_list_push_tail(&shader->inputs, &var->node); - break; - - case nir_var_shader_out: - exec_list_push_tail(&shader->outputs, &var->node); - break; - - case nir_var_uniform: - case nir_var_shader_storage: - exec_list_push_tail(&shader->uniforms, &var->node); - break; - - case nir_var_system_value: - exec_list_push_tail(&shader->system_values, &var->node); - break; - - default: - unreachable("not reached"); - } - - return var; -} |