diff options
Diffstat (limited to 'src/gallium/drivers')
23 files changed, 9082 insertions, 5020 deletions
diff --git a/src/gallium/drivers/nv50/Makefile b/src/gallium/drivers/nv50/Makefile index e31e6f8662a..bf1e8201a08 100644 --- a/src/gallium/drivers/nv50/Makefile +++ b/src/gallium/drivers/nv50/Makefile @@ -8,9 +8,9 @@ C_SOURCES = \ nv50_clear.c \ nv50_context.c \ nv50_draw.c \ + nv50_formats.c \ nv50_miptree.c \ nv50_query.c \ - nv50_program.c \ nv50_resource.c \ nv50_screen.c \ nv50_state.c \ @@ -19,6 +19,14 @@ C_SOURCES = \ nv50_tex.c \ nv50_transfer.c \ nv50_vbo.c \ - nv50_push.c + nv50_push.c \ + nv50_program.c \ + nv50_shader_state.c \ + nv50_pc.c \ + nv50_pc_print.c \ + nv50_pc_emit.c \ + nv50_tgsi_to_nc.c \ + nv50_pc_optimize.c \ + nv50_pc_regalloc.c include ../../Makefile.template diff --git a/src/gallium/drivers/nv50/SConscript b/src/gallium/drivers/nv50/SConscript index 8625f926221..e4a93c15ce4 100644 --- a/src/gallium/drivers/nv50/SConscript +++ b/src/gallium/drivers/nv50/SConscript @@ -9,6 +9,7 @@ nv50 = env.ConvenienceLibrary( 'nv50_clear.c', 'nv50_context.c', 'nv50_draw.c', + 'nv50_formats.c', 'nv50_miptree.c', 'nv50_query.c', 'nv50_program.c', diff --git a/src/gallium/drivers/nv50/nv50_formats.c b/src/gallium/drivers/nv50/nv50_formats.c new file mode 100644 index 00000000000..3be39d5337a --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_formats.c @@ -0,0 +1,452 @@ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "nv50_screen.h" +#include "nv50_texture.h" +#include "nv50_reg.h" +#include "pipe/p_defines.h" + +#define A_(cr, cg, cb, ca, t0, t1, t2, t3, sz, r) \ + NV50TIC_0_0_MAPR_##cr | NV50TIC_0_0_TYPER_##t0 | \ + NV50TIC_0_0_MAPG_##cg | NV50TIC_0_0_TYPEG_##t1 | \ + NV50TIC_0_0_MAPB_##cb | NV50TIC_0_0_TYPEB_##t2 | \ + NV50TIC_0_0_MAPA_##ca | NV50TIC_0_0_TYPEA_##t3 | \ + NV50TIC_0_0_FMT_##sz, \ + NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_##sz | \ + NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_##t0 | \ + (NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_##t0 << 3) | (r << 31) + +#define B_(cr, cg, cb, ca, t0, t1, t2, t3, sz, r) \ + NV50TIC_0_0_MAPR_##cr | NV50TIC_0_0_TYPER_##t0 | \ + NV50TIC_0_0_MAPG_##cg | NV50TIC_0_0_TYPEG_##t1 | \ + NV50TIC_0_0_MAPB_##cb | NV50TIC_0_0_TYPEB_##t2 | \ + NV50TIC_0_0_MAPA_##ca | NV50TIC_0_0_TYPEA_##t3 | \ + NV50TIC_0_0_FMT_##sz, 0 + +#define VERTEX_BUFFER PIPE_BIND_VERTEX_BUFFER +#define SAMPLER_VIEW PIPE_BIND_SAMPLER_VIEW +#define RENDER_TARGET PIPE_BIND_RENDER_TARGET +#define DEPTH_STENCIL PIPE_BIND_DEPTH_STENCIL +#define SCANOUT PIPE_BIND_SCANOUT + +/* for vertex buffers: */ +#define NV50TIC_0_0_FMT_8_8_8 NV50TIC_0_0_FMT_8_8_8_8 +#define NV50TIC_0_0_FMT_16_16_16 NV50TIC_0_0_FMT_16_16_16_16 +#define NV50TIC_0_0_FMT_32_32_32 NV50TIC_0_0_FMT_32_32_32_32 + +const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] = +{ + /* COMMON FORMATS */ + + [PIPE_FORMAT_B8G8R8A8_UNORM] = { NV50TCL_RT_FORMAT_A8R8G8B8_UNORM, + A_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 1), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET | SCANOUT }, + + [PIPE_FORMAT_B8G8R8X8_UNORM] = { NV50TCL_RT_FORMAT_X8R8G8B8_UNORM, + A_(C2, C1, C0, ONE, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 1), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET | SCANOUT }, + + [PIPE_FORMAT_B8G8R8A8_SRGB] = { NV50TCL_RT_FORMAT_A8R8G8B8_SRGB, + A_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 1), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_B8G8R8X8_SRGB] = { NV50TCL_RT_FORMAT_X8R8G8B8_SRGB, + A_(C2, C1, C0, ONE, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 1), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_B5G6R5_UNORM] = { NV50TCL_RT_FORMAT_R5G6B5_UNORM, + B_(C2, C1, C0, ONE, UNORM, UNORM, UNORM, UNORM, 5_6_5, 1), + SAMPLER_VIEW | RENDER_TARGET | SCANOUT }, + + [PIPE_FORMAT_B5G5R5A1_UNORM] = { NV50TCL_RT_FORMAT_A1R5G5B5_UNORM, + B_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 1_5_5_5, 1), + SAMPLER_VIEW | RENDER_TARGET | SCANOUT }, + + [PIPE_FORMAT_B4G4R4A4_UNORM] = { 0, + B_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 4_4_4_4, 1), + SAMPLER_VIEW }, + + [PIPE_FORMAT_R10G10B10A2_UNORM] = { NV50TCL_RT_FORMAT_A2B10G10R10_UNORM, + A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 2_10_10_10, 0), + SAMPLER_VIEW | RENDER_TARGET | VERTEX_BUFFER | SCANOUT }, + + [PIPE_FORMAT_B10G10R10A2_UNORM] = { NV50TCL_RT_FORMAT_A2R10G10B10_UNORM, + A_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 2_10_10_10, 1), + SAMPLER_VIEW | RENDER_TARGET | VERTEX_BUFFER }, + + /* DEPTH/STENCIL FORMATS */ + + [PIPE_FORMAT_Z16_UNORM] = { NV50TCL_ZETA_FORMAT_Z16_UNORM, + B_(C0, C0, C0, ONE, UNORM, UINT, UINT, UINT, 16_DEPTH, 0), + SAMPLER_VIEW | DEPTH_STENCIL }, + + [PIPE_FORMAT_Z24_UNORM_S8_USCALED] = { NV50TCL_ZETA_FORMAT_S8Z24_UNORM, + B_(C0, C0, C0, ONE, UNORM, UINT, UINT, UINT, 8_24, 0), + SAMPLER_VIEW | DEPTH_STENCIL }, + + [PIPE_FORMAT_Z24X8_UNORM] = { NV50TCL_ZETA_FORMAT_X8Z24_UNORM, + B_(C0, C0, C0, ONE, UNORM, UINT, UINT, UINT, 8_24, 0), + SAMPLER_VIEW | DEPTH_STENCIL }, + + [PIPE_FORMAT_S8_USCALED_Z24_UNORM] = { NV50TCL_ZETA_FORMAT_S8Z24_UNORM, + B_(C1, C1, C1, ONE, UINT, UNORM, UINT, UINT, 24_8, 0), + SAMPLER_VIEW | DEPTH_STENCIL }, + + [PIPE_FORMAT_Z32_FLOAT] = { NV50TCL_ZETA_FORMAT_Z32_FLOAT, + B_(C0, C0, C0, ONE, FLOAT, UINT, UINT, UINT, 32_DEPTH, 0), + SAMPLER_VIEW | DEPTH_STENCIL }, + + [PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED] = { + NV50TCL_ZETA_FORMAT_Z32_FLOAT_X24S8_UNORM, + B_(C0, C0, C0, ONE, FLOAT, UINT, UINT, UINT, 32_8, 0), + SAMPLER_VIEW | DEPTH_STENCIL }, + + /* LUMINANCE, ALPHA, INTENSITY */ + + [PIPE_FORMAT_L8_UNORM] = { 0, + A_(C0, C0, C0, ONE, UNORM, UNORM, UNORM, UNORM, 8, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_L8_SRGB] = { 0, + A_(C0, C0, C0, ONE, UNORM, UNORM, UNORM, UNORM, 8, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_I8_UNORM] = { 0, + A_(C0, C0, C0, C0, UNORM, UNORM, UNORM, UNORM, 8, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_A8_UNORM] = { NV50TCL_RT_FORMAT_A8_UNORM, + A_(ZERO, ZERO, ZERO, C0, UNORM, UNORM, UNORM, UNORM, 8, 0), + SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_L8A8_UNORM] = { 0, + A_(C0, C0, C0, C1, UNORM, UNORM, UNORM, UNORM, 8_8, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_L8A8_SRGB] = { 0, + A_(C0, C0, C0, C1, UNORM, UNORM, UNORM, UNORM, 8_8, 0), + SAMPLER_VIEW }, + + /* DXT, RGTC */ + + [PIPE_FORMAT_DXT1_RGB] = { 0, + B_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, DXT1, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_DXT1_RGBA] = { 0, + B_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, DXT1, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_DXT3_RGBA] = { 0, + B_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, DXT3, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_DXT5_RGBA] = { 0, + B_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, DXT5, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_RGTC1_UNORM] = { 0, + B_(C0, ZERO, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, RGTC1, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_RGTC1_SNORM] = { 0, + B_(C0, ZERO, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, RGTC1, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_RGTC2_UNORM] = { 0, + B_(C0, C1, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, RGTC2, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_RGTC2_SNORM] = { 0, + B_(C0, C1, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, RGTC2, 0), + SAMPLER_VIEW }, + + /* FLOAT 16 */ + + [PIPE_FORMAT_R16G16B16A16_FLOAT] = { NV50TCL_RT_FORMAT_R16G16B16A16_FLOAT, + A_(C0, C1, C2, C3, FLOAT, FLOAT, FLOAT, FLOAT, 16_16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R16G16B16_FLOAT] = { NV50TCL_RT_FORMAT_R16G16B16X16_FLOAT, + A_(C0, C1, C2, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R16G16_FLOAT] = { NV50TCL_RT_FORMAT_R16G16_FLOAT, + A_(C0, C1, ZERO, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R16_FLOAT] = { NV50TCL_RT_FORMAT_R16_FLOAT, + A_(C0, ZERO, ZERO, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + /* FLOAT 32 */ + + [PIPE_FORMAT_R32G32B32A32_FLOAT] = { NV50TCL_RT_FORMAT_R32G32B32A32_FLOAT, + A_(C0, C1, C2, C3, FLOAT, FLOAT, FLOAT, FLOAT, 32_32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R32G32B32_FLOAT] = { NV50TCL_RT_FORMAT_R32G32B32X32_FLOAT, + A_(C0, C1, C2, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R32G32_FLOAT] = { NV50TCL_RT_FORMAT_R32G32_FLOAT, + A_(C0, C1, ZERO, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R32_FLOAT] = { NV50TCL_RT_FORMAT_R32_FLOAT, + A_(C0, ZERO, ZERO, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 32, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + /* ODD FORMATS */ + + [PIPE_FORMAT_R11G11B10_FLOAT] = { NV50TCL_RT_FORMAT_B10G11R11_FLOAT, + B_(C0, C1, C2, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 10_11_11, 0), + SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R9G9B9E5_FLOAT] = { 0, + B_(C0, C1, C2, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 5_9_9_9, 0), + SAMPLER_VIEW }, + + /* SNORM 32 */ + + [PIPE_FORMAT_R32G32B32A32_SNORM] = { 0, + A_(C0, C1, C2, C3, FLOAT, FLOAT, FLOAT, FLOAT, 32_32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32B32_SNORM] = { 0, + A_(C0, C1, C2, ONE, SNORM, SNORM, SNORM, SNORM, 32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32_SNORM] = { 0, + A_(C0, C1, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32_SNORM] = { 0, + A_(C0, ZERO, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* UNORM 32 */ + + [PIPE_FORMAT_R32G32B32A32_UNORM] = { 0, + A_(C0, C1, C2, C3, FLOAT, FLOAT, FLOAT, FLOAT, 32_32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32B32_UNORM] = { 0, + A_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, 32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32_UNORM] = { 0, + A_(C0, C1, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32_UNORM] = { 0, + A_(C0, ZERO, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* SNORM 16 */ + + [PIPE_FORMAT_R16G16B16A16_SNORM] = { NV50TCL_RT_FORMAT_R16G16B16A16_SNORM, + A_(C0, C1, C2, C3, SNORM, SNORM, SNORM, SNORM, 16_16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R16G16B16_SNORM] = { 0, + A_(C0, C1, C2, ONE, SNORM, SNORM, SNORM, SNORM, 16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16G16_SNORM] = { NV50TCL_RT_FORMAT_R16G16_SNORM, + A_(C0, C1, C2, C3, SNORM, SNORM, SNORM, SNORM, 16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R16_SNORM] = { 0, + A_(C0, ZERO, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* UNORM 16 */ + + [PIPE_FORMAT_R16G16B16A16_UNORM] = { NV50TCL_RT_FORMAT_R16G16B16A16_UNORM, + A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 16_16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R16G16B16_UNORM] = { 0, + A_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, 16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16G16_UNORM] = { NV50TCL_RT_FORMAT_R16G16_UNORM, + A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R16_UNORM] = { 0, + A_(C0, ZERO, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* SNORM 8 */ + + [PIPE_FORMAT_R8G8B8A8_SNORM] = { NV50TCL_RT_FORMAT_A8B8G8R8_SNORM, + A_(C0, C1, C2, C3, SNORM, SNORM, SNORM, SNORM, 8_8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R8G8B8_SNORM] = { 0, + A_(C0, C1, C2, ONE, SNORM, SNORM, SNORM, SNORM, 8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R8G8_SNORM] = { NV50TCL_RT_FORMAT_R8G8_SNORM, + A_(C0, C1, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R8_SNORM] = { NV50TCL_RT_FORMAT_R8_SNORM, + A_(C0, ZERO, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 8, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + /* UNORM 8 */ + + [PIPE_FORMAT_R8G8B8A8_UNORM] = { NV50TCL_RT_FORMAT_A8B8G8R8_UNORM, + A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R8G8B8A8_SRGB] = { NV50TCL_RT_FORMAT_A8B8G8R8_SRGB, + A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 0), + SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R8G8B8_UNORM] = { NV50TCL_RT_FORMAT_X8B8G8R8_UNORM, + A_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, 8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R8G8B8_SRGB] = { NV50TCL_RT_FORMAT_X8B8G8R8_SRGB, + A_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, 8_8_8, 0), + SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R8G8_UNORM] = { NV50TCL_RT_FORMAT_R8G8_UNORM, + A_(C0, C1, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R8_UNORM] = { NV50TCL_RT_FORMAT_R8_UNORM, + A_(C0, ZERO, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 8, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + /* SSCALED 32 */ + + [PIPE_FORMAT_R32G32B32A32_SSCALED] = { 0, + A_(C0, C1, C2, C3, SSCALED, SSCALED, SSCALED, SSCALED, 32_32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32B32_SSCALED] = { 0, + A_(C0, C1, C2, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32_SSCALED] = { 0, + A_(C0, C1, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32_SSCALED] = { 0, + A_(C0, ZERO, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* USCALED 32 */ + + [PIPE_FORMAT_R32G32B32A32_USCALED] = { 0, + A_(C0, C1, C2, C3, USCALED, USCALED, USCALED, USCALED, 32_32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32B32_USCALED] = { 0, + A_(C0, C1, C2, ONE, USCALED, USCALED, USCALED, USCALED, 32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32_USCALED] = { 0, + A_(C0, C1, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32_USCALED] = { 0, + A_(C0, ZERO, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* SSCALED 16 */ + + [PIPE_FORMAT_R16G16B16A16_SSCALED] = { 0, + A_(C0, C1, C2, C3, SSCALED, SSCALED, SSCALED, SSCALED, 16_16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16G16B16_SSCALED] = { 0, + A_(C0, C1, C2, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16G16_SSCALED] = { 0, + A_(C0, C1, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16_SSCALED] = { 0, + A_(C0, ZERO, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* USCALED 16 */ + + [PIPE_FORMAT_R16G16B16A16_USCALED] = { 0, + A_(C0, C1, C2, C3, USCALED, USCALED, USCALED, USCALED, 16_16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16G16B16_USCALED] = { 0, + A_(C0, C1, C2, ONE, USCALED, USCALED, USCALED, USCALED, 16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16G16_USCALED] = { 0, + A_(C0, C1, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16_USCALED] = { 0, + A_(C0, ZERO, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* SSCALED 8 */ + + [PIPE_FORMAT_R8G8B8A8_SSCALED] = { 0, + A_(C0, C1, C2, C3, SSCALED, SSCALED, SSCALED, SSCALED, 8_8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R8G8B8_SSCALED] = { 0, + A_(C0, C1, C2, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R8G8_SSCALED] = { 0, + A_(C0, C1, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R8_SSCALED] = { 0, + A_(C0, ZERO, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* USCALED 8 */ + + [PIPE_FORMAT_R8G8B8A8_USCALED] = { 0, + A_(C0, C1, C2, C3, USCALED, USCALED, USCALED, USCALED, 8_8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R8G8B8_USCALED] = { 0, + A_(C0, C1, C2, ONE, USCALED, USCALED, USCALED, USCALED, 8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R8G8_USCALED] = { 0, + A_(C0, C1, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R8_USCALED] = { 0, + A_(C0, ZERO, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, +}; diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c index c0f5cc10dd7..dd0e8fd41b1 100644 --- a/src/gallium/drivers/nv50/nv50_miptree.c +++ b/src/gallium/drivers/nv50/nv50_miptree.c @@ -159,6 +159,9 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *tmp case PIPE_FORMAT_Z24_UNORM_S8_USCALED: tile_flags = 0x2800; break; + case PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED: + tile_flags = 0xe000; + break; case PIPE_FORMAT_R32G32B32A32_FLOAT: case PIPE_FORMAT_R32G32B32_FLOAT: tile_flags = 0x7400; diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c new file mode 100644 index 00000000000..bb464ec4c9f --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -0,0 +1,804 @@ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* #define NV50PC_DEBUG */ + +#include "nv50_pc.h" +#include "nv50_program.h" + +#include <stdio.h> + +/* returns TRUE if operands 0 and 1 can be swapped */ +boolean +nv_op_commutative(uint opcode) +{ + switch (opcode) { + case NV_OP_ADD: + case NV_OP_MUL: + case NV_OP_MAD: + case NV_OP_AND: + case NV_OP_OR: + case NV_OP_XOR: + case NV_OP_MIN: + case NV_OP_MAX: + case NV_OP_SAD: + return TRUE; + default: + return FALSE; + } +} + +/* return operand to which the address register applies */ +int +nv50_indirect_opnd(struct nv_instruction *i) +{ + if (!i->src[4]) + return -1; + + switch (i->opcode) { + case NV_OP_MOV: + case NV_OP_LDA: + case NV_OP_STA: + return 0; + default: + return 1; + } +} + +boolean +nv50_nvi_can_use_imm(struct nv_instruction *nvi, int s) +{ + if (nvi->flags_src || nvi->flags_def) + return FALSE; + + switch (nvi->opcode) { + case NV_OP_ADD: + case NV_OP_MUL: + case NV_OP_AND: + case NV_OP_OR: + case NV_OP_XOR: + case NV_OP_SHL: + case NV_OP_SHR: + return (s == 1) && (nvi->src[0]->value->reg.file == NV_FILE_GPR) && + (nvi->def[0]->reg.file == NV_FILE_GPR); + case NV_OP_MOV: + assert(s == 0); + return (nvi->def[0]->reg.file == NV_FILE_GPR); + default: + return FALSE; + } +} + +boolean +nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value) +{ + int i; + + for (i = 0; i < 3 && nvi->src[i]; ++i) + if (nvi->src[i]->value->reg.file == NV_FILE_IMM) + return FALSE; + + switch (nvi->opcode) { + case NV_OP_ABS: + case NV_OP_ADD: + case NV_OP_CEIL: + case NV_OP_FLOOR: + case NV_OP_TRUNC: + case NV_OP_CVT: + case NV_OP_NEG: + case NV_OP_MAD: + case NV_OP_MUL: + case NV_OP_SAT: + case NV_OP_SUB: + case NV_OP_MAX: + case NV_OP_MIN: + if (s == 0 && (value->reg.file == NV_FILE_MEM_S || + value->reg.file == NV_FILE_MEM_P)) + return TRUE; + if (value->reg.file < NV_FILE_MEM_C(0) || + value->reg.file > NV_FILE_MEM_C(15)) + return FALSE; + return (s == 1) || + ((s == 2) && (nvi->src[1]->value->reg.file == NV_FILE_GPR)); + case NV_OP_MOV: + assert(s == 0); + return /* TRUE */ FALSE; /* don't turn MOVs into loads */ + default: + return FALSE; + } +} + +/* Return whether this instruction can be executed conditionally. */ +boolean +nv50_nvi_can_predicate(struct nv_instruction *nvi) +{ + int i; + + if (nvi->flags_src) + return FALSE; + for (i = 0; i < 4 && nvi->src[i]; ++i) + if (nvi->src[i]->value->reg.file == NV_FILE_IMM) + return FALSE; + return TRUE; +} + +ubyte +nv50_supported_src_mods(uint opcode, int s) +{ + switch (opcode) { + case NV_OP_ABS: + return NV_MOD_NEG | NV_MOD_ABS; /* obviously */ + case NV_OP_ADD: + case NV_OP_MUL: + case NV_OP_MAD: + return NV_MOD_NEG; + case NV_OP_DFDX: + case NV_OP_DFDY: + assert(s == 0); + return NV_MOD_NEG; + case NV_OP_MAX: + case NV_OP_MIN: + return NV_MOD_ABS; + case NV_OP_CVT: + case NV_OP_LG2: + case NV_OP_NEG: + case NV_OP_PREEX2: + case NV_OP_PRESIN: + case NV_OP_RCP: + case NV_OP_RSQ: + return NV_MOD_ABS | NV_MOD_NEG; + default: + return 0; + } +} + +/* We may want an opcode table. */ +boolean +nv50_op_can_write_flags(uint opcode) +{ + if (nv_is_vector_op(opcode)) + return FALSE; + switch (opcode) { /* obvious ones like KIL, CALL, etc. not included */ + case NV_OP_PHI: + case NV_OP_MOV: + case NV_OP_LINTERP: + case NV_OP_PINTERP: + case NV_OP_LDA: + return FALSE; + default: + break; + } + if (opcode >= NV_OP_RCP && opcode <= NV_OP_PREEX2) + return FALSE; + return TRUE; +} + +int +nv_nvi_refcount(struct nv_instruction *nvi) +{ + int i, rc; + + rc = nvi->flags_def ? nvi->flags_def->refc : 0; + + for (i = 0; i < 4; ++i) { + if (!nvi->def[i]) + return rc; + rc += nvi->def[i]->refc; + } + return rc; +} + +int +nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val, + struct nv_value *new_val) +{ + int i, n; + + if (old_val == new_val) + return old_val->refc; + + for (i = 0, n = 0; i < pc->num_refs; ++i) { + if (pc->refs[i]->value == old_val) { + ++n; + nv_reference(pc, &pc->refs[i], new_val); + } + } + return n; +} + +struct nv_value * +nvcg_find_constant(struct nv_ref *ref) +{ + struct nv_value *src; + + if (!ref) + return NULL; + + src = ref->value; + while (src->insn && src->insn->opcode == NV_OP_MOV) { + assert(!src->insn->src[0]->mod); + src = src->insn->src[0]->value; + } + if ((src->reg.file == NV_FILE_IMM) || + (src->insn && src->insn->opcode == NV_OP_LDA && + src->insn->src[0]->value->reg.file >= NV_FILE_MEM_C(0) && + src->insn->src[0]->value->reg.file <= NV_FILE_MEM_C(15))) + return src; + return NULL; +} + +struct nv_value * +nvcg_find_immediate(struct nv_ref *ref) +{ + struct nv_value *src = nvcg_find_constant(ref); + + return (src && src->reg.file == NV_FILE_IMM) ? src : NULL; +} + +static void +nv_pc_free_refs(struct nv_pc *pc) +{ + int i; + for (i = 0; i < pc->num_refs; i += 64) + FREE(pc->refs[i]); + FREE(pc->refs); +} + +static const char * +edge_name(ubyte type) +{ + switch (type) { + case CFG_EDGE_FORWARD: return "forward"; + case CFG_EDGE_BACK: return "back"; + case CFG_EDGE_LOOP_ENTER: return "loop"; + case CFG_EDGE_LOOP_LEAVE: return "break"; + case CFG_EDGE_FAKE: return "fake"; + default: + return "?"; + } +} + +void +nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv) +{ + struct nv_basic_block *bb[64], *bbb[16], *b; + int j, p, pp; + + bb[0] = root; + p = 1; + pp = 0; + + while (p > 0) { + b = bb[--p]; + b->priv = 0; + + for (j = 1; j >= 0; --j) { + if (!b->out[j]) + continue; + + switch (b->out_kind[j]) { + case CFG_EDGE_BACK: + continue; + case CFG_EDGE_FORWARD: + case CFG_EDGE_FAKE: + if (++b->out[j]->priv == b->out[j]->num_in) + bb[p++] = b->out[j]; + break; + case CFG_EDGE_LOOP_ENTER: + bb[p++] = b->out[j]; + break; + case CFG_EDGE_LOOP_LEAVE: + bbb[pp++] = b->out[j]; + break; + default: + assert(0); + break; + } + } + + f(priv, b); + + if (!p) { + p = pp; + for (; pp > 0; --pp) + bb[pp - 1] = bbb[pp - 1]; + } + } +} + +static void +nv_do_print_function(void *priv, struct nv_basic_block *b) +{ + struct nv_instruction *i = b->phi; + + debug_printf("=== BB %i ", b->id); + if (b->out[0]) + debug_printf("[%s -> %i] ", edge_name(b->out_kind[0]), b->out[0]->id); + if (b->out[1]) + debug_printf("[%s -> %i] ", edge_name(b->out_kind[1]), b->out[1]->id); + debug_printf("===\n"); + + i = b->phi; + if (!i) + i = b->entry; + for (; i; i = i->next) + nv_print_instruction(i); +} + +void +nv_print_function(struct nv_basic_block *root) +{ + if (root->subroutine) + debug_printf("SUBROUTINE %i\n", root->subroutine); + else + debug_printf("MAIN\n"); + + nv_pc_pass_in_order(root, nv_do_print_function, root); +} + +void +nv_print_program(struct nv_pc *pc) +{ + int i; + for (i = 0; i < pc->num_subroutines + 1; ++i) + if (pc->root[i]) + nv_print_function(pc->root[i]); +} + +#ifdef NV50PC_DEBUG +static void +nv_do_print_cfgraph(struct nv_pc *pc, FILE *f, struct nv_basic_block *b) +{ + int i; + + b->pass_seq = pc->pass_seq; + + fprintf(f, "\t%i [shape=box]\n", b->id); + + for (i = 0; i < 2; ++i) { + if (!b->out[i]) + continue; + switch (b->out_kind[i]) { + case CFG_EDGE_FORWARD: + fprintf(f, "\t%i -> %i;\n", b->id, b->out[i]->id); + break; + case CFG_EDGE_LOOP_ENTER: + fprintf(f, "\t%i -> %i [color=green];\n", b->id, b->out[i]->id); + break; + case CFG_EDGE_LOOP_LEAVE: + fprintf(f, "\t%i -> %i [color=red];\n", b->id, b->out[i]->id); + break; + case CFG_EDGE_BACK: + fprintf(f, "\t%i -> %i;\n", b->id, b->out[i]->id); + continue; + case CFG_EDGE_FAKE: + fprintf(f, "\t%i -> %i [style=dotted];\n", b->id, b->out[i]->id); + break; + default: + assert(0); + break; + } + if (b->out[i]->pass_seq < pc->pass_seq) + nv_do_print_cfgraph(pc, f, b->out[i]); + } +} + +/* Print the control flow graph of subroutine @subr (0 == MAIN) to a file. */ +static void +nv_print_cfgraph(struct nv_pc *pc, const char *filepath, int subr) +{ + FILE *f; + + f = fopen(filepath, "a"); + if (!f) + return; + + fprintf(f, "digraph G {\n"); + + ++pc->pass_seq; + + nv_do_print_cfgraph(pc, f, pc->root[subr]); + + fprintf(f, "}\n"); + + fclose(f); +} +#endif + +static INLINE void +nvcg_show_bincode(struct nv_pc *pc) +{ + unsigned i; + + for (i = 0; i < pc->bin_size / 4; ++i) { + debug_printf("0x%08x ", pc->emit[i]); + if ((i % 16) == 15) + debug_printf("\n"); + } + debug_printf("\n"); +} + +static int +nv50_emit_program(struct nv_pc *pc) +{ + uint32_t *code = pc->emit; + int n; + + NV50_DBGMSG("emitting program: size = %u\n", pc->bin_size); + + for (n = 0; n < pc->num_blocks; ++n) { + struct nv_instruction *i; + struct nv_basic_block *b = pc->bb_list[n]; + + for (i = b->entry; i; i = i->next) { + nv50_emit_instruction(pc, i); + + pc->bin_pos += 1 + (pc->emit[0] & 1); + pc->emit += 1 + (pc->emit[0] & 1); + } + } + assert(pc->emit == &code[pc->bin_size / 4]); + + /* XXX: we can do better than this ... */ + if (!(pc->emit[-2] & 1) || (pc->emit[-2] & 2) || (pc->emit[-1] & 3)) { + pc->emit[0] = 0xf0000001; + pc->emit[1] = 0xe0000000; + pc->bin_size += 8; + } + + pc->emit = code; + code[pc->bin_size / 4 - 1] |= 1; + +#ifdef NV50PC_DEBUG + nvcg_show_bincode(pc); +#endif + + return 0; +} + +int +nv50_generate_code(struct nv50_translation_info *ti) +{ + struct nv_pc *pc; + int ret; + int i; + + pc = CALLOC_STRUCT(nv_pc); + if (!pc) + return 1; + + pc->root = CALLOC(ti->subr_nr + 1, sizeof(pc->root[0])); + if (!pc->root) { + FREE(pc); + return 1; + } + pc->num_subroutines = ti->subr_nr; + + ret = nv50_tgsi_to_nc(pc, ti); + if (ret) + goto out; +#ifdef NV50PC_DEBUG + nv_print_program(pc); +#endif + + pc->opt_reload_elim = ti->store_to_memory ? FALSE : TRUE; + + /* optimization */ + ret = nv_pc_exec_pass0(pc); + if (ret) + goto out; +#ifdef NV50PC_DEBUG + nv_print_program(pc); +#endif + + /* register allocation */ + ret = nv_pc_exec_pass1(pc); + if (ret) + goto out; +#ifdef NV50PC_DEBUG + nv_print_program(pc); + nv_print_cfgraph(pc, "nv50_shader_cfgraph.dot", 0); +#endif + + /* prepare for emission */ + ret = nv_pc_exec_pass2(pc); + if (ret) + goto out; + + pc->emit = CALLOC(pc->bin_size / 4 + 2, 4); + if (!pc->emit) { + ret = 3; + goto out; + } + ret = nv50_emit_program(pc); + if (ret) + goto out; + + ti->p->code_size = pc->bin_size; + ti->p->code = pc->emit; + + ti->p->immd_size = pc->immd_count * 4; + ti->p->immd = pc->immd_buf; + + /* highest 16 bit reg to num of 32 bit regs, limit to >= 4 */ + ti->p->max_gpr = MAX2(4, (pc->max_reg[NV_FILE_GPR] >> 1) + 1); + + ti->p->fixups = pc->fixups; + ti->p->num_fixups = pc->num_fixups; + + NV50_DBGMSG("SHADER TRANSLATION - %s\n", ret ? "failure" : "success"); + +out: + nv_pc_free_refs(pc); + + for (i = 0; i < pc->num_blocks; ++i) + FREE(pc->bb_list[i]); + if (pc->root) + FREE(pc->root); + if (ret) { /* on success, these will be referenced by nv50_program */ + if (pc->emit) + FREE(pc->emit); + if (pc->immd_buf) + FREE(pc->immd_buf); + if (pc->fixups) + FREE(pc->fixups); + } + FREE(pc); + return ret; +} + +static void +nvbb_insert_phi(struct nv_basic_block *b, struct nv_instruction *i) +{ + if (!b->phi) { + i->prev = NULL; + b->phi = i; + i->next = b->entry; + if (b->entry) { + assert(!b->entry->prev && b->exit); + b->entry->prev = i; + } else { + b->entry = i; + b->exit = i; + } + } else { + assert(b->entry); + if (b->entry->opcode == NV_OP_PHI) { /* insert after entry */ + assert(b->entry == b->exit); + b->entry->next = i; + i->prev = b->entry; + b->entry = i; + b->exit = i; + } else { /* insert before entry */ + assert(b->entry->prev && b->exit); + i->next = b->entry; + i->prev = b->entry->prev; + b->entry->prev = i; + i->prev->next = i; + } + } +} + +void +nvbb_insert_tail(struct nv_basic_block *b, struct nv_instruction *i) +{ + if (i->opcode == NV_OP_PHI) { + nvbb_insert_phi(b, i); + } else { + i->prev = b->exit; + if (b->exit) + b->exit->next = i; + b->exit = i; + if (!b->entry) + b->entry = i; + else + if (i->prev && i->prev->opcode == NV_OP_PHI) + b->entry = i; + } + + i->bb = b; + b->num_instructions++; +} + +void +nvi_insert_after(struct nv_instruction *at, struct nv_instruction *ni) +{ + if (!at->next) { + nvbb_insert_tail(at->bb, ni); + return; + } + ni->next = at->next; + ni->prev = at; + ni->next->prev = ni; + ni->prev->next = ni; +} + +void +nv_nvi_delete(struct nv_instruction *nvi) +{ + struct nv_basic_block *b = nvi->bb; + int j; + + /* debug_printf("REM: "); nv_print_instruction(nvi); */ + + for (j = 0; j < 5; ++j) + nv_reference(NULL, &nvi->src[j], NULL); + nv_reference(NULL, &nvi->flags_src, NULL); + + if (nvi->next) + nvi->next->prev = nvi->prev; + else { + assert(nvi == b->exit); + b->exit = nvi->prev; + } + + if (nvi->prev) + nvi->prev->next = nvi->next; + + if (nvi == b->entry) { + /* PHIs don't get hooked to b->entry */ + b->entry = nvi->next; + assert(!nvi->prev || nvi->prev->opcode == NV_OP_PHI); + } + + if (nvi == b->phi) { + if (nvi->opcode != NV_OP_PHI) + NV50_DBGMSG("NOTE: b->phi points to non-PHI instruction\n"); + + assert(!nvi->prev); + if (!nvi->next || nvi->next->opcode != NV_OP_PHI) + b->phi = NULL; + else + b->phi = nvi->next; + } +} + +void +nv_nvi_permute(struct nv_instruction *i1, struct nv_instruction *i2) +{ + struct nv_basic_block *b = i1->bb; + + assert(i1->opcode != NV_OP_PHI && + i2->opcode != NV_OP_PHI); + assert(i1->next == i2); + + if (b->exit == i2) + b->exit = i1; + + if (b->entry == i1) + b->entry = i2; + + i2->prev = i1->prev; + i1->next = i2->next; + i2->next = i1; + i1->prev = i2; + + if (i2->prev) + i2->prev->next = i2; + if (i1->next) + i1->next->prev = i1; +} + +void +nvbb_attach_block(struct nv_basic_block *parent, + struct nv_basic_block *b, ubyte edge_kind) +{ + assert(b->num_in < 8); + + if (parent->out[0]) { + assert(!parent->out[1]); + parent->out[1] = b; + parent->out_kind[1] = edge_kind; + } else { + parent->out[0] = b; + parent->out_kind[0] = edge_kind; + } + + b->in[b->num_in] = parent; + b->in_kind[b->num_in++] = edge_kind; +} + +/* NOTE: all BRKs are treated as conditional, so there are 2 outgoing BBs */ + +boolean +nvbb_dominated_by(struct nv_basic_block *b, struct nv_basic_block *d) +{ + int j; + + if (b == d) + return TRUE; + + for (j = 0; j < b->num_in; ++j) + if ((b->in_kind[j] != CFG_EDGE_BACK) && !nvbb_dominated_by(b->in[j], d)) + return FALSE; + + return j ? TRUE : FALSE; +} + +/* check if @bf (future) can be reached from @bp (past), stop at @bt */ +boolean +nvbb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp, + struct nv_basic_block *bt) +{ + struct nv_basic_block *q[NV_PC_MAX_BASIC_BLOCKS], *b; + int i, p, n; + + p = 0; + n = 1; + q[0] = bp; + + while (p < n) { + b = q[p++]; + + if (b == bf) + break; + if (b == bt) + continue; + assert(n <= (1024 - 2)); + + for (i = 0; i < 2; ++i) { + if (b->out[i] && !IS_WALL_EDGE(b->out_kind[i]) && !b->out[i]->priv) { + q[n] = b->out[i]; + q[n++]->priv = 1; + } + } + } + for (--n; n >= 0; --n) + q[n]->priv = 0; + + return (b == bf); +} + +static struct nv_basic_block * +nvbb_find_dom_frontier(struct nv_basic_block *b, struct nv_basic_block *df) +{ + struct nv_basic_block *out; + int i; + + if (!nvbb_dominated_by(df, b)) { + for (i = 0; i < df->num_in; ++i) { + if (df->in_kind[i] == CFG_EDGE_BACK) + continue; + if (nvbb_dominated_by(df->in[i], b)) + return df; + } + } + for (i = 0; i < 2 && df->out[i]; ++i) { + if (df->out_kind[i] == CFG_EDGE_BACK) + continue; + if ((out = nvbb_find_dom_frontier(b, df->out[i]))) + return out; + } + return NULL; +} + +struct nv_basic_block * +nvbb_dom_frontier(struct nv_basic_block *b) +{ + struct nv_basic_block *df; + int i; + + for (i = 0; i < 2 && b->out[i]; ++i) + if ((df = nvbb_find_dom_frontier(b, b->out[i]))) + return df; + return NULL; +} diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h new file mode 100644 index 00000000000..92c6be5f6e5 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -0,0 +1,514 @@ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __NV50_COMPILER_H__ +#define __NV50_COMPILER_H__ + +#ifdef NV50PC_DEBUG +# define NV50_DBGMSG(args...) debug_printf(args) +#else +# define NV50_DBGMSG(args...) +#endif + +#include "pipe/p_defines.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" + +#define NV_OP_PHI 0 +#define NV_OP_EXTRACT 1 +#define NV_OP_COMBINE 2 +#define NV_OP_LDA 3 +#define NV_OP_STA 4 +#define NV_OP_MOV 5 +#define NV_OP_ADD 6 +#define NV_OP_SUB 7 +#define NV_OP_NEG 8 +#define NV_OP_MUL 9 +#define NV_OP_MAD 10 +#define NV_OP_CVT 11 +#define NV_OP_SAT 12 +#define NV_OP_NOT 13 +#define NV_OP_AND 14 +#define NV_OP_OR 15 +#define NV_OP_XOR 16 +#define NV_OP_SHL 17 +#define NV_OP_SHR 18 +#define NV_OP_RCP 19 +#define NV_OP_UNDEF 20 +#define NV_OP_RSQ 21 +#define NV_OP_LG2 22 +#define NV_OP_SIN 23 +#define NV_OP_COS 24 +#define NV_OP_EX2 25 +#define NV_OP_PRESIN 26 +#define NV_OP_PREEX2 27 +#define NV_OP_MIN 28 +#define NV_OP_MAX 29 +#define NV_OP_SET 30 +#define NV_OP_SAD 31 +#define NV_OP_KIL 32 +#define NV_OP_BRA 33 +#define NV_OP_CALL 34 +#define NV_OP_RET 35 +#define NV_OP_BREAK 36 +#define NV_OP_BREAKADDR 37 +#define NV_OP_JOINAT 38 +#define NV_OP_TEX 39 +#define NV_OP_TXB 40 +#define NV_OP_TXL 41 +#define NV_OP_TXF 42 +#define NV_OP_TXQ 43 +#define NV_OP_DFDX 44 +#define NV_OP_DFDY 45 +#define NV_OP_QUADOP 46 +#define NV_OP_LINTERP 47 +#define NV_OP_PINTERP 48 +#define NV_OP_ABS 49 +#define NV_OP_CEIL 50 +#define NV_OP_FLOOR 51 +#define NV_OP_TRUNC 52 +#define NV_OP_NOP 53 +#define NV_OP_SELECT 54 +#define NV_OP_EXPORT 55 +#define NV_OP_JOIN 56 +#define NV_OP_COUNT 57 + +#define NV_FILE_GPR 0 +#define NV_FILE_OUT 1 +#define NV_FILE_ADDR 2 +#define NV_FILE_FLAGS 3 +#define NV_FILE_IMM 16 +#define NV_FILE_MEM_S 32 +#define NV_FILE_MEM_P 33 +#define NV_FILE_MEM_V 34 +#define NV_FILE_MEM_L 48 +#define NV_FILE_MEM_G(i) (64 + i) +#define NV_FILE_MEM_C(i) (80 + i) + +#define NV_MOD_NEG 1 +#define NV_MOD_ABS 2 +#define NV_MOD_NOT 4 +#define NV_MOD_SAT 8 + +#define NV_TYPE_U8 0x00 +#define NV_TYPE_S8 0x01 +#define NV_TYPE_U16 0x02 +#define NV_TYPE_S16 0x03 +#define NV_TYPE_U32 0x04 +#define NV_TYPE_S32 0x05 +#define NV_TYPE_P32 0x07 +#define NV_TYPE_F32 0x09 +#define NV_TYPE_F64 0x0b +#define NV_TYPE_VEC(x, n) (NV_TYPE_##x | (n << 4)) +#define NV_TYPE_LO 0x00 +#define NV_TYPE_HI 0x80 +#define NV_TYPE_ANY 0xff + +#define NV_TYPE_ISINT(t) ((t) <= 5) +#define NV_TYPE_ISFLT(t) ((t) & 0x08) + +/* $cX registers contain 4 bits: OCSZ (Z is bit 0) */ +#define NV_CC_FL 0x0 +#define NV_CC_LT 0x1 +#define NV_CC_EQ 0x2 +#define NV_CC_LE 0x3 +#define NV_CC_GT 0x4 +#define NV_CC_NE 0x5 +#define NV_CC_GE 0x6 +#define NV_CC_U 0x8 +#define NV_CC_TR 0xf +#define NV_CC_O 0x10 +#define NV_CC_C 0x11 +#define NV_CC_A 0x12 +#define NV_CC_S 0x13 + +#define NV_PC_MAX_INSTRUCTIONS 2048 +#define NV_PC_MAX_VALUES (NV_PC_MAX_INSTRUCTIONS * 4) + +#define NV_PC_MAX_BASIC_BLOCKS 1024 + +static INLINE boolean +nv_is_vector_op(uint opcode) +{ + return (opcode >= NV_OP_TEX) && (opcode <= NV_OP_TXQ); +} + +static INLINE uint +nv_type_order(ubyte type) +{ + switch (type & 0xf) { + case NV_TYPE_U8: + case NV_TYPE_S8: + return 0; + case NV_TYPE_U16: + case NV_TYPE_S16: + return 1; + case NV_TYPE_U32: + case NV_TYPE_F32: + case NV_TYPE_S32: + case NV_TYPE_P32: + return 2; + case NV_TYPE_F64: + return 3; + } + assert(0); +} + +static INLINE uint +nv_type_sizeof(ubyte type) +{ + if (type & 0xf0) + return (1 << nv_type_order(type)) * (type >> 4); + return 1 << nv_type_order(type); +} + +static INLINE uint +nv_type_sizeof_base(ubyte type) +{ + return 1 << nv_type_order(type); +} + +struct nv_reg { + int id; + ubyte file; + ubyte type; /* type of generating instruction's result */ + ubyte as_type; /* default type for new references to this value */ + union { + float f32; + double f64; + int32_t s32; + uint32_t u32; + } imm; +}; + +struct nv_range { + struct nv_range *next; + int bgn; + int end; +}; + +struct nv_value { + struct nv_reg reg; + struct nv_instruction *insn; + struct nv_value *join; + int n; + struct nv_range *livei; + int refc; + + struct nv_value *next; + struct nv_value *prev; +}; + +struct nv_ref { + struct nv_value *value; + ubyte mod; + ubyte typecast; + ubyte flags; /* not used yet */ +}; + +struct nv_basic_block; + +struct nv_instruction { + struct nv_instruction *next; + struct nv_instruction *prev; + uint opcode; + int serial; + struct nv_value *def[4]; + struct nv_value *flags_def; + struct nv_ref *src[5]; + struct nv_ref *flags_src; + struct nv_basic_block *bb; + struct nv_basic_block *target; /* target block of control flow insn */ + ubyte cc; + ubyte set_cond : 4; + ubyte fixed : 1; /* don't optimize away */ + ubyte is_terminator : 1; + ubyte is_join : 1; + ubyte is_long : 1; /* for emission */ + /* */ + ubyte saturate : 1; + ubyte centroid : 1; + ubyte flat : 1; + ubyte lanes : 4; + ubyte tex_live : 1; + /* */ + ubyte tex_t; /* TIC binding */ + ubyte tex_s; /* TSC binding */ + ubyte tex_argc : 3; + ubyte tex_cube : 1; + ubyte tex_mask : 4; + /* */ + ubyte quadop; +}; + +#define CFG_EDGE_FORWARD 0 +#define CFG_EDGE_BACK 1 +#define CFG_EDGE_LOOP_ENTER 2 +#define CFG_EDGE_LOOP_LEAVE 4 +#define CFG_EDGE_FAKE 8 + +/* 'WALL' edge means where reachability check doesn't follow */ +/* 'LOOP' edge means just having to do with loops */ +#define IS_LOOP_EDGE(k) ((k) & 7) +#define IS_WALL_EDGE(k) ((k) & 9) + +struct nv_basic_block { + struct nv_instruction *entry; /* first non-phi instruction */ + struct nv_instruction *exit; + struct nv_instruction *phi; /* very first instruction */ + int num_instructions; + + struct nv_basic_block *out[2]; /* no indirect branches -> 2 */ + struct nv_basic_block *in[8]; /* hope that suffices */ + uint num_in; + ubyte out_kind[2]; + ubyte in_kind[8]; + + int id; + int subroutine; + uint priv; /* reset to 0 after you're done */ + uint pass_seq; + + uint32_t bin_pos; /* position, size in emitted code */ + uint32_t bin_size; + + uint32_t live_set[NV_PC_MAX_VALUES / 32]; +}; + +#define NV_FIXUP_CFLOW_RELOC 0 +#define NV_FIXUP_PARAM_RELOC 1 + +struct nv_fixup { + ubyte type; + ubyte shift; + uint32_t mask; + uint32_t data; + uint32_t offset; +}; + +static INLINE void +nv_fixup_apply(uint32_t *bin, struct nv_fixup *fixup, uint32_t data) +{ + uint32_t val; + + val = bin[fixup->offset / 4] & ~fixup->mask; + data = (fixup->shift < 0) ? (data >> fixup->shift) : (data << fixup->shift); + val |= (fixup->data + data) & fixup->mask; + bin[fixup->offset / 4] = val; +} + +struct nv50_translation_info; + +struct nv_pc { + struct nv_basic_block **root; + struct nv_basic_block *current_block; + struct nv_basic_block *parent_block; + + int loop_nesting_bound; + uint pass_seq; + + struct nv_value values[NV_PC_MAX_VALUES]; + struct nv_instruction instructions[NV_PC_MAX_INSTRUCTIONS]; + struct nv_ref **refs; + struct nv_basic_block *bb_list[NV_PC_MAX_BASIC_BLOCKS]; + int num_values; + int num_instructions; + int num_refs; + int num_blocks; + int num_subroutines; + + int max_reg[4]; + + uint32_t *immd_buf; /* populated on emit */ + unsigned immd_count; + + uint32_t *emit; + unsigned bin_size; + unsigned bin_pos; + + struct nv_fixup *fixups; + int num_fixups; + + /* optimization enables */ + boolean opt_reload_elim; +}; + +void nvbb_insert_tail(struct nv_basic_block *, struct nv_instruction *); +void nvi_insert_after(struct nv_instruction *, struct nv_instruction *); + +static INLINE struct nv_instruction * +nv_alloc_instruction(struct nv_pc *pc, uint opcode) +{ + struct nv_instruction *insn; + + insn = &pc->instructions[pc->num_instructions++]; + assert(pc->num_instructions < NV_PC_MAX_INSTRUCTIONS); + + insn->cc = NV_CC_TR; + insn->opcode = opcode; + + return insn; +} + +static INLINE struct nv_instruction * +new_instruction(struct nv_pc *pc, uint opcode) +{ + struct nv_instruction *insn = nv_alloc_instruction(pc, opcode); + + nvbb_insert_tail(pc->current_block, insn); + return insn; +} + +static INLINE struct nv_instruction * +new_instruction_at(struct nv_pc *pc, struct nv_instruction *at, uint opcode) +{ + struct nv_instruction *insn = nv_alloc_instruction(pc, opcode); + + nvi_insert_after(at, insn); + return insn; +} + +static INLINE struct nv_value * +new_value(struct nv_pc *pc, ubyte file, ubyte type) +{ + struct nv_value *value = &pc->values[pc->num_values]; + + assert(pc->num_values < NV_PC_MAX_VALUES - 1); + + value->n = pc->num_values++; + value->join = value; + value->reg.id = -1; + value->reg.file = file; + value->reg.type = value->reg.as_type = type; + return value; +} + +static INLINE struct nv_value * +new_value_like(struct nv_pc *pc, struct nv_value *like) +{ + struct nv_value *val = new_value(pc, like->reg.file, like->reg.type); + val->reg.as_type = like->reg.as_type; + return val; +} + +static INLINE struct nv_ref * +new_ref(struct nv_pc *pc, struct nv_value *val) +{ + int i; + struct nv_ref *ref; + + if ((pc->num_refs % 64) == 0) { + const unsigned old_size = pc->num_refs * sizeof(struct nv_ref *); + const unsigned new_size = (pc->num_refs + 64) * sizeof(struct nv_ref *); + + pc->refs = REALLOC(pc->refs, old_size, new_size); + + ref = CALLOC(64, sizeof(struct nv_ref)); + for (i = 0; i < 64; ++i) + pc->refs[pc->num_refs + i] = &ref[i]; + } + + ref = pc->refs[pc->num_refs++]; + ref->value = val; + ref->typecast = val->reg.as_type; + + ++val->refc; + return ref; +} + +static INLINE struct nv_basic_block * +new_basic_block(struct nv_pc *pc) +{ + struct nv_basic_block *bb; + + if (pc->num_blocks >= NV_PC_MAX_BASIC_BLOCKS) + return NULL; + + bb = CALLOC_STRUCT(nv_basic_block); + + bb->id = pc->num_blocks; + pc->bb_list[pc->num_blocks++] = bb; + return bb; +} + +static INLINE void +nv_reference(struct nv_pc *pc, struct nv_ref **d, struct nv_value *s) +{ + if (*d) + --(*d)->value->refc; + + if (s) { + if (!*d) + *d = new_ref(pc, s); + else { + (*d)->value = s; + ++(s->refc); + } + } else { + *d = NULL; + } +} + +/* nv50_emit.c */ +void nv50_emit_instruction(struct nv_pc *, struct nv_instruction *); + +/* nv50_print.c */ +const char *nv_opcode_name(uint opcode); +void nv_print_instruction(struct nv_instruction *); + +/* nv50_pc.c */ + +void nv_print_function(struct nv_basic_block *root); +void nv_print_program(struct nv_pc *); + +boolean nv_op_commutative(uint opcode); +int nv50_indirect_opnd(struct nv_instruction *); +boolean nv50_nvi_can_use_imm(struct nv_instruction *, int s); +boolean nv50_nvi_can_predicate(struct nv_instruction *); +boolean nv50_nvi_can_load(struct nv_instruction *, int s, struct nv_value *); +boolean nv50_op_can_write_flags(uint opcode); +ubyte nv50_supported_src_mods(uint opcode, int s); +int nv_nvi_refcount(struct nv_instruction *); +void nv_nvi_delete(struct nv_instruction *); +void nv_nvi_permute(struct nv_instruction *, struct nv_instruction *); +void nvbb_attach_block(struct nv_basic_block *parent, + struct nv_basic_block *, ubyte edge_kind); +boolean nvbb_dominated_by(struct nv_basic_block *, struct nv_basic_block *); +boolean nvbb_reachable_by(struct nv_basic_block *, struct nv_basic_block *, + struct nv_basic_block *); +struct nv_basic_block *nvbb_dom_frontier(struct nv_basic_block *); +int nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val, + struct nv_value *new_val); +struct nv_value *nvcg_find_immediate(struct nv_ref *); +struct nv_value *nvcg_find_constant(struct nv_ref *); + +typedef void (*nv_pc_pass_func)(void *priv, struct nv_basic_block *b); + +void nv_pc_pass_in_order(struct nv_basic_block *, nv_pc_pass_func, void *); + +int nv_pc_exec_pass0(struct nv_pc *pc); +int nv_pc_exec_pass1(struct nv_pc *pc); +int nv_pc_exec_pass2(struct nv_pc *pc); + +int nv50_tgsi_to_nc(struct nv_pc *, struct nv50_translation_info *); + +#endif // NV50_COMPILER_H diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c new file mode 100644 index 00000000000..137a531dd62 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -0,0 +1,1216 @@ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "nv50_context.h" +#include "nv50_pc.h" + +// Definitions + +#define FLAGS_CC_SHIFT 7 +#define FLAGS_ID_SHIFT 12 +#define FLAGS_WR_ID_SHIFT 4 +#define FLAGS_CC_MASK (0x1f << FLAGS_CC_SHIFT) +#define FLAGS_ID_MASK (0x03 << FLAGS_ID_SHIFT) +#define FLAGS_WR_EN (1 << 6) +#define FLAGS_WR_ID_MASK (0x3 << FLAGS_WR_ID_SHIFT) + +const ubyte nv50_inst_min_size_tab[NV_OP_COUNT] = +{ + 0, 0, 0, 8, 8, 4, 4, 4, 8, 4, 4, 8, 8, 8, 8, 8, /* 15 */ + 8, 8, 8, 4, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, /* 31 */ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, /* 47 */ + 4, 8, 8, 8, 8, 8, 0, 0, 8 +}; + +/* XXX: silence, you ! */ +unsigned +nv50_inst_min_size(struct nv_instruction *i); + +unsigned +nv50_inst_min_size(struct nv_instruction *i) +{ + int n; + + if (nv50_inst_min_size_tab[i->opcode] > 4) + return 8; + + if (i->def[0] && i->def[0]->reg.file != NV_FILE_GPR) + return 8; + if (i->def[0]->join->reg.id > 63) + return 8; + + for (n = 0; n < 3; ++n) { + if (!i->src[n]) + break; + if (i->src[n]->value->reg.file != NV_FILE_GPR && + i->src[n]->value->reg.file != NV_FILE_MEM_V) + return 8; + if (i->src[n]->value->reg.id > 63) + return 8; + } + + if (i->flags_def || i->flags_src || i->src[4]) + return 8; + + if (i->is_join) + return 8; + + if (i->src[2]) { + if (i->saturate || i->src[2]->mod) + return 8; + if (i->src[0]->mod ^ i->src[1]->mod) + return 8; + if ((i->src[0]->mod | i->src[1]->mod) & NV_MOD_ABS) + return 8; + if (i->def[0]->join->reg.id < 0 || + i->def[0]->join->reg.id != i->src[2]->value->join->reg.id) + return 8; + } + + return nv50_inst_min_size_tab[i->opcode]; +} + +static INLINE ubyte +STYPE(struct nv_instruction *nvi, int s) +{ + return nvi->src[s]->typecast; +} + +static INLINE ubyte +DTYPE(struct nv_instruction *nvi, int d) +{ + return nvi->def[d]->reg.type; +} + +static INLINE struct nv_reg * +SREG(struct nv_ref *ref) +{ + return &ref->value->join->reg; +} + +static INLINE struct nv_reg * +DREG(struct nv_value *val) +{ + return &val->join->reg; +} + +static INLINE ubyte +SFILE(struct nv_instruction *nvi, int s) +{ + return nvi->src[s]->value->reg.file; +} + +static INLINE ubyte +DFILE(struct nv_instruction *nvi, int d) +{ + return nvi->def[0]->reg.file; +} + +static INLINE void +SID(struct nv_pc *pc, struct nv_ref *ref, int pos) +{ + pc->emit[pos / 32] |= SREG(ref)->id << (pos % 32); +} + +static INLINE void +DID(struct nv_pc *pc, struct nv_value *val, int pos) +{ + pc->emit[pos / 32] |= DREG(val)->id << (pos % 32); +} + +static INLINE uint32_t +get_immd_u32(struct nv_ref *ref) +{ + assert(ref->value->reg.file == NV_FILE_IMM); + return ref->value->reg.imm.u32; +} + +static INLINE void +set_immd_u32(struct nv_pc *pc, uint32_t u32) +{ + pc->emit[1] |= 3; + pc->emit[0] |= (u32 & 0x3f) << 16; + pc->emit[1] |= (u32 >> 6) << 2; +} + +static INLINE void +set_immd(struct nv_pc *pc, struct nv_ref *ref) +{ + assert(ref->value->reg.file == NV_FILE_IMM); + set_immd_u32(pc, get_immd_u32(ref)); +} + +static void +new_fixup(struct nv_pc *pc, unsigned type, uint32_t data, uint32_t m, int s) +{ + const unsigned size = sizeof(struct nv_fixup); + const unsigned n = pc->num_fixups; + return; + + if (!(n % 8)) + pc->fixups = REALLOC(pc->fixups, n * size, (n + 8) * size); + + pc->fixups[n].offset = pc->bin_pos + (s / 32); + pc->fixups[n].type = type; + pc->fixups[n].data = data; + pc->fixups[n].mask = m << (s % 32); + pc->fixups[n].shift = s % 32; + + ++pc->num_fixups; + + assert(((data << (s % 32)) & pc->fixups[n].mask) == (data << (s % 32))); +} + +static void +nv_pc_alloc_immd(struct nv_pc *pc, struct nv_ref *ref) +{ + uint32_t i, val = get_immd_u32(ref); + + for (i = 0; i < pc->immd_count; ++i) + if (pc->immd_buf[i] == val) + break; + + if (i == pc->immd_count) { + if (!(pc->immd_count % 8)) + pc->immd_buf = REALLOC(pc->immd_buf, + pc->immd_count * 4, (pc->immd_count + 8) * 4); + pc->immd_buf[pc->immd_count++] = val; + } + + SREG(ref)->id = i; +} + +static INLINE void +set_pred(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(!(pc->emit[1] & 0x00003f80)); + + pc->emit[1] |= i->cc << 7; + if (i->flags_src) + pc->emit[1] |= SREG(i->flags_src)->id << 12; +} + +static INLINE void +set_pred_wr(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(!(pc->emit[1] & 0x00000070)); + + if (i->flags_def) + pc->emit[1] |= (DREG(i->flags_def)->id << 4) | 0x40; +} + +static INLINE void +set_a16_bits(struct nv_pc *pc, uint id) +{ + ++id; /* $a0 is always 0 */ + pc->emit[0] |= (id & 3) << 26; + pc->emit[1] |= id & 4; +} + +static INLINE void +set_addr(struct nv_pc *pc, struct nv_instruction *i) +{ + if (i->src[4]) + set_a16_bits(pc, SREG(i->src[4])->id); +} + +static void +set_dst(struct nv_pc *pc, struct nv_value *value) +{ + struct nv_reg *reg = &value->join->reg; + + if (reg->id < 0) { + pc->emit[0] |= (127 << 2) | 1; /* set 'long'-bit to catch bugs */ + pc->emit[1] |= 0x8; + return; + } + + if (reg->file == NV_FILE_OUT) + pc->emit[1] |= 0x8; + else + if (reg->file == NV_FILE_ADDR) + assert(0); + + pc->emit[0] |= reg->id << 2; +} + +static void +set_src_0(struct nv_pc *pc, struct nv_ref *ref) +{ + struct nv_reg *reg = SREG(ref); + + if (reg->file == NV_FILE_MEM_S) + pc->emit[1] |= 0x00200000; + else + if (reg->file == NV_FILE_MEM_P) + pc->emit[0] |= 0x01800000; + else + if (reg->file != NV_FILE_GPR) + NOUVEAU_ERR("invalid src0 register file: %d\n", reg->file); + + assert(reg->id < 128); + pc->emit[0] |= reg->id << 9; +} + +static void +set_src_1(struct nv_pc *pc, struct nv_ref *ref) +{ + struct nv_reg *reg = SREG(ref); + + if (reg->file >= NV_FILE_MEM_C(0) && + reg->file <= NV_FILE_MEM_C(15)) { + assert(!(pc->emit[1] & 0x01800000)); + + pc->emit[0] |= 0x00800000; + pc->emit[1] |= (reg->file - NV_FILE_MEM_C(0)) << 22; + } else + if (reg->file != NV_FILE_GPR) + NOUVEAU_ERR("invalid src1 register file: %d\n", reg->file); + + assert(reg->id < 128); + pc->emit[0] |= reg->id << 16; +} + +static void +set_src_2(struct nv_pc *pc, struct nv_ref *ref) +{ + struct nv_reg *reg = SREG(ref); + + if (reg->file >= NV_FILE_MEM_C(0) && + reg->file <= NV_FILE_MEM_C(15)) { + assert(!(pc->emit[1] & 0x01800000)); + + pc->emit[0] |= 0x01000000; + pc->emit[1] |= (reg->file - NV_FILE_MEM_C(0)) << 22; + } else + if (reg->file != NV_FILE_GPR) + NOUVEAU_ERR("invalid src2 register file: %d\n", reg->file); + + assert(reg->id < 128); + pc->emit[1] |= reg->id << 14; +} + +/* the default form: + * - long instruction + * - 1 to 3 sources in slots 0, 1, 2 + * - address & flags + */ +static void +emit_form_MAD(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] |= 1; + + set_pred(pc, i); + set_pred_wr(pc, i); + + if (i->def[0]) + set_dst(pc, i->def[0]); + else { + pc->emit[0] |= 0x01fc; + pc->emit[1] |= 0x0008; + } + + if (i->src[0]) + set_src_0(pc, i->src[0]); + + if (i->src[1]) + set_src_1(pc, i->src[1]); + + if (i->src[2]) + set_src_2(pc, i->src[2]); + + set_addr(pc, i); +} + +/* like default form, but 2nd source in slot 2, no 3rd source */ +static void +emit_form_ADD(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] |= 1; + + if (i->def[0]) + set_dst(pc, i->def[0]); + else { + pc->emit[0] |= 0x01fc; + pc->emit[1] |= 0x0008; + } + + set_pred(pc, i); + set_pred_wr(pc, i); + + if (i->src[0]) + set_src_0(pc, i->src[0]); + + if (i->src[1]) + set_src_2(pc, i->src[1]); + + set_addr(pc, i); +} + +/* short mul */ +static void +emit_form_MUL(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(!i->is_long && !(pc->emit[0] & 1)); + + assert(i->def[0]); + set_dst(pc, i->def[0]); + + if (i->src[0]) + set_src_0(pc, i->src[0]); + + if (i->src[1]) + set_src_1(pc, i->src[1]); +} + +/* default immediate form + * - 1 to 3 sources where last is immediate + * - no address or predicate possible + */ +static void +emit_form_IMM(struct nv_pc *pc, struct nv_instruction *i, ubyte mod_mask) +{ + pc->emit[0] |= 1; + + assert(i->def[0]); + assert(i->src[0]); + set_dst(pc, i->def[0]); + + assert(!i->src[4] && !i->flags_src && !i->flags_def); + + if (i->src[2]) { + set_immd(pc, i->src[2]); + set_src_0(pc, i->src[1]); + set_src_1(pc, i->src[0]); + } else + if (i->src[1]) { + set_immd(pc, i->src[1]); + set_src_0(pc, i->src[0]); + } else + set_immd(pc, i->src[0]); + + assert(!mod_mask); +} + +static void +set_ld_st_size(struct nv_pc *pc, int s, ubyte type) +{ + switch (type) { + case NV_TYPE_F64: + pc->emit[1] |= 0x8000 << s; + break; + case NV_TYPE_F32: + case NV_TYPE_S32: + case NV_TYPE_U32: + pc->emit[1] |= 0xc000 << s; + break; + case NV_TYPE_S16: + pc->emit[1] |= 0x6000 << s; + break; + case NV_TYPE_U16: + pc->emit[1] |= 0x4000 << s; + break; + case NV_TYPE_S8: + pc->emit[1] |= 0x2000 << s; + break; + default: + break; + } +} + +static void +emit_ld(struct nv_pc *pc, struct nv_instruction *i) +{ + ubyte sf = SFILE(i, 0); + + if (sf == NV_FILE_IMM) { + sf = NV_FILE_MEM_C(0); + nv_pc_alloc_immd(pc, i->src[0]); + + new_fixup(pc, NV_FIXUP_PARAM_RELOC, SREG(i->src[0])->id, 0xffff, 9); + } + + if (sf == NV_FILE_MEM_S || + sf == NV_FILE_MEM_P) { + pc->emit[0] = 0x10000001; + pc->emit[1] = 0x04200000 | (0x3c << 12); + if (sf == NV_FILE_MEM_P) + pc->emit[0] |= 0x01800000; + } else + if (sf >= NV_FILE_MEM_C(0) && + sf <= NV_FILE_MEM_C(15)) { + pc->emit[0] = 0x10000001; + pc->emit[1] = 0x24000000; + pc->emit[1] |= (sf - NV_FILE_MEM_C(0)) << 22; + } else + if (sf >= NV_FILE_MEM_G(0) && + sf <= NV_FILE_MEM_G(15)) { + pc->emit[0] = 0xd0000001 | ((sf - NV_FILE_MEM_G(0)) << 16); + pc->emit[1] = 0xa0000000; + + assert(i->src[4] && SREG(i->src[4])->file == NV_FILE_GPR); + SID(pc, i->src[4], 9); + } else + if (sf == NV_FILE_MEM_L) { + pc->emit[0] = 0xd0000001; + pc->emit[1] = 0x40000000; + + set_addr(pc, i); + } else { + NOUVEAU_ERR("invalid ld source file\n"); + abort(); + } + + set_ld_st_size(pc, (sf == NV_FILE_MEM_L) ? 8 : 0, STYPE(i, 0)); + + set_dst(pc, i->def[0]); + set_pred_wr(pc, i); + + set_pred(pc, i); + + if (sf < NV_FILE_MEM_G(0) || + sf > NV_FILE_MEM_G(15)) { + SID(pc, i->src[0], 9); + set_addr(pc, i); + } +} + +static void +emit_st(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(SFILE(i, 1) == NV_FILE_GPR); + assert(SFILE(i, 0) == NV_FILE_MEM_L); + + pc->emit[0] = 0xd0000001; + pc->emit[1] = 0x60000000; + + SID(pc, i->src[1], 2); + SID(pc, i->src[0], 9); + + set_ld_st_size(pc, 8, STYPE(i, 1)); + + set_addr(pc, i); + set_pred(pc, i); +} + +static int +verify_mov(struct nv_instruction *i) +{ + ubyte sf = SFILE(i, 0); + ubyte df = DFILE(i, 0); + + if (df == NV_FILE_GPR) + return 0; + + if (df != NV_FILE_OUT && + df != NV_FILE_FLAGS && + df != NV_FILE_ADDR) + return 1; + + if (sf == NV_FILE_FLAGS) + return 2; + if (sf == NV_FILE_ADDR) + return 3; + if (sf == NV_FILE_IMM && df != NV_FILE_OUT) + return 4; + + return 0; +} + +static void +emit_mov(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(!verify_mov(i)); + + if (SFILE(i, 0) >= NV_FILE_MEM_S) + emit_ld(pc, i); + else + if (SFILE(i, 0) == NV_FILE_FLAGS) { + pc->emit[0] = 0x00000001 | (DREG(i->def[0])->id << 2); + pc->emit[1] = 0x20000780 | (SREG(i->src[0])->id << 12); + } else + if (SFILE(i, 0) == NV_FILE_ADDR) { + pc->emit[0] = 0x00000001 | (DREG(i->def[0])->id << 2); + pc->emit[1] = 0x40000780; + set_a16_bits(pc, SREG(i->src[0])->id); + } else + if (DFILE(i, 0) == NV_FILE_FLAGS) { + pc->emit[0] = 0x00000001; + pc->emit[1] = 0xa0000000 | (1 << 6); + set_pred(pc, i); + pc->emit[0] |= SREG(i->src[0])->id << 9; + pc->emit[1] |= DREG(i->def[0])->id << 4; + } else + if (SFILE(i, 0) == NV_FILE_IMM) { + if (i->opcode == NV_OP_LDA) { + emit_ld(pc, i); + } else { + pc->emit[0] = 0x10008001; + pc->emit[1] = 0x00000003; + + emit_form_IMM(pc, i, 0); + } + } else { + pc->emit[0] = 0x10000000; + pc->emit[0] |= DREG(i->def[0])->id << 2; + pc->emit[0] |= SREG(i->src[0])->id << 9; + + if (!i->is_long) { + pc->emit[0] |= 0x8000; + } else { + pc->emit[0] |= 0x00000001; + pc->emit[1] = 0x0403c000; + + set_pred(pc, i); + } + } + + if (DFILE(i, 0) == NV_FILE_OUT) + pc->emit[1] |= 0x8; +} + +static void +emit_interp(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0x80000000; + + assert(DFILE(i, 0) == NV_FILE_GPR); + assert(SFILE(i, 0) == NV_FILE_MEM_V); + + DID(pc, i->def[0], 2); + SID(pc, i->src[0], 16); + + if (i->flat) + pc->emit[0] |= 1 << 8; + else + if (i->opcode == NV_OP_PINTERP) { + pc->emit[0] |= 1 << 25; + pc->emit[0] |= SREG(i->src[1])->id << 9; + } + + if (i->centroid) + pc->emit[0] |= 1 << 24; + + assert(i->is_long || !i->flags_src); + + if (i->is_long) { + set_pred(pc, i); + + pc->emit[1] |= + (pc->emit[0] & (3 << 24)) >> (24 - 16) | + (pc->emit[0] & (1 << 8)) >> (18 - 8); + + pc->emit[0] |= 1; + pc->emit[0] &= ~0x03000100; + } +} + +static void +emit_minmax(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0x30000000; + pc->emit[1] = (i->opcode == NV_OP_MIN) ? (2 << 28) : 0; + + switch (DTYPE(i, 0)) { + case NV_TYPE_F32: + pc->emit[0] |= 0x80000000; + pc->emit[1] |= 0x80000000; + break; + case NV_TYPE_S32: + pc->emit[1] |= 0x8c000000; + break; + case NV_TYPE_U32: + pc->emit[1] |= 0x84000000; + break; + } + + emit_form_MAD(pc, i); + + if (i->src[0]->mod & NV_MOD_ABS) pc->emit[1] |= 0x00100000; + if (i->src[1]->mod & NV_MOD_ABS) pc->emit[1] |= 0x00080000; +} + +static void +emit_add_f32(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0xb0000000; + + assert(!((i->src[0]->mod | i->src[1]->mod) & NV_MOD_ABS)); + + if (SFILE(i, 1) == NV_FILE_IMM) { + emit_form_IMM(pc, i, 0); + + if (i->src[0]->mod & NV_MOD_NEG) pc->emit[0] |= 0x8000; + if (i->src[1]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 22; + } else + if (i->is_long) { + emit_form_ADD(pc, i); + + if (i->src[0]->mod & NV_MOD_NEG) pc->emit[1] |= 1 << 26; + if (i->src[1]->mod & NV_MOD_NEG) pc->emit[1] |= 1 << 27; + + if (i->saturate) + pc->emit[1] |= 0x20000000; + } else { + emit_form_MUL(pc, i); + + if (i->src[0]->mod & NV_MOD_NEG) pc->emit[0] |= 0x8000; + if (i->src[1]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 22; + } +} + +static void +emit_add_b32(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0x20008000; + + if (SFILE(i, 1) == NV_FILE_IMM) { + emit_form_IMM(pc, i, 0); + } else + if (i->is_long) { + pc->emit[0] = 0x20000000; + pc->emit[1] = 0x04000000; + emit_form_ADD(pc, i); + } else { + emit_form_MUL(pc, i); + } + + if (i->src[0]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 28; + if (i->src[1]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 22; +} + +static void +emit_add_a16(struct nv_pc *pc, struct nv_instruction *i) +{ + int s = (i->opcode == NV_OP_MOV) ? 0 : 1; + + pc->emit[0] = 0xd0000001 | ((uint16_t)get_immd_u32(i->src[s]) << 9); + pc->emit[1] = 0x20000000; + + pc->emit[0] |= (DREG(i->def[0])->id + 1) << 2; + + set_pred(pc, i); + + if (i->src[1]) + set_a16_bits(pc, SREG(i->src[1])->id + 1); +} + +static void +emit_flow(struct nv_pc *pc, struct nv_instruction *i, ubyte flow_op) +{ + pc->emit[0] = 0x00000003 | (flow_op << 28); + pc->emit[1] = 0x00000000; + + set_pred(pc, i); + + if (i->target && (i->opcode != NV_OP_BREAK)) { + new_fixup(pc, NV_FIXUP_CFLOW_RELOC, i->target->bin_pos, 0x7ff800, 11); + pc->emit[0] |= (i->target->bin_pos / 4) << 11; + } +} + +static INLINE void +emit_add(struct nv_pc *pc, struct nv_instruction *i) +{ + if (DFILE(i, 0) == NV_FILE_ADDR) + emit_add_a16(pc, i); + else { + switch (DTYPE(i, 0)) { + case NV_TYPE_F32: + emit_add_f32(pc, i); + break; + case NV_TYPE_U32: + case NV_TYPE_S32: + emit_add_b32(pc, i); + break; + } + } +} + +static void +emit_bitop2(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0xd0000000; + + if (SFILE(i, 1) == NV_FILE_IMM) { + emit_form_IMM(pc, i, 0); + + if (i->opcode == NV_OP_OR) + pc->emit[0] |= 0x0100; + else + if (i->opcode == NV_OP_XOR) + pc->emit[0] |= 0x8000; + } else { + emit_form_MAD(pc, i); + + pc->emit[1] |= 0x04000000; + + if (i->opcode == NV_OP_OR) + pc->emit[1] |= 0x4000; + else + if (i->opcode == NV_OP_XOR) + pc->emit[1] |= 0x8000; + } +} + +static void +emit_arl(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(SFILE(i, 0) == NV_FILE_GPR); + assert(SFILE(i, 1) == NV_FILE_IMM); + + assert(!i->flags_def); + + pc->emit[0] = 0x00000001; + pc->emit[1] = 0xc0000000; + + pc->emit[0] |= (i->def[0]->reg.id + 1) << 2; + set_pred(pc, i); + set_src_0(pc, i->src[0]); + pc->emit[0] |= (get_immd_u32(i->src[1]) & 0x3f) << 16; +} + +static void +emit_shift(struct nv_pc *pc, struct nv_instruction *i) +{ + if (DFILE(i, 0) == NV_FILE_ADDR) { + emit_arl(pc, i); + return; + } + + pc->emit[0] = 0x30000001; + pc->emit[1] = 0xc4000000; + + if (i->opcode == NV_OP_SHR) + pc->emit[1] |= 1 << 29; + + if (SFILE(i, 1) == NV_FILE_IMM) { + pc->emit[1] |= 1 << 20; + pc->emit[0] |= (get_immd_u32(i->src[1]) & 0x7f) << 16; + + set_pred(pc, i); + } else + emit_form_MAD(pc, i); + + if (STYPE(i, 0) == NV_TYPE_S32) + pc->emit[1] |= 1 << 27; +} + +static void +emit_flop(struct nv_pc *pc, struct nv_instruction *i) +{ + struct nv_ref *src0 = i->src[0]; + + pc->emit[0] = 0x90000000; + + assert(STYPE(i, 0) == NV_TYPE_F32); + assert(SFILE(i, 0) == NV_FILE_GPR); + + if (!i->is_long) { + emit_form_MUL(pc, i); + assert(i->opcode == NV_OP_RCP && !src0->mod); + return; + } + + pc->emit[1] = (i->opcode - NV_OP_RCP) << 29; + + emit_form_MAD(pc, i); + + if (src0->mod & NV_MOD_NEG) pc->emit[1] |= 0x04000000; + if (src0->mod & NV_MOD_ABS) pc->emit[1] |= 0x00100000; +} + +static void +emit_mad_f32(struct nv_pc *pc, struct nv_instruction *i) +{ + const boolean neg_mul = (i->src[0]->mod ^ i->src[1]->mod) & NV_MOD_NEG; + const boolean neg_add = (i->src[2]->mod & NV_MOD_NEG); + + pc->emit[0] = 0xe0000000; + + if (!i->is_long) { + emit_form_MUL(pc, i); + assert(!neg_mul && !neg_add); + return; + } + + emit_form_MAD(pc, i); + + if (neg_mul) pc->emit[1] |= 0x04000000; + if (neg_add) pc->emit[1] |= 0x08000000; + + if (i->saturate) + pc->emit[1] |= 0x20000000; +} + +static INLINE void +emit_mad(struct nv_pc *pc, struct nv_instruction *i) +{ + emit_mad_f32(pc, i); +} + +static void +emit_mul_f32(struct nv_pc *pc, struct nv_instruction *i) +{ + boolean neg = (i->src[0]->mod ^ i->src[1]->mod) & NV_MOD_NEG; + + pc->emit[0] = 0xc0000000; + + if (SFILE(i, 1) == NV_FILE_IMM) { + emit_form_IMM(pc, i, 0); + + if (neg) + pc->emit[0] |= 0x8000; + } else + if (i->is_long) { + emit_form_MAD(pc, i); + + if (neg) + pc->emit[1] |= 0x08 << 24; + } else { + emit_form_MUL(pc, i); + + if (neg) + pc->emit[0] |= 0x8000; + } +} + +static void +emit_set(struct nv_pc *pc, struct nv_instruction *nvi) +{ + assert(nvi->is_long); + + pc->emit[0] = 0x30000000; + pc->emit[1] = 0x60000000; + + pc->emit[1] |= nvi->set_cond << 14; + + switch (STYPE(nvi, 0)) { + case NV_TYPE_U32: pc->emit[1] |= 0x04000000; break; + case NV_TYPE_S32: pc->emit[1] |= 0x0c000000; break; + case NV_TYPE_F32: pc->emit[0] |= 0x80000000; break; + default: + assert(0); + break; + } + + emit_form_MAD(pc, nvi); +} + +#define CVT_RN (0x00 << 16) +#define CVT_FLOOR (0x02 << 16) +#define CVT_CEIL (0x04 << 16) +#define CVT_TRUNC (0x06 << 16) +#define CVT_SAT (0x08 << 16) +#define CVT_ABS (0x10 << 16) + +#define CVT_X32_X32 0x04004000 +#define CVT_X32_S32 0x04014000 +#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32) +#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32) +#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32) +#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32) +#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32) +#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32) +#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32) +#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32) +#define CVT_U32_U32 ((0x00 << 24) | CVT_X32_X32) + +#define CVT_NEG 0x20000000 +#define CVT_RI 0x08000000 + +static void +emit_cvt(struct nv_pc *pc, struct nv_instruction *nvi) +{ + ubyte dst_type = nvi->def[0] ? DTYPE(nvi, 0) : STYPE(nvi, 0); + + pc->emit[0] = 0xa0000000; + + switch (dst_type) { + case NV_TYPE_F32: + switch (STYPE(nvi, 0)) { + case NV_TYPE_F32: pc->emit[1] = CVT_F32_F32; break; + case NV_TYPE_S32: pc->emit[1] = CVT_F32_S32; break; + case NV_TYPE_U32: pc->emit[1] = CVT_F32_U32; break; + } + break; + case NV_TYPE_S32: + switch (STYPE(nvi, 0)) { + case NV_TYPE_F32: pc->emit[1] = CVT_S32_F32; break; + case NV_TYPE_S32: pc->emit[1] = CVT_S32_S32; break; + case NV_TYPE_U32: pc->emit[1] = CVT_S32_U32; break; + } + break; + case NV_TYPE_U32: + switch (STYPE(nvi, 0)) { + case NV_TYPE_F32: pc->emit[1] = CVT_U32_F32; break; + case NV_TYPE_S32: pc->emit[1] = CVT_U32_S32; break; + case NV_TYPE_U32: pc->emit[1] = CVT_U32_U32; break; + } + break; + } + if (pc->emit[1] == CVT_F32_F32 && + (nvi->opcode == NV_OP_CEIL || nvi->opcode == NV_OP_FLOOR || + nvi->opcode == NV_OP_TRUNC)) + pc->emit[1] |= CVT_RI; + + switch (nvi->opcode) { + case NV_OP_CEIL: pc->emit[1] |= CVT_CEIL; break; + case NV_OP_FLOOR: pc->emit[1] |= CVT_FLOOR; break; + case NV_OP_TRUNC: pc->emit[1] |= CVT_TRUNC; break; + + case NV_OP_ABS: pc->emit[1] |= CVT_ABS; break; + case NV_OP_SAT: pc->emit[1] |= CVT_SAT; break; + case NV_OP_NEG: pc->emit[1] |= CVT_NEG; break; + default: + assert(nvi->opcode == NV_OP_CVT); + break; + } + assert(nvi->opcode != NV_OP_ABS || !(nvi->src[0]->mod & NV_MOD_NEG)); + + if (nvi->src[0]->mod & NV_MOD_NEG) pc->emit[1] ^= CVT_NEG; + if (nvi->src[0]->mod & NV_MOD_ABS) pc->emit[1] |= CVT_ABS; + + emit_form_MAD(pc, nvi); +} + +static void +emit_tex(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0xf0000001; + pc->emit[1] = 0x00000000; + + DID(pc, i->def[0], 2); + + set_pred(pc, i); + + pc->emit[0] |= i->tex_t << 9; + pc->emit[0] |= i->tex_s << 17; + + pc->emit[0] |= (i->tex_argc - 1) << 22; + + pc->emit[0] |= (i->tex_mask & 0x3) << 25; + pc->emit[1] |= (i->tex_mask & 0xc) << 12; + + if (i->tex_live) + pc->emit[1] |= 4; + + if (i->tex_cube) + pc->emit[0] |= 0x08000000; + + if (i->opcode == NV_OP_TXB) + pc->emit[1] |= 0x20000000; + else + if (i->opcode == NV_OP_TXL) + pc->emit[1] |= 0x40000000; +} + +static void +emit_cvt2fixed(struct nv_pc *pc, struct nv_instruction *i) +{ + ubyte mod = i->src[0]->mod; + + pc->emit[0] = 0xb0000000; + pc->emit[1] = 0xc0000000; + + if (i->opcode == NV_OP_PREEX2) + pc->emit[1] |= 0x4000; + + emit_form_MAD(pc, i); + + if (mod & NV_MOD_NEG) pc->emit[1] |= 0x04000000; + if (mod & NV_MOD_ABS) pc->emit[1] |= 0x00100000; +} + +static void +emit_ddx(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(i->is_long && SFILE(i, 0) == NV_FILE_GPR); + + pc->emit[0] = (i->src[0]->mod & NV_MOD_NEG) ? 0xc0240001 : 0xc0140001; + pc->emit[1] = (i->src[0]->mod & NV_MOD_NEG) ? 0x86400000 : 0x89800000; + + DID(pc, i->def[0], 2); + SID(pc, i->src[0], 9); + SID(pc, i->src[0], 32 + 14); + + set_pred(pc, i); + set_pred_wr(pc, i); +} + +static void +emit_ddy(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(i->is_long && SFILE(i, 0) == NV_FILE_GPR); + + pc->emit[0] = (i->src[0]->mod & NV_MOD_NEG) ? 0xc0250001 : 0xc0150001; + pc->emit[1] = (i->src[0]->mod & NV_MOD_NEG) ? 0x85800000 : 0x8a400000; + + DID(pc, i->def[0], 2); + SID(pc, i->src[0], 9); + SID(pc, i->src[0], 32 + 14); + + set_pred(pc, i); + set_pred_wr(pc, i); +} + +static void +emit_quadop(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0xc0000000; + pc->emit[1] = 0x80000000; + + emit_form_ADD(pc, i); + + pc->emit[0] |= i->lanes << 16; + + pc->emit[0] |= (i->quadop & 0x03) << 20; + pc->emit[1] |= (i->quadop & 0xfc) << 20; +} + +void +nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i) +{ + /* nv_print_instruction(i); */ + + switch (i->opcode) { + case NV_OP_MOV: + if (DFILE(i, 0) == NV_FILE_ADDR) + emit_add_a16(pc, i); + else + emit_mov(pc, i); + break; + case NV_OP_LDA: + emit_mov(pc, i); + break; + case NV_OP_STA: + emit_st(pc, i); + break; + case NV_OP_LINTERP: + case NV_OP_PINTERP: + emit_interp(pc, i); + break; + case NV_OP_ADD: + emit_add(pc, i); + break; + case NV_OP_AND: + case NV_OP_OR: + case NV_OP_XOR: + emit_bitop2(pc, i); + break; + case NV_OP_CVT: + case NV_OP_ABS: + case NV_OP_NEG: + case NV_OP_SAT: + case NV_OP_CEIL: + case NV_OP_FLOOR: + case NV_OP_TRUNC: + emit_cvt(pc, i); + break; + case NV_OP_DFDX: + emit_ddx(pc, i); + break; + case NV_OP_DFDY: + emit_ddy(pc, i); + break; + case NV_OP_RCP: + case NV_OP_RSQ: + case NV_OP_LG2: + case NV_OP_SIN: + case NV_OP_COS: + case NV_OP_EX2: + emit_flop(pc, i); + break; + case NV_OP_PRESIN: + case NV_OP_PREEX2: + emit_cvt2fixed(pc, i); + break; + case NV_OP_MAD: + emit_mad(pc, i); + break; + case NV_OP_MAX: + case NV_OP_MIN: + emit_minmax(pc, i); + break; + case NV_OP_MUL: + emit_mul_f32(pc, i); + break; + case NV_OP_SET: + emit_set(pc, i); + break; + case NV_OP_SHL: + case NV_OP_SHR: + emit_shift(pc, i); + break; + case NV_OP_TEX: + case NV_OP_TXB: + case NV_OP_TXL: + emit_tex(pc, i); + break; + case NV_OP_QUADOP: + emit_quadop(pc, i); + break; + case NV_OP_KIL: + emit_flow(pc, i, 0x0); + break; + case NV_OP_BRA: + emit_flow(pc, i, 0x1); + break; + case NV_OP_CALL: + emit_flow(pc, i, 0x2); + break; + case NV_OP_RET: + emit_flow(pc, i, 0x3); + break; + case NV_OP_BREAKADDR: + emit_flow(pc, i, 0x4); + break; + case NV_OP_BREAK: + emit_flow(pc, i, 0x5); + break; + case NV_OP_JOINAT: + emit_flow(pc, i, 0xa); + break; + case NV_OP_NOP: + case NV_OP_JOIN: + pc->emit[0] = 0xf0000001; + pc->emit[1] = 0xe0000000; + break; + case NV_OP_PHI: + case NV_OP_UNDEF: + case NV_OP_SUB: + NOUVEAU_ERR("operation \"%s\" should have been eliminated\n", + nv_opcode_name(i->opcode)); + break; + default: + NOUVEAU_ERR("unhandled NV_OP: %d\n", i->opcode); + abort(); + break; + } + + if (i->is_join) { + assert(i->is_long && !(pc->emit[1] & 1)); + pc->emit[1] |= 2; + } + + assert((pc->emit[0] & 1) == i->is_long); +} diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c new file mode 100644 index 00000000000..921ed156919 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -0,0 +1,1154 @@ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* #define NV50PC_DEBUG */ + +#include "nv50_pc.h" + +#define DESCEND_ARBITRARY(j, f) \ +do { \ + b->pass_seq = ctx->pc->pass_seq; \ + \ + for (j = 0; j < 2; ++j) \ + if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq) \ + f(ctx, b->out[j]); \ +} while (0) + +extern unsigned nv50_inst_min_size(struct nv_instruction *); + +struct nv_pc_pass { + struct nv_pc *pc; +}; + +static INLINE boolean +values_equal(struct nv_value *a, struct nv_value *b) +{ + /* XXX: sizes */ + return (a->reg.file == b->reg.file && a->join->reg.id == b->join->reg.id); +} + +static INLINE boolean +inst_commutation_check(struct nv_instruction *a, + struct nv_instruction *b) +{ + int si, di; + + for (di = 0; di < 4; ++di) { + if (!a->def[di]) + break; + for (si = 0; si < 5; ++si) { + if (!b->src[si]) + continue; + if (values_equal(a->def[di], b->src[si]->value)) + return FALSE; + } + } + + if (b->flags_src && b->flags_src->value == a->flags_def) + return FALSE; + + return TRUE; +} + +/* Check whether we can swap the order of the instructions, + * where a & b may be either the earlier or the later one. + */ +static boolean +inst_commutation_legal(struct nv_instruction *a, + struct nv_instruction *b) +{ + return inst_commutation_check(a, b) && inst_commutation_check(b, a); +} + +static INLINE boolean +inst_cullable(struct nv_instruction *nvi) +{ + if (nvi->opcode == NV_OP_STA) + return FALSE; + return (!(nvi->is_terminator || nvi->is_join || + nvi->target || + nvi->fixed || + nv_nvi_refcount(nvi))); +} + +static INLINE boolean +nvi_isnop(struct nv_instruction *nvi) +{ + if (nvi->opcode == NV_OP_EXPORT || nvi->opcode == NV_OP_UNDEF) + return TRUE; + + /* NOTE: 'fixed' now only means that it shouldn't be optimized away, + * but we can still remove it if it is a no-op move. + */ + if (/* nvi->fixed || */ + /* nvi->flags_src || */ /* cond. MOV to same register is still NOP */ + nvi->flags_def || + nvi->is_terminator || + nvi->is_join) + return FALSE; + + if (nvi->def[0] && nvi->def[0]->join->reg.id < 0) + return TRUE; + + if (nvi->opcode != NV_OP_MOV && nvi->opcode != NV_OP_SELECT) + return FALSE; + + if (nvi->def[0]->reg.file != nvi->src[0]->value->reg.file) + return FALSE; + + if (nvi->src[0]->value->join->reg.id < 0) { + NV50_DBGMSG("nvi_isnop: orphaned value detected\n"); + return TRUE; + } + + if (nvi->opcode == NV_OP_SELECT) + if (!values_equal(nvi->def[0], nvi->src[1]->value)) + return FALSE; + + return values_equal(nvi->def[0], nvi->src[0]->value); +} + +struct nv_pass { + struct nv_pc *pc; + int n; + void *priv; +}; + +static int +nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b); + +static void +nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b) +{ + struct nv_pc *pc = (struct nv_pc *)priv; + struct nv_basic_block *in; + struct nv_instruction *nvi, *next; + int j; + uint size, n32 = 0; + + for (j = pc->num_blocks - 1; j >= 0 && !pc->bb_list[j]->bin_size; --j); + if (j >= 0) { + in = pc->bb_list[j]; + + /* check for no-op branches (BRA $PC+8) */ + if (in->exit && in->exit->opcode == NV_OP_BRA && in->exit->target == b) { + in->bin_size -= 8; + pc->bin_size -= 8; + + for (++j; j < pc->num_blocks; ++j) + pc->bb_list[j]->bin_pos -= 8; + + nv_nvi_delete(in->exit); + } + b->bin_pos = in->bin_pos + in->bin_size; + } + + pc->bb_list[pc->num_blocks++] = b; + + /* visit node */ + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + if (nvi_isnop(nvi)) + nv_nvi_delete(nvi); + } + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + + size = nv50_inst_min_size(nvi); + if (nvi->next && size < 8) + ++n32; + else + if ((n32 & 1) && nvi->next && + nv50_inst_min_size(nvi->next) == 4 && + inst_commutation_legal(nvi, nvi->next)) { + ++n32; + nv_nvi_permute(nvi, nvi->next); + next = nvi; + } else { + nvi->is_long = 1; + + b->bin_size += n32 & 1; + if (n32 & 1) + nvi->prev->is_long = 1; + n32 = 0; + } + b->bin_size += 1 + nvi->is_long; + } + + if (!b->entry) { + NV50_DBGMSG("block %p is now empty\n", b); + } else + if (!b->exit->is_long) { + assert(n32); + b->exit->is_long = 1; + b->bin_size += 1; + + /* might have del'd a hole tail of instructions */ + if (!b->exit->prev->is_long && !(n32 & 1)) { + b->bin_size += 1; + b->exit->prev->is_long = 1; + } + } + assert(!b->entry || (b->exit && b->exit->is_long)); + + pc->bin_size += b->bin_size *= 4; +} + +static int +nv_pc_pass2(struct nv_pc *pc, struct nv_basic_block *root) +{ + struct nv_pass pass; + + pass.pc = pc; + + pc->pass_seq++; + + nv_pass_flatten(&pass, root); + + nv_pc_pass_in_order(root, nv_pc_pass_pre_emission, pc); + + return 0; +} + +int +nv_pc_exec_pass2(struct nv_pc *pc) +{ + int i, ret; + + NV50_DBGMSG("preparing %u blocks for emission\n", pc->num_blocks); + + pc->num_blocks = 0; /* will reorder bb_list */ + + for (i = 0; i < pc->num_subroutines + 1; ++i) + if (pc->root[i] && (ret = nv_pc_pass2(pc, pc->root[i]))) + return ret; + return 0; +} + +static INLINE boolean +is_cmem_load(struct nv_instruction *nvi) +{ + return (nvi->opcode == NV_OP_LDA && + nvi->src[0]->value->reg.file >= NV_FILE_MEM_C(0) && + nvi->src[0]->value->reg.file <= NV_FILE_MEM_C(15)); +} + +static INLINE boolean +is_smem_load(struct nv_instruction *nvi) +{ + return (nvi->opcode == NV_OP_LDA && + (nvi->src[0]->value->reg.file == NV_FILE_MEM_S || + nvi->src[0]->value->reg.file <= NV_FILE_MEM_P)); +} + +static INLINE boolean +is_immd_move(struct nv_instruction *nvi) +{ + return (nvi->opcode == NV_OP_MOV && + nvi->src[0]->value->reg.file == NV_FILE_IMM); +} + +static INLINE void +check_swap_src_0_1(struct nv_instruction *nvi) +{ + static const ubyte cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; + + struct nv_ref *src0 = nvi->src[0], *src1 = nvi->src[1]; + + if (!nv_op_commutative(nvi->opcode)) + return; + assert(src0 && src1); + + if (src1->value->reg.file == NV_FILE_IMM) + return; + + if (is_cmem_load(src0->value->insn)) { + if (!is_cmem_load(src1->value->insn)) { + nvi->src[0] = src1; + nvi->src[1] = src0; + /* debug_printf("swapping cmem load to 1\n"); */ + } + } else + if (is_smem_load(src1->value->insn)) { + if (!is_smem_load(src0->value->insn)) { + nvi->src[0] = src1; + nvi->src[1] = src0; + /* debug_printf("swapping smem load to 0\n"); */ + } + } + + if (nvi->opcode == NV_OP_SET && nvi->src[0] != src0) + nvi->set_cond = cc_swapped[nvi->set_cond]; +} + +static int +nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *sti, *next; + int j; + + for (sti = b->entry; sti; sti = next) { + next = sti->next; + + /* only handling MOV to $oX here */ + if (!sti->def[0] || sti->def[0]->reg.file != NV_FILE_OUT) + continue; + if (sti->opcode != NV_OP_MOV && sti->opcode != NV_OP_STA) + continue; + + nvi = sti->src[0]->value->insn; + if (!nvi || nvi->opcode == NV_OP_PHI || nv_is_vector_op(nvi->opcode)) + continue; + assert(nvi->def[0] == sti->src[0]->value); + + if (nvi->opcode == NV_OP_SELECT) + continue; + if (nvi->def[0]->refc > 1) + continue; + + /* cannot write to $oX when using immediate */ + for (j = 0; j < 4 && nvi->src[j]; ++j) + if (nvi->src[j]->value->reg.file == NV_FILE_IMM || + nvi->src[j]->value->reg.file == NV_FILE_MEM_L) + break; + if (j < 4 && nvi->src[j]) + continue; + + nvi->def[0] = sti->def[0]; + nvi->def[0]->insn = nvi; + nvi->fixed = sti->fixed; + + nv_nvi_delete(sti); + } + DESCEND_ARBITRARY(j, nv_pass_fold_stores); + + return 0; +} + +static int +nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *ld; + int j; + + for (nvi = b->entry; nvi; nvi = nvi->next) { + check_swap_src_0_1(nvi); + + for (j = 0; j < 3; ++j) { + if (!nvi->src[j]) + break; + ld = nvi->src[j]->value->insn; + if (!ld) + continue; + + if (is_immd_move(ld) && nv50_nvi_can_use_imm(nvi, j)) { + nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value); + continue; + } + + if (ld->opcode != NV_OP_LDA) + continue; + if (!nv50_nvi_can_load(nvi, j, ld->src[0]->value)) + continue; + + if (j == 0 && ld->src[4]) /* can't load shared mem */ + continue; + + /* fold it ! */ + nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value); + if (ld->src[4]) + nv_reference(ctx->pc, &nvi->src[4], ld->src[4]->value); + + if (!nv_nvi_refcount(ld)) + nv_nvi_delete(ld); + } + } + DESCEND_ARBITRARY(j, nv_pass_fold_loads); + + return 0; +} + +/* NOTE: Assumes loads have not yet been folded. */ +static int +nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b) +{ + int j; + struct nv_instruction *nvi, *mi, *next; + ubyte mod; + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + if (nvi->opcode == NV_OP_SUB) { + nvi->opcode = NV_OP_ADD; + nvi->src[1]->mod ^= NV_MOD_NEG; + } + + for (j = 0; j < 4 && nvi->src[j]; ++j) { + mi = nvi->src[j]->value->insn; + if (!mi) + continue; + if (mi->def[0]->refc > 1) + continue; + + if (mi->opcode == NV_OP_NEG) mod = NV_MOD_NEG; + else + if (mi->opcode == NV_OP_ABS) mod = NV_MOD_ABS; + else + continue; + assert(!(mod & mi->src[0]->mod & NV_MOD_NEG)); + + mod |= mi->src[0]->mod; + + if (mi->flags_def || mi->flags_src) + continue; + + if ((nvi->opcode == NV_OP_ABS) || (nvi->src[j]->mod & NV_MOD_ABS)) { + /* abs neg [abs] = abs */ + mod &= ~(NV_MOD_NEG | NV_MOD_ABS); + } else + if ((nvi->opcode == NV_OP_NEG) && (mod & NV_MOD_NEG)) { + /* neg as opcode and modifier on same insn cannot occur */ + /* neg neg abs = abs, neg neg = identity */ + assert(j == 0); + if (mod & NV_MOD_ABS) + nvi->opcode = NV_OP_ABS; + else + if (nvi->flags_def) + nvi->opcode = NV_OP_CVT; + else + nvi->opcode = NV_OP_MOV; + mod = 0; + } + + if ((nv50_supported_src_mods(nvi->opcode, j) & mod) != mod) + continue; + + nv_reference(ctx->pc, &nvi->src[j], mi->src[0]->value); + + nvi->src[j]->mod ^= mod; + } + + if (nvi->opcode == NV_OP_SAT) { + mi = nvi->src[0]->value->insn; + + if (mi->opcode != NV_OP_ADD || mi->opcode != NV_OP_MAD) + continue; + if (mi->flags_def || mi->def[0]->refc > 1) + continue; + + mi->saturate = 1; + mi->def[0] = nvi->def[0]; + mi->def[0]->insn = mi; + nv_nvi_delete(nvi); + } + } + DESCEND_ARBITRARY(j, nv_pass_lower_mods); + + return 0; +} + +#define SRC_IS_MUL(s) ((s)->insn && (s)->insn->opcode == NV_OP_MUL) + +static void +modifiers_apply(uint32_t *val, ubyte type, ubyte mod) +{ + if (mod & NV_MOD_ABS) { + if (type == NV_TYPE_F32) + *val &= 0x7fffffff; + else + if ((*val) & (1 << 31)) + *val = ~(*val) + 1; + } + if (mod & NV_MOD_NEG) { + if (type == NV_TYPE_F32) + *val ^= 0x80000000; + else + *val = ~(*val) + 1; + } +} + +static INLINE uint +modifiers_opcode(ubyte mod) +{ + switch (mod) { + case NV_MOD_NEG: return NV_OP_NEG; + case NV_MOD_ABS: return NV_OP_ABS; + case 0: + return NV_OP_MOV; + default: + return NV_OP_NOP; + } +} + +static void +constant_expression(struct nv_pc *pc, struct nv_instruction *nvi, + struct nv_value *src0, struct nv_value *src1) +{ + struct nv_value *val; + union { + float f32; + uint32_t u32; + int32_t s32; + } u0, u1, u; + ubyte type; + + if (!nvi->def[0]) + return; + type = nvi->def[0]->reg.type; + + u.u32 = 0; + u0.u32 = src0->reg.imm.u32; + u1.u32 = src1->reg.imm.u32; + + modifiers_apply(&u0.u32, type, nvi->src[0]->mod); + modifiers_apply(&u1.u32, type, nvi->src[1]->mod); + + switch (nvi->opcode) { + case NV_OP_MAD: + if (nvi->src[2]->value->reg.file != NV_FILE_GPR) + return; + /* fall through */ + case NV_OP_MUL: + switch (type) { + case NV_TYPE_F32: u.f32 = u0.f32 * u1.f32; break; + case NV_TYPE_U32: u.u32 = u0.u32 * u1.u32; break; + case NV_TYPE_S32: u.s32 = u0.s32 * u1.s32; break; + default: + assert(0); + break; + } + break; + case NV_OP_ADD: + switch (type) { + case NV_TYPE_F32: u.f32 = u0.f32 + u1.f32; break; + case NV_TYPE_U32: u.u32 = u0.u32 + u1.u32; break; + case NV_TYPE_S32: u.s32 = u0.s32 + u1.s32; break; + default: + assert(0); + break; + } + break; + case NV_OP_SUB: + switch (type) { + case NV_TYPE_F32: u.f32 = u0.f32 - u1.f32; break; + case NV_TYPE_U32: u.u32 = u0.u32 - u1.u32; break; + case NV_TYPE_S32: u.s32 = u0.s32 - u1.s32; break; + default: + assert(0); + break; + } + break; + default: + return; + } + + nvi->opcode = NV_OP_MOV; + + val = new_value(pc, NV_FILE_IMM, type); + + val->reg.imm.u32 = u.u32; + + nv_reference(pc, &nvi->src[1], NULL); + nv_reference(pc, &nvi->src[0], val); + + if (nvi->src[2]) { /* from MAD */ + nvi->src[1] = nvi->src[0]; + nvi->src[0] = nvi->src[2]; + nvi->src[2] = NULL; + nvi->opcode = NV_OP_ADD; + + if (val->reg.imm.u32 == 0) { + nvi->src[1] = NULL; + nvi->opcode = NV_OP_MOV; + } + } +} + +static void +constant_operand(struct nv_pc *pc, + struct nv_instruction *nvi, struct nv_value *val, int s) +{ + union { + float f32; + uint32_t u32; + int32_t s32; + } u; + int t = s ? 0 : 1; + uint op; + ubyte type; + + if (!nvi->def[0]) + return; + type = nvi->def[0]->reg.type; + + u.u32 = val->reg.imm.u32; + modifiers_apply(&u.u32, type, nvi->src[s]->mod); + + switch (nvi->opcode) { + case NV_OP_MUL: + if ((type == NV_TYPE_F32 && u.f32 == 1.0f) || + (NV_TYPE_ISINT(type) && u.u32 == 1)) { + if ((op = modifiers_opcode(nvi->src[t]->mod)) == NV_OP_NOP) + break; + nvi->opcode = op; + nv_reference(pc, &nvi->src[s], NULL); + nvi->src[0] = nvi->src[t]; + nvi->src[1] = NULL; + } else + if ((type == NV_TYPE_F32 && u.f32 == 2.0f) || + (NV_TYPE_ISINT(type) && u.u32 == 2)) { + nvi->opcode = NV_OP_ADD; + nv_reference(pc, &nvi->src[s], nvi->src[t]->value); + nvi->src[s]->mod = nvi->src[t]->mod; + } else + if (type == NV_TYPE_F32 && u.f32 == -1.0f) { + if (nvi->src[t]->mod & NV_MOD_NEG) + nvi->opcode = NV_OP_MOV; + else + nvi->opcode = NV_OP_NEG; + nv_reference(pc, &nvi->src[s], NULL); + nvi->src[0] = nvi->src[t]; + nvi->src[1] = NULL; + } else + if (type == NV_TYPE_F32 && u.f32 == -2.0f) { + nvi->opcode = NV_OP_ADD; + nv_reference(pc, &nvi->src[s], nvi->src[t]->value); + nvi->src[s]->mod = (nvi->src[t]->mod ^= NV_MOD_NEG); + } else + if (u.u32 == 0) { + nvi->opcode = NV_OP_MOV; + nv_reference(pc, &nvi->src[t], NULL); + if (s) { + nvi->src[0] = nvi->src[1]; + nvi->src[1] = NULL; + } + } + break; + case NV_OP_ADD: + if (u.u32 == 0) { + if ((op = modifiers_opcode(nvi->src[t]->mod)) == NV_OP_NOP) + break; + nvi->opcode = op; + nv_reference(pc, &nvi->src[s], NULL); + nvi->src[0] = nvi->src[t]; + nvi->src[1] = NULL; + } + break; + case NV_OP_RCP: + u.f32 = 1.0f / u.f32; + (val = new_value(pc, NV_FILE_IMM, NV_TYPE_F32))->reg.imm.f32 = u.f32; + nvi->opcode = NV_OP_MOV; + assert(s == 0); + nv_reference(pc, &nvi->src[0], val); + break; + case NV_OP_RSQ: + u.f32 = 1.0f / sqrtf(u.f32); + (val = new_value(pc, NV_FILE_IMM, NV_TYPE_F32))->reg.imm.f32 = u.f32; + nvi->opcode = NV_OP_MOV; + assert(s == 0); + nv_reference(pc, &nvi->src[0], val); + break; + default: + break; + } + + if (nvi->opcode == NV_OP_MOV && nvi->flags_def) { + struct nv_instruction *cvt = new_instruction_at(pc, nvi, NV_OP_CVT); + + nv_reference(pc, &cvt->src[0], nvi->def[0]); + + cvt->flags_def = nvi->flags_def; + nvi->flags_def = NULL; + } +} + +static int +nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *next; + int j; + + for (nvi = b->entry; nvi; nvi = next) { + struct nv_value *src0, *src1, *src; + int mod; + + next = nvi->next; + + src0 = nvcg_find_immediate(nvi->src[0]); + src1 = nvcg_find_immediate(nvi->src[1]); + + if (src0 && src1) + constant_expression(ctx->pc, nvi, src0, src1); + else { + if (src0) + constant_operand(ctx->pc, nvi, src0, 0); + else + if (src1) + constant_operand(ctx->pc, nvi, src1, 1); + } + + /* try to combine MUL, ADD into MAD */ + if (nvi->opcode != NV_OP_ADD) + continue; + + src0 = nvi->src[0]->value; + src1 = nvi->src[1]->value; + + if (SRC_IS_MUL(src0) && src0->refc == 1) + src = src0; + else + if (SRC_IS_MUL(src1) && src1->refc == 1) + src = src1; + else + continue; + + /* could have an immediate from above constant_* */ + if (src0->reg.file != NV_FILE_GPR || src1->reg.file != NV_FILE_GPR) + continue; + + nvi->opcode = NV_OP_MAD; + mod = nvi->src[(src == src0) ? 0 : 1]->mod; + nv_reference(ctx->pc, &nvi->src[(src == src0) ? 0 : 1], NULL); + nvi->src[2] = nvi->src[(src == src0) ? 1 : 0]; + + assert(!(mod & ~NV_MOD_NEG)); + nvi->src[0] = new_ref(ctx->pc, src->insn->src[0]->value); + nvi->src[1] = new_ref(ctx->pc, src->insn->src[1]->value); + nvi->src[0]->mod = src->insn->src[0]->mod ^ mod; + nvi->src[1]->mod = src->insn->src[1]->mod; + } + DESCEND_ARBITRARY(j, nv_pass_lower_arith); + + return 0; +} + +/* TODO: redundant store elimination */ + +struct load_record { + struct load_record *next; + uint64_t data[2]; + struct nv_value *value; +}; + +#define LOAD_RECORD_POOL_SIZE 1024 + +struct nv_pass_reld_elim { + struct nv_pc *pc; + + struct load_record *imm; + struct load_record *mem_s; + struct load_record *mem_v; + struct load_record *mem_c[16]; + struct load_record *mem_l; + + struct load_record pool[LOAD_RECORD_POOL_SIZE]; + int alloc; +}; + +/* TODO: properly handle loads from l[] memory in the presence of stores */ +static int +nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b) +{ + struct load_record **rec, *it; + struct nv_instruction *ld, *next; + uint64_t data[2]; + struct nv_value *val; + int j; + + for (ld = b->entry; ld; ld = next) { + next = ld->next; + if (!ld->src[0]) + continue; + val = ld->src[0]->value; + rec = NULL; + + if (ld->opcode == NV_OP_LINTERP || ld->opcode == NV_OP_PINTERP) { + data[0] = val->reg.id; + data[1] = 0; + rec = &ctx->mem_v; + } else + if (ld->opcode == NV_OP_LDA) { + data[0] = val->reg.id; + data[1] = ld->src[4] ? ld->src[4]->value->n : ~0ULL; + if (val->reg.file >= NV_FILE_MEM_C(0) && + val->reg.file <= NV_FILE_MEM_C(15)) + rec = &ctx->mem_c[val->reg.file - NV_FILE_MEM_C(0)]; + else + if (val->reg.file == NV_FILE_MEM_S) + rec = &ctx->mem_s; + else + if (val->reg.file == NV_FILE_MEM_L) + rec = &ctx->mem_l; + } else + if ((ld->opcode == NV_OP_MOV) && (val->reg.file == NV_FILE_IMM)) { + data[0] = val->reg.imm.u32; + data[1] = 0; + rec = &ctx->imm; + } + + if (!rec || !ld->def[0]->refc) + continue; + + for (it = *rec; it; it = it->next) + if (it->data[0] == data[0] && it->data[1] == data[1]) + break; + + if (it) { + if (ld->def[0]->reg.id >= 0) + it->value = ld->def[0]; + else + if (!ld->fixed) + nvcg_replace_value(ctx->pc, ld->def[0], it->value); + } else { + if (ctx->alloc == LOAD_RECORD_POOL_SIZE) + continue; + it = &ctx->pool[ctx->alloc++]; + it->next = *rec; + it->data[0] = data[0]; + it->data[1] = data[1]; + it->value = ld->def[0]; + *rec = it; + } + } + + ctx->imm = NULL; + ctx->mem_s = NULL; + ctx->mem_v = NULL; + for (j = 0; j < 16; ++j) + ctx->mem_c[j] = NULL; + ctx->mem_l = NULL; + ctx->alloc = 0; + + DESCEND_ARBITRARY(j, nv_pass_reload_elim); + + return 0; +} + +static int +nv_pass_tex_mask(struct nv_pass *ctx, struct nv_basic_block *b) +{ + int i, c, j; + + for (i = 0; i < ctx->pc->num_instructions; ++i) { + struct nv_instruction *nvi = &ctx->pc->instructions[i]; + struct nv_value *def[4]; + + if (!nv_is_vector_op(nvi->opcode)) + continue; + nvi->tex_mask = 0; + + for (c = 0; c < 4; ++c) { + if (nvi->def[c]->refc) + nvi->tex_mask |= 1 << c; + def[c] = nvi->def[c]; + } + + j = 0; + for (c = 0; c < 4; ++c) + if (nvi->tex_mask & (1 << c)) + nvi->def[j++] = def[c]; + for (c = 0; c < 4; ++c) + if (!(nvi->tex_mask & (1 << c))) + nvi->def[j++] = def[c]; + assert(j == 4); + } + return 0; +} + +struct nv_pass_dce { + struct nv_pc *pc; + uint removed; +}; + +static int +nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b) +{ + int j; + struct nv_instruction *nvi, *next; + + for (nvi = b->phi ? b->phi : b->entry; nvi; nvi = next) { + next = nvi->next; + + if (inst_cullable(nvi)) { + nv_nvi_delete(nvi); + + ++ctx->removed; + } + } + DESCEND_ARBITRARY(j, nv_pass_dce); + + return 0; +} + +/* Register allocation inserted ELSE blocks for all IF/ENDIF without ELSE. + * Returns TRUE if @bb initiates an IF/ELSE/ENDIF clause, or is an IF with + * BREAK and dummy ELSE block. + */ +static INLINE boolean +bb_is_if_else_endif(struct nv_basic_block *bb) +{ + if (!bb->out[0] || !bb->out[1]) + return FALSE; + + if (bb->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) { + return (bb->out[0]->out[1] == bb->out[1]->out[0] && + !bb->out[1]->out[1]); + } else { + return (bb->out[0]->out[0] == bb->out[1]->out[0] && + !bb->out[0]->out[1] && + !bb->out[1]->out[1]); + } +} + +/* predicate instructions and remove branch at the end */ +static void +predicate_instructions(struct nv_pc *pc, struct nv_basic_block *b, + struct nv_value *p, ubyte cc) +{ + struct nv_instruction *nvi; + + if (!b->entry) + return; + for (nvi = b->entry; nvi->next; nvi = nvi->next) { + if (!nvi_isnop(nvi)) { + nvi->cc = cc; + nv_reference(pc, &nvi->flags_src, p); + } + } + + if (nvi->opcode == NV_OP_BRA) + nv_nvi_delete(nvi); + else + if (!nvi_isnop(nvi)) { + nvi->cc = cc; + nv_reference(pc, &nvi->flags_src, p); + } +} + +/* NOTE: Run this after register allocation, we can just cut out the cflow + * instructions and hook the predicates to the conditional OPs if they are + * not using immediates; better than inserting SELECT to join definitions. + * + * NOTE: Should adapt prior optimization to make this possible more often. + */ +static int +nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *nvi; + struct nv_value *pred; + int i; + int n0 = 0, n1 = 0; + + if (bb_is_if_else_endif(b)) { + + NV50_DBGMSG("pass_flatten: IF/ELSE/ENDIF construct at BB:%i\n", b->id); + + for (n0 = 0, nvi = b->out[0]->entry; nvi; nvi = nvi->next, ++n0) + if (!nv50_nvi_can_predicate(nvi)) + break; + if (!nvi) { + for (n1 = 0, nvi = b->out[1]->entry; nvi; nvi = nvi->next, ++n1) + if (!nv50_nvi_can_predicate(nvi)) + break; +#ifdef NV50PC_DEBUG + if (nvi) { + debug_printf("cannot predicate: "); nv_print_instruction(nvi); + } + } else { + debug_printf("cannot predicate: "); nv_print_instruction(nvi); +#endif + } + + if (!nvi && n0 < 12 && n1 < 12) { /* 12 as arbitrary limit */ + assert(b->exit && b->exit->flags_src); + pred = b->exit->flags_src->value; + + predicate_instructions(ctx->pc, b->out[0], pred, NV_CC_NE | NV_CC_U); + predicate_instructions(ctx->pc, b->out[1], pred, NV_CC_EQ); + + assert(b->exit && b->exit->opcode == NV_OP_BRA); + nv_nvi_delete(b->exit); + + if (b->exit && b->exit->opcode == NV_OP_JOINAT) + nv_nvi_delete(b->exit); + + i = (b->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) ? 1 : 0; + + if ((nvi = b->out[0]->out[i]->entry)) { + nvi->is_join = 0; + if (nvi->opcode == NV_OP_JOIN) + nv_nvi_delete(nvi); + } + } + } + DESCEND_ARBITRARY(i, nv_pass_flatten); + + return 0; +} + +/* local common subexpression elimination, stupid O(n^2) implementation */ +static int +nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *ir, *ik, *next; + struct nv_instruction *entry = b->phi ? b->phi : b->entry; + int s; + unsigned int reps; + + do { + reps = 0; + for (ir = entry; ir; ir = next) { + next = ir->next; + for (ik = entry; ik != ir; ik = ik->next) { + if (ir->opcode != ik->opcode || ir->fixed) + continue; + + if (!ir->def[0] || !ik->def[0] || + ik->opcode == NV_OP_LDA || + ik->opcode == NV_OP_STA || + ik->opcode == NV_OP_MOV || + nv_is_vector_op(ik->opcode)) + continue; /* ignore loads, stores & moves */ + + if (ik->src[4] || ir->src[4]) + continue; /* don't mess with address registers */ + + if (ik->flags_src || ir->flags_src || + ik->flags_def || ir->flags_def) + continue; /* and also not with flags, for now */ + + if (ik->def[0]->reg.file == NV_FILE_OUT || + ir->def[0]->reg.file == NV_FILE_OUT || + !values_equal(ik->def[0], ir->def[0])) + continue; + + for (s = 0; s < 3; ++s) { + struct nv_value *a, *b; + + if (!ik->src[s]) { + if (ir->src[s]) + break; + continue; + } + if (ik->src[s]->mod != ir->src[s]->mod) + break; + a = ik->src[s]->value; + b = ir->src[s]->value; + if (a == b) + continue; + if (a->reg.file != b->reg.file || + a->reg.id < 0 || + a->reg.id != b->reg.id) + break; + } + if (s == 3) { + nv_nvi_delete(ir); + ++reps; + nvcg_replace_value(ctx->pc, ir->def[0], ik->def[0]); + break; + } + } + } + } while(reps); + + DESCEND_ARBITRARY(s, nv_pass_cse); + + return 0; +} + +static int +nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root) +{ + struct nv_pass_reld_elim *reldelim; + struct nv_pass pass; + struct nv_pass_dce dce; + int ret; + + pass.n = 0; + pass.pc = pc; + + /* Do this first, so we don't have to pay attention + * to whether sources are supported memory loads. + */ + pc->pass_seq++; + ret = nv_pass_lower_arith(&pass, root); + if (ret) + return ret; + + pc->pass_seq++; + ret = nv_pass_lower_mods(&pass, root); + if (ret) + return ret; + + pc->pass_seq++; + ret = nv_pass_fold_loads(&pass, root); + if (ret) + return ret; + + pc->pass_seq++; + ret = nv_pass_fold_stores(&pass, root); + if (ret) + return ret; + + if (pc->opt_reload_elim) { + reldelim = CALLOC_STRUCT(nv_pass_reld_elim); + reldelim->pc = pc; + pc->pass_seq++; + ret = nv_pass_reload_elim(reldelim, root); + FREE(reldelim); + if (ret) + return ret; + } + + pc->pass_seq++; + ret = nv_pass_cse(&pass, root); + if (ret) + return ret; + + dce.pc = pc; + do { + dce.removed = 0; + pc->pass_seq++; + ret = nv_pass_dce(&dce, root); + if (ret) + return ret; + } while (dce.removed); + + ret = nv_pass_tex_mask(&pass, root); + if (ret) + return ret; + + return ret; +} + +int +nv_pc_exec_pass0(struct nv_pc *pc) +{ + int i, ret; + + for (i = 0; i < pc->num_subroutines + 1; ++i) + if (pc->root[i] && (ret = nv_pc_pass0(pc, pc->root[i]))) + return ret; + return 0; +} diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c new file mode 100644 index 00000000000..984f6cbe172 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -0,0 +1,320 @@ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "nv50_context.h" +#include "nv50_pc.h" + +#define NVXX_DEBUG 0 + +#define PRINT(args...) debug_printf(args) + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) +#endif + +static const char *norm = "\x1b[00m"; +static const char *gree = "\x1b[32m"; +static const char *blue = "\x1b[34m"; +static const char *cyan = "\x1b[36m"; +static const char *orng = "\x1b[33m"; +static const char *mgta = "\x1b[35m"; + +static const char *nv_opcode_names[NV_OP_COUNT + 1] = { + "phi", + "extract", + "combine", + "lda", + "sta", + "mov", + "add", + "sub", + "neg", + "mul", + "mad", + "cvt", + "sat", + "not", + "and", + "or", + "xor", + "shl", + "shr", + "rcp", + "undef", + "rsqrt", + "lg2", + "sin", + "cos", + "ex2", + "presin", + "preex2", + "min", + "max", + "set", + "sad", + "kil", + "bra", + "call", + "ret", + "break", + "breakaddr", + "joinat", + "tex", + "texbias", + "texlod", + "texfetch", + "texsize", + "dfdx", + "dfdy", + "quadop", + "linterp", + "pinterp", + "abs", + "ceil", + "floor", + "trunc", + "nop", + "select", + "export", + "join", + "BAD_OP" +}; + +static const char *nv_cond_names[] = +{ + "never", "lt" , "eq" , "le" , "gt" , "ne" , "ge" , "", + "never", "ltu", "equ", "leu", "gtu", "neu", "geu", "", + "o", "c", "a", "s" +}; + +static const char *nv_modifier_strings[] = +{ + "", + "neg", + "abs", + "neg abs", + "not", + "not neg" + "not abs", + "not neg abs", + "sat", + "BAD_MOD" +}; + +const char * +nv_opcode_name(uint opcode) +{ + return nv_opcode_names[MIN2(opcode, ARRAY_SIZE(nv_opcode_names) - 1)]; +} + +static INLINE const char * +nv_type_name(ubyte type) +{ + switch (type) { + case NV_TYPE_U16: return "u16"; + case NV_TYPE_S16: return "s16"; + case NV_TYPE_F32: return "f32"; + case NV_TYPE_U32: return "u32"; + case NV_TYPE_S32: return "s32"; + case NV_TYPE_P32: return "p32"; + case NV_TYPE_F64: return "f64"; + default: + return "BAD_TYPE"; + } +} + +static INLINE const char * +nv_cond_name(ubyte cc) +{ + return nv_cond_names[MIN2(cc, 19)]; +} + +static INLINE const char * +nv_modifier_string(ubyte mod) +{ + return nv_modifier_strings[MIN2(mod, 9)]; +} + +static INLINE int +nv_value_id(struct nv_value *value) +{ + if (value->join->reg.id >= 0) + return value->join->reg.id; + return value->n; +} + +static INLINE boolean +nv_value_allocated(struct nv_value *value) +{ + return (value->reg.id >= 0) ? TRUE : FALSE; +} + +static INLINE void +nv_print_address(const char c, int buf, struct nv_value *a, int offset) +{ + const char ac = (a && nv_value_allocated(a)) ? '$' : '%'; + + if (buf >= 0) + PRINT(" %s%c%i[", cyan, c, buf); + else + PRINT(" %s%c[", cyan, c); + if (a) + PRINT("%s%ca%i%s+", mgta, ac, nv_value_id(a), cyan); + PRINT("%s0x%x%s]", orng, offset, cyan); +} + +static INLINE void +nv_print_cond(struct nv_instruction *nvi) +{ + char pfx = nv_value_allocated(nvi->flags_src->value->join) ? '$' : '%'; + + PRINT("%s%s %s%cc%i ", + gree, nv_cond_name(nvi->cc), + mgta, pfx, nv_value_id(nvi->flags_src->value)); +} + +static INLINE void +nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type) +{ + char reg_pfx = '$'; + + if (type == NV_TYPE_ANY) + type = value->reg.type; + + if (value->reg.file != NV_FILE_FLAGS) + PRINT(" %s%s", gree, nv_type_name(type)); + + if (!nv_value_allocated(value->join)) + reg_pfx = '%'; + + switch (value->reg.file) { + case NV_FILE_GPR: + PRINT(" %s%cr%i", blue, reg_pfx, nv_value_id(value)); + break; + case NV_FILE_OUT: + PRINT(" %s%co%i", mgta, reg_pfx, nv_value_id(value)); + break; + case NV_FILE_ADDR: + PRINT(" %s%ca%i", mgta, reg_pfx, nv_value_id(value)); + break; + case NV_FILE_FLAGS: + PRINT(" %s%cc%i", mgta, reg_pfx, nv_value_id(value)); + break; + case NV_FILE_MEM_L: + nv_print_address('l', -1, ind, nv_value_id(value)); + break; + case NV_FILE_MEM_S: + nv_print_address('s', -1, ind, 4 * nv_value_id(value)); + break; + case NV_FILE_MEM_P: + nv_print_address('p', -1, ind, 4 * nv_value_id(value)); + break; + case NV_FILE_MEM_V: + nv_print_address('v', -1, ind, 4 * nv_value_id(value)); + break; + case NV_FILE_IMM: + switch (type) { + case NV_TYPE_U16: + case NV_TYPE_S16: + PRINT(" %s0x%04x", orng, value->reg.imm.u32); + break; + case NV_TYPE_F32: + PRINT(" %s%f", orng, value->reg.imm.f32); + break; + case NV_TYPE_F64: + PRINT(" %s%f", orng, value->reg.imm.f64); + break; + case NV_TYPE_U32: + case NV_TYPE_S32: + case NV_TYPE_P32: + PRINT(" %s0x%08x", orng, value->reg.imm.u32); + break; + } + break; + default: + if (value->reg.file >= NV_FILE_MEM_G(0) && + value->reg.file <= NV_FILE_MEM_G(15)) + nv_print_address('g', value->reg.file - NV_FILE_MEM_G(0), ind, + nv_value_id(value) * 4); + else + if (value->reg.file >= NV_FILE_MEM_C(0) && + value->reg.file <= NV_FILE_MEM_C(15)) + nv_print_address('c', value->reg.file - NV_FILE_MEM_C(0), ind, + nv_value_id(value) * 4); + else + NOUVEAU_ERR(" BAD_FILE[%i]", nv_value_id(value)); + break; + } +} + +static INLINE void +nv_print_ref(struct nv_ref *ref, struct nv_value *ind) +{ + nv_print_value(ref->value, ind, ref->typecast); +} + +void +nv_print_instruction(struct nv_instruction *i) +{ + int j; + + PRINT("%i: ", i->serial); + + if (i->flags_src) + nv_print_cond(i); + + PRINT("%s", gree); + if (i->opcode == NV_OP_SET) + PRINT("set %s", nv_cond_name(i->set_cond)); + else + if (i->saturate) + PRINT("sat %s", nv_opcode_name(i->opcode)); + else + PRINT("%s", nv_opcode_name(i->opcode)); + + if (i->flags_def) + nv_print_value(i->flags_def, NULL, NV_TYPE_ANY); + + /* Only STORE & STA can write to MEM, and they do not def + * anything, so the address is thus part of the source. + */ + if (i->def[0]) + nv_print_value(i->def[0], NULL, NV_TYPE_ANY); + else + if (i->target) + PRINT(" %s(BB:%i)", orng, i->target->id); + else + PRINT(" #"); + + for (j = 0; j < 4; ++j) { + if (!i->src[j]) + continue; + + if (i->src[j]->mod) + PRINT(" %s%s", gree, nv_modifier_string(i->src[j]->mod)); + + nv_print_ref(i->src[j], + (j == nv50_indirect_opnd(i)) ? + i->src[4]->value : NULL); + } + PRINT(" %s%c\n", norm, i->is_long ? 'l' : 's'); +} diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c new file mode 100644 index 00000000000..b9d5ba5ef67 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -0,0 +1,962 @@ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* #define NV50PC_DEBUG */ + +/* #define NV50_RA_DEBUG_LIVEI */ +/* #define NV50_RA_DEBUG_LIVE_SETS */ +/* #define NV50_RA_DEBUG_JOIN */ + +#include "nv50_context.h" +#include "nv50_pc.h" + +#include "util/u_simple_list.h" + +#define NUM_REGISTER_FILES 4 + +struct register_set { + struct nv_pc *pc; + + uint32_t last[NUM_REGISTER_FILES]; + uint32_t bits[NUM_REGISTER_FILES][8]; +}; + +struct nv_pc_pass { + struct nv_pc *pc; + + struct nv_instruction **insns; + int num_insns; + + uint pass_seq; +}; + +static void +ranges_coalesce(struct nv_range *range) +{ + while (range->next && range->end >= range->next->bgn) { + struct nv_range *rnn = range->next->next; + assert(range->bgn <= range->next->bgn); + range->end = MAX2(range->end, range->next->end); + FREE(range->next); + range->next = rnn; + } +} + +static boolean +add_range_ex(struct nv_value *val, int bgn, int end, struct nv_range *new_range) +{ + struct nv_range *range, **nextp = &val->livei; + + for (range = val->livei; range; range = range->next) { + if (end < range->bgn) + break; /* insert before */ + + if (bgn > range->end) { + nextp = &range->next; + continue; /* insert after */ + } + + /* overlap */ + if (bgn < range->bgn) { + range->bgn = bgn; + if (end > range->end) + range->end = end; + ranges_coalesce(range); + return TRUE; + } + if (end > range->end) { + range->end = end; + ranges_coalesce(range); + return TRUE; + } + assert(bgn >= range->bgn); + assert(end <= range->end); + return TRUE; + } + + if (!new_range) + new_range = CALLOC_STRUCT(nv_range); + + new_range->bgn = bgn; + new_range->end = end; + new_range->next = range; + *(nextp) = new_range; + return FALSE; +} + +static void +add_range(struct nv_value *val, struct nv_basic_block *b, int end) +{ + int bgn; + + if (!val->insn) /* ignore non-def values */ + return; + assert(b->entry->serial <= b->exit->serial); + assert(b->phi->serial <= end); + assert(b->exit->serial + 1 >= end); + + bgn = val->insn->serial; + if (bgn < b->entry->serial || bgn > b->exit->serial) + bgn = b->entry->serial; + + assert(bgn <= end); + + add_range_ex(val, bgn, end, NULL); +} + +#if defined(NV50_RA_DEBUG_JOIN) || defined(NV50_RA_DEBUG_LIVEI) +static void +livei_print(struct nv_value *a) +{ + struct nv_range *r = a->livei; + + debug_printf("livei %i: ", a->n); + while (r) { + debug_printf("[%i, %i) ", r->bgn, r->end); + r = r->next; + } + debug_printf("\n"); +} +#endif + +static void +livei_unify(struct nv_value *dst, struct nv_value *src) +{ + struct nv_range *range, *next; + + for (range = src->livei; range; range = next) { + next = range->next; + if (add_range_ex(dst, range->bgn, range->end, range)) + FREE(range); + } + src->livei = NULL; +} + +static void +livei_release(struct nv_value *val) +{ + struct nv_range *range, *next; + + for (range = val->livei; range; range = next) { + next = range->next; + FREE(range); + } +} + +static boolean +livei_have_overlap(struct nv_value *a, struct nv_value *b) +{ + struct nv_range *r_a, *r_b; + + for (r_a = a->livei; r_a; r_a = r_a->next) { + for (r_b = b->livei; r_b; r_b = r_b->next) { + if (r_b->bgn < r_a->end && + r_b->end > r_a->bgn) + return TRUE; + } + } + return FALSE; +} + +static int +livei_end(struct nv_value *a) +{ + struct nv_range *r = a->livei; + + assert(r); + while (r->next) + r = r->next; + return r->end; +} + +static boolean +livei_contains(struct nv_value *a, int pos) +{ + struct nv_range *r; + + for (r = a->livei; r && r->bgn <= pos; r = r->next) + if (r->end > pos) + return TRUE; + return FALSE; +} + +static boolean +reg_assign(struct register_set *set, struct nv_value **def, int n) +{ + int i, id, s; + uint m; + int f = def[0]->reg.file; + + s = n << (nv_type_order(def[0]->reg.type) - 1); + m = (1 << s) - 1; + + id = set->last[f]; + + for (i = 0; i * 32 < set->last[f]; ++i) { + if (set->bits[f][i] == 0xffffffff) + continue; + + for (id = 0; id < 32; id += s) + if (!(set->bits[f][i] & (m << id))) + break; + if (id < 32) + break; + } + if (i * 32 + id > set->last[f]) + return FALSE; + + set->bits[f][i] |= m << id; + + id += i * 32; + + set->pc->max_reg[f] = MAX2(set->pc->max_reg[f], id + s - 1); + + id >>= nv_type_order(def[0]->reg.type) - 1; + + for (i = 0; i < n; ++i) + if (def[i]->livei) + def[i]->reg.id = id++; + + return TRUE; +} + +static INLINE void +reg_occupy(struct register_set *set, struct nv_value *val) +{ + int s, id = val->reg.id, f = val->reg.file; + uint m; + + if (id < 0) + return; + s = nv_type_order(val->reg.type) - 1; + id <<= s; + m = (1 << (1 << s)) - 1; + + set->bits[f][id / 32] |= m << (id % 32); + + if (set->pc->max_reg[f] < id) + set->pc->max_reg[f] = id; +} + +static INLINE void +reg_release(struct register_set *set, struct nv_value *val) +{ + int s, id = val->reg.id, f = val->reg.file; + uint m; + + if (id < 0) + return; + + s = nv_type_order(val->reg.type) - 1; + id <<= s; + m = (1 << (1 << s)) - 1; + + set->bits[f][id / 32] &= ~(m << (id % 32)); +} + +static INLINE boolean +join_allowed(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) +{ + int i; + struct nv_value *val; + + if (a->reg.file != b->reg.file || + nv_type_sizeof(a->reg.type) != nv_type_sizeof(b->reg.type)) + return FALSE; + + if (a->join->reg.id == b->join->reg.id) + return TRUE; + +#if 1 + /* either a or b or both have been assigned */ + + if (a->join->reg.id >= 0 && b->join->reg.id >= 0) + return FALSE; + else + if (b->join->reg.id >= 0) { + if (a->join->reg.id >= 0) + return FALSE; + val = a; + a = b; + b = val; + } + + for (i = 0; i < ctx->pc->num_values; ++i) { + val = &ctx->pc->values[i]; + + if (val->join->reg.id != a->join->reg.id) + continue; + if (val->join != a->join && livei_have_overlap(val->join, b->join)) + return FALSE; + } + return TRUE; +#endif + return FALSE; +} + +static INLINE void +do_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) +{ + int j; + struct nv_value *bjoin = b->join; + + if (b->join->reg.id >= 0) + a->join->reg.id = b->join->reg.id; + + livei_unify(a->join, b->join); + +#ifdef NV50_RA_DEBUG_JOIN + debug_printf("joining %i to %i\n", b->n, a->n); +#endif + + /* make a->join the new representative */ + for (j = 0; j < ctx->pc->num_values; ++j) + if (ctx->pc->values[j].join == bjoin) + ctx->pc->values[j].join = a->join; + + assert(b->join == a->join); +} + +static INLINE void +try_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) +{ + if (!join_allowed(ctx, a, b)) { +#ifdef NV50_RA_DEBUG_JOIN + debug_printf("cannot join %i to %i: not allowed\n", b->n, a->n); +#endif + return; + } + if (livei_have_overlap(a->join, b->join)) { +#ifdef NV50_RA_DEBUG_JOIN + debug_printf("cannot join %i to %i: livei overlap\n", b->n, a->n); + livei_print(a); + livei_print(b); +#endif + return; + } + + do_join_values(ctx, a, b); +} + +static INLINE boolean +need_new_else_block(struct nv_basic_block *b, struct nv_basic_block *p) +{ + int i = 0, n = 0; + + for (; i < 2; ++i) + if (p->out[i] && !IS_LOOP_EDGE(p->out_kind[i])) + ++n; + + return (b->num_in > 1) && (n == 2); +} + +static int +phi_opnd_for_bb(struct nv_instruction *phi, struct nv_basic_block *b, + struct nv_basic_block *tb) +{ + int i, j; + + for (j = -1, i = 0; i < 4 && phi->src[i]; ++i) { + if (!nvbb_reachable_by(b, phi->src[i]->value->insn->bb, tb)) + continue; + /* NOTE: back-edges are ignored by the reachable-by check */ + if (j < 0 || !nvbb_reachable_by(phi->src[j]->value->insn->bb, + phi->src[i]->value->insn->bb, tb)) + j = i; + } + return j; +} + +/* For each operand of each PHI in b, generate a new value by inserting a MOV + * at the end of the block it is coming from and replace the operand with its + * result. This eliminates liveness conflicts and enables us to let values be + * copied to the right register if such a conflict exists nonetheless. + * + * These MOVs are also crucial in making sure the live intervals of phi srces + * are extended until the end of the loop, since they are not included in the + * live-in sets. + */ +static int +pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *i, *ni; + struct nv_value *val; + struct nv_basic_block *p, *pn; + int n, j; + + b->pass_seq = ctx->pc->pass_seq; + + for (n = 0; n < b->num_in; ++n) { + p = pn = b->in[n]; + assert(p); + + if (need_new_else_block(b, p)) { + pn = new_basic_block(ctx->pc); + + if (p->out[0] == b) + p->out[0] = pn; + else + p->out[1] = pn; + + if (p->exit->target == b) /* target to new else-block */ + p->exit->target = pn; + + b->in[n] = pn; + + pn->out[0] = b; + pn->in[0] = p; + pn->num_in = 1; + } + ctx->pc->current_block = pn; + + for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) { + if ((j = phi_opnd_for_bb(i, p, b)) < 0) + continue; + val = i->src[j]->value; + + if (i->src[j]->flags) { + val = val->insn->src[0]->value; + while (j < 4 && i->src[j]) + ++j; + assert(j < 4); + } + + ni = new_instruction(ctx->pc, NV_OP_MOV); + + /* TODO: insert instruction at correct position in the first place */ + if (ni->prev && ni->prev->target) + nv_nvi_permute(ni->prev, ni); + + ni->def[0] = new_value(ctx->pc, val->reg.file, val->reg.type); + ni->def[0]->insn = ni; + ni->src[0] = new_ref(ctx->pc, val); + + nv_reference(ctx->pc, &i->src[j], ni->def[0]); + + i->src[j]->flags = 1; + } + + if (pn != p && pn->exit) { + ctx->pc->current_block = b->in[n ? 0 : 1]; + ni = new_instruction(ctx->pc, NV_OP_BRA); + ni->target = b; + ni->is_terminator = 1; + } + } + + for (j = 0; j < 2; ++j) + if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq) + pass_generate_phi_movs(ctx, b->out[j]); + + return 0; +} + +static int +pass_join_values(struct nv_pc_pass *ctx, int iter) +{ + int c, n; + + for (n = 0; n < ctx->num_insns; ++n) { + struct nv_instruction *i = ctx->insns[n]; + + switch (i->opcode) { + case NV_OP_PHI: + if (iter != 2) + break; + for (c = 0; c < 4 && i->src[c]; ++c) + try_join_values(ctx, i->def[0], i->src[c]->value); + break; + case NV_OP_MOV: + if ((iter == 2) && i->src[0]->value->insn && + !nv_is_vector_op(i->src[0]->value->join->insn->opcode)) + try_join_values(ctx, i->def[0], i->src[0]->value); + break; + case NV_OP_SELECT: + if (iter != 1) + break; + for (c = 0; c < 4 && i->src[c]; ++c) { + assert(join_allowed(ctx, i->def[0], i->src[c]->value)); + do_join_values(ctx, i->def[0], i->src[c]->value); + } + break; + case NV_OP_TEX: + case NV_OP_TXB: + case NV_OP_TXL: + case NV_OP_TXQ: + if (iter) + break; + for (c = 0; c < 4; ++c) { + if (!i->src[c]) + break; + do_join_values(ctx, i->def[c], i->src[c]->value); + } + break; + default: + break; + } + } + return 0; +} + +/* Order the instructions so that live intervals can be expressed in numbers. */ +static void +pass_order_instructions(void *priv, struct nv_basic_block *b) +{ + struct nv_pc_pass *ctx = (struct nv_pc_pass *)priv; + struct nv_instruction *i; + + b->pass_seq = ctx->pc->pass_seq; + + assert(!b->exit || !b->exit->next); + for (i = b->phi; i; i = i->next) { + i->serial = ctx->num_insns; + ctx->insns[ctx->num_insns++] = i; + } +} + +static void +bb_live_set_print(struct nv_pc *pc, struct nv_basic_block *b) +{ +#ifdef NV50_RA_DEBUG_LIVE_SETS + int j; + struct nv_value *val; + + debug_printf("LIVE-INs of BB:%i: ", b->id); + + for (j = 0; j < pc->num_values; ++j) { + if (!(b->live_set[j / 32] & (1 << (j % 32)))) + continue; + val = &pc->values[j]; + if (!val->insn) + continue; + debug_printf("%i ", val->n); + } + debug_printf("\n"); +#endif +} + +static INLINE void +live_set_add(struct nv_basic_block *b, struct nv_value *val) +{ + if (!val->insn) /* don't add non-def values */ + return; + b->live_set[val->n / 32] |= 1 << (val->n % 32); +} + +static INLINE void +live_set_rem(struct nv_basic_block *b, struct nv_value *val) +{ + b->live_set[val->n / 32] &= ~(1 << (val->n % 32)); +} + +static INLINE boolean +live_set_test(struct nv_basic_block *b, struct nv_ref *ref) +{ + int n = ref->value->n; + return b->live_set[n / 32] & (1 << (n % 32)); +} + +/* The live set of a block contains those values that are live immediately + * before the beginning of the block, so do a backwards scan. + */ +static int +pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *i; + int j, n, ret = 0; + + if (b->pass_seq >= ctx->pc->pass_seq) + return 0; + b->pass_seq = ctx->pc->pass_seq; + + /* slight hack for undecidedness: set phi = entry if it's undefined */ + if (!b->phi) + b->phi = b->entry; + + for (n = 0; n < 2; ++n) { + if (!b->out[n] || b->out[n] == b) + continue; + ret = pass_build_live_sets(ctx, b->out[n]); + if (ret) + return ret; + + if (n == 0) { + for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j) + b->live_set[j] = b->out[n]->live_set[j]; + } else { + for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j) + b->live_set[j] |= b->out[n]->live_set[j]; + } + } + + if (!b->entry) + return 0; + + bb_live_set_print(ctx->pc, b); + + for (i = b->exit; i != b->entry->prev; i = i->prev) { + for (j = 0; j < 4; j++) { + if (!i->def[j]) + break; + live_set_rem(b, i->def[j]); + } + for (j = 0; j < 4; j++) { + if (!i->src[j]) + break; + live_set_add(b, i->src[j]->value); + } + if (i->src[4]) + live_set_add(b, i->src[4]->value); + if (i->flags_def) + live_set_rem(b, i->flags_def); + if (i->flags_src) + live_set_add(b, i->flags_src->value); + } + for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) + live_set_rem(b, i->def[0]); + + bb_live_set_print(ctx->pc, b); + + return 0; +} + +static void collect_live_values(struct nv_basic_block *b, const int n) +{ + int i; + + if (b->out[0]) { + if (b->out[1]) { /* what to do about back-edges ? */ + for (i = 0; i < n; ++i) + b->live_set[i] = b->out[0]->live_set[i] | b->out[1]->live_set[i]; + } else { + memcpy(b->live_set, b->out[0]->live_set, n * sizeof(uint32_t)); + } + } else + if (b->out[1]) { + memcpy(b->live_set, b->out[1]->live_set, n * sizeof(uint32_t)); + } else { + memset(b->live_set, 0, n * sizeof(uint32_t)); + } +} + +/* NOTE: the live intervals of phi functions start at the first non-phi insn. */ +static int +pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *i, *i_stop; + int j, s; + const int n = (ctx->pc->num_values + 31) / 32; + + /* verify that first block does not have live-in values */ + if (b->num_in == 0) + for (j = 0; j < n; ++j) + assert(b->live_set[j] == 0); + + collect_live_values(b, n); + + /* remove live-outs def'd in a parallel block, hopefully they're all phi'd */ + for (j = 0; j < 2; ++j) { + if (!b->out[j] || !b->out[j]->phi) + continue; + for (i = b->out[j]->phi; i->opcode == NV_OP_PHI; i = i->next) { + live_set_rem(b, i->def[0]); + + for (s = 0; s < 4; ++s) { + if (!i->src[s]) + break; + assert(i->src[s]->value->insn); + if (nvbb_reachable_by(b, i->src[s]->value->insn->bb, b->out[j])) + live_set_add(b, i->src[s]->value); + else + live_set_rem(b, i->src[s]->value); + } + } + } + + /* remaining live-outs are live until the end */ + if (b->exit) { + for (j = 0; j < ctx->pc->num_values; ++j) { + if (!(b->live_set[j / 32] & (1 << (j % 32)))) + continue; + add_range(&ctx->pc->values[j], b, b->exit->serial + 1); +#ifdef NV50_RA_DEBUG_LIVEI + debug_printf("adding range for live value %i: ", j); + livei_print(&ctx->pc->values[j]); +#endif + + } + } + + i_stop = b->entry ? b->entry->prev : NULL; + + /* don't have to include phi functions here (will have 0 live range) */ + for (i = b->exit; i != i_stop; i = i->prev) { + assert(i->serial >= b->phi->serial && i->serial <= b->exit->serial); + for (j = 0; j < 4; ++j) { + if (i->def[j]) + live_set_rem(b, i->def[j]); + } + if (i->flags_def) + live_set_rem(b, i->flags_def); + + for (j = 0; j < 5; ++j) { + if (i->src[j] && !live_set_test(b, i->src[j])) { + live_set_add(b, i->src[j]->value); + add_range(i->src[j]->value, b, i->serial); +#ifdef NV50_RA_DEBUG_LIVEI + debug_printf("adding range for source %i (ends living): ", + i->src[j]->value->n); + livei_print(i->src[j]->value); +#endif + } + } + if (i->flags_src && !live_set_test(b, i->flags_src)) { + live_set_add(b, i->flags_src->value); + add_range(i->flags_src->value, b, i->serial); +#ifdef NV50_RA_DEBUG_LIVEI + debug_printf("adding range for source %i (ends living): ", + i->flags_src->value->n); + livei_print(i->flags_src->value); +#endif + } + } + + b->pass_seq = ctx->pc->pass_seq; + + if (b->out[0] && b->out[0]->pass_seq < ctx->pc->pass_seq) + pass_build_intervals(ctx, b->out[0]); + + if (b->out[1] && b->out[1]->pass_seq < ctx->pc->pass_seq) + pass_build_intervals(ctx, b->out[1]); + + return 0; +} + +static INLINE void +nv50_ctor_register_set(struct nv_pc *pc, struct register_set *set) +{ + memset(set, 0, sizeof(*set)); + + set->last[NV_FILE_GPR] = 255; + set->last[NV_FILE_OUT] = 127; + set->last[NV_FILE_FLAGS] = 4; + set->last[NV_FILE_ADDR] = 4; + + set->pc = pc; +} + +static void +insert_ordered_tail(struct nv_value *list, struct nv_value *nval) +{ + struct nv_value *elem = list->prev; + + for (elem = list->prev; + elem != list && elem->livei->bgn > nval->livei->bgn; + elem = elem->prev); + /* now elem begins before or at the same time as val */ + + nval->prev = elem; + nval->next = elem->next; + elem->next->prev = nval; + elem->next = nval; +} + +static int +pass_linear_scan(struct nv_pc_pass *ctx, int iter) +{ + struct nv_instruction *i; + struct register_set f, free; + int k, n; + struct nv_value *cur, *val, *tmp[2]; + struct nv_value active, inactive, handled, unhandled; + + make_empty_list(&active); + make_empty_list(&inactive); + make_empty_list(&handled); + make_empty_list(&unhandled); + + nv50_ctor_register_set(ctx->pc, &free); + + /* joined values should have range = NULL and thus not be added; + * also, fixed memory values won't be added because they're not + * def'd, just used + */ + for (n = 0; n < ctx->num_insns; ++n) { + i = ctx->insns[n]; + + for (k = 0; k < 4; ++k) { + if (i->def[k] && i->def[k]->livei) + insert_ordered_tail(&unhandled, i->def[k]); + else + if (0 && i->def[k]) + debug_printf("skipping def'd value %i: no livei\n", i->def[k]->n); + } + if (i->flags_def && i->flags_def->livei) + insert_ordered_tail(&unhandled, i->flags_def); + } + + for (val = unhandled.next; val != unhandled.prev; val = val->next) { + assert(val->join == val); + assert(val->livei->bgn <= val->next->livei->bgn); + } + + foreach_s(cur, tmp[0], &unhandled) { + remove_from_list(cur); + + foreach_s(val, tmp[1], &active) { + if (livei_end(val) <= cur->livei->bgn) { + reg_release(&free, val); + move_to_head(&handled, val); + } else + if (!livei_contains(val, cur->livei->bgn)) { + reg_release(&free, val); + move_to_head(&inactive, val); + } + } + + foreach_s(val, tmp[1], &inactive) { + if (livei_end(val) <= cur->livei->bgn) + move_to_head(&handled, val); + else + if (livei_contains(val, cur->livei->bgn)) { + reg_occupy(&free, val); + move_to_head(&active, val); + } + } + + f = free; + + foreach(val, &inactive) + if (livei_have_overlap(val, cur)) + reg_occupy(&f, val); + + foreach(val, &unhandled) + if (val->reg.id >= 0 && livei_have_overlap(val, cur)) + reg_occupy(&f, val); + + if (cur->reg.id < 0) { + boolean mem = FALSE; + + if (nv_is_vector_op(cur->insn->opcode)) + mem = !reg_assign(&f, &cur->insn->def[0], 4); + else + if (iter) + mem = !reg_assign(&f, &cur, 1); + + if (mem) { + NOUVEAU_ERR("out of registers\n"); + abort(); + } + } + insert_at_head(&active, cur); + reg_occupy(&free, cur); + } + + return 0; +} + +static int +nv_pc_pass1(struct nv_pc *pc, struct nv_basic_block *root) +{ + struct nv_pc_pass *ctx; + int i, ret; + + NV50_DBGMSG("REGISTER ALLOCATION - entering\n"); + + ctx = CALLOC_STRUCT(nv_pc_pass); + if (!ctx) + return -1; + ctx->pc = pc; + + ctx->insns = CALLOC(NV_PC_MAX_INSTRUCTIONS, sizeof(struct nv_instruction *)); + if (!ctx->insns) { + FREE(ctx); + return -1; + } + + pc->pass_seq++; + ret = pass_generate_phi_movs(ctx, root); + assert(!ret); + + for (i = 0; i < pc->loop_nesting_bound; ++i) { + pc->pass_seq++; + ret = pass_build_live_sets(ctx, root); + assert(!ret && "live sets"); + if (ret) { + NOUVEAU_ERR("failed to build live sets (iteration %d)\n", i); + goto out; + } + } + + pc->pass_seq++; + nv_pc_pass_in_order(root, pass_order_instructions, ctx); + + pc->pass_seq++; + ret = pass_build_intervals(ctx, root); + assert(!ret && "build intervals"); + if (ret) { + NOUVEAU_ERR("failed to build live intervals\n"); + goto out; + } + +#ifdef NV50_RA_DEBUG_LIVEI + for (i = 0; i < pc->num_values; ++i) + livei_print(&pc->values[i]); +#endif + + ret = pass_join_values(ctx, 0); + if (ret) + goto out; + ret = pass_linear_scan(ctx, 0); + if (ret) + goto out; + ret = pass_join_values(ctx, 1); + if (ret) + goto out; + ret = pass_join_values(ctx, 2); + if (ret) + goto out; + ret = pass_linear_scan(ctx, 1); + if (ret) + goto out; + + for (i = 0; i < pc->num_values; ++i) + livei_release(&pc->values[i]); + + NV50_DBGMSG("REGISTER ALLOCATION - leaving\n"); + +out: + FREE(ctx->insns); + FREE(ctx); + return ret; +} + +int +nv_pc_exec_pass1(struct nv_pc *pc) +{ + int i, ret; + + for (i = 0; i < pc->num_subroutines + 1; ++i) + if (pc->root[i] && (ret = nv_pc_pass1(pc, pc->root[i]))) + return ret; + return 0; +} diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index cec2290481f..b3600f7ba73 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Ben Skeggs + * Copyright 2010 Chrsitoph Bumiller * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -20,4678 +20,638 @@ * SOFTWARE. */ -#include "pipe/p_context.h" -#include "pipe/p_defines.h" -#include "pipe/p_state.h" -#include "util/u_inlines.h" +/* #define NV50_PROGRAM_DEBUG */ + +#include "nv50_program.h" +#include "nv50_pc.h" +#include "nv50_context.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" - -#include "nv50_context.h" -#include "nv50_transfer.h" - -#define NV50_SU_MAX_TEMP 127 -#define NV50_SU_MAX_ADDR 4 -//#define NV50_PROGRAM_DUMP - -/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */ - -/* ARL - gallium craps itself on progs/vp/arl.txt - * - * MSB - Like MAD, but MUL+SUB - * - Fuck it off, introduce a way to negate args for ops that - * support it. - * - * Look into inlining IMMD for ops other than MOV (make it general?) - * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, - * but can emit to P_TEMP first - then MOV later. NVIDIA does this - * - * In ops such as ADD it's possible to construct a bad opcode in the !is_long() - * case, if the emit_src() causes the inst to suddenly become long. - * - * Verify half-insns work where expected - and force disable them where they - * don't work - MUL has it forcibly disabled atm as it fixes POW.. - * - * FUCK! watch dst==src vectors, can overwrite components that are needed. - * ie. SUB R0, R0.yzxw, R0 - * - * Things to check with renouveau: - * FP attr/result assignment - how? - * attrib - * - 0x16bc maps vp output onto fp hpos - * - 0x16c0 maps vp output onto fp col0 - * result - * - colr always 0-3 - * - depr always 4 - * 0x16bc->0x16e8 --> some binding between vp/fp regs - * 0x16b8 --> VP output count - * - * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 - * "MOV rcol.x, fcol.y" = 0x00000004 - * 0x19a8 --> as above but 0x00000100 and 0x00000000 - * - 0x00100000 used when KIL used - * 0x196c --> as above but 0x00000011 and 0x00000000 - * - * 0x1988 --> 0xXXNNNNNN - * - XX == FP high something - */ -struct nv50_reg { - enum { - P_TEMP, - P_ATTR, - P_RESULT, - P_CONST, - P_IMMD, - P_ADDR - } type; - int index; - - int hw; - int mod; - - int rhw; /* result hw for FP outputs, or interpolant index */ - int acc; /* instruction where this reg is last read (first insn == 1) */ - - int vtx; /* vertex index, for GP inputs (TGSI Dimension.Index) */ - int indirect[2]; /* index into pc->addr, or -1 */ - - ubyte buf_index; /* c{0 .. 15}[] or g{0 .. 15}[] */ -}; - -#define NV50_MOD_NEG 1 -#define NV50_MOD_ABS 2 -#define NV50_MOD_NEG_ABS (NV50_MOD_NEG | NV50_MOD_ABS) -#define NV50_MOD_SAT 4 -#define NV50_MOD_I32 8 - -/* NV50_MOD_I32 is used to indicate integer mode for neg/abs */ - -/* STACK: Conditionals and loops have to use the (per warp) stack. - * Stack entries consist of an entry type (divergent path, join at), - * a mask indicating the active threads of the warp, and an address. - * MPs can store 12 stack entries internally, if we need more (and - * we probably do), we have to create a stack buffer in VRAM. - */ -/* impose low limits for now */ -#define NV50_MAX_COND_NESTING 4 -#define NV50_MAX_LOOP_NESTING 3 - -#define JOIN_ON(e) e; pc->p->exec_tail->inst[1] |= 2 - -struct nv50_pc { - struct nv50_program *p; - - /* hw resources */ - struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; - struct nv50_reg r_addr[NV50_SU_MAX_ADDR]; - - /* tgsi resources */ - struct nv50_reg *temp; - int temp_nr; - struct nv50_reg *attr; - int attr_nr; - struct nv50_reg *result; - int result_nr; - struct nv50_reg *param; - int param_nr; - struct nv50_reg *immd; - uint32_t *immd_buf; - int immd_nr; - struct nv50_reg **addr; - int addr_nr; - struct nv50_reg *sysval; - int sysval_nr; - - struct nv50_reg *temp_temp[16]; - struct nv50_program_exec *temp_temp_exec[16]; - unsigned temp_temp_nr; - - /* broadcast and destination replacement regs */ - struct nv50_reg *r_brdc; - struct nv50_reg *r_dst[4]; - - struct nv50_reg reg_instances[16]; - unsigned reg_instance_nr; - - unsigned interp_mode[32]; - /* perspective interpolation registers */ - struct nv50_reg *iv_p; - struct nv50_reg *iv_c; - - struct nv50_program_exec *if_insn[NV50_MAX_COND_NESTING]; - struct nv50_program_exec *if_join[NV50_MAX_COND_NESTING]; - struct nv50_program_exec *loop_brka[NV50_MAX_LOOP_NESTING]; - int if_lvl, loop_lvl; - unsigned loop_pos[NV50_MAX_LOOP_NESTING]; - - unsigned *insn_pos; /* actual program offset of each TGSI insn */ - boolean in_subroutine; - - /* current instruction and total number of insns */ - unsigned insn_cur; - unsigned insn_nr; - - boolean allow32; - - uint8_t edgeflag_out; -}; - -static struct nv50_reg *get_address_reg(struct nv50_pc *, struct nv50_reg *); - -static INLINE void -ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw) -{ - reg->type = type; - reg->index = index; - reg->hw = hw; - reg->mod = 0; - reg->rhw = -1; - reg->vtx = -1; - reg->acc = 0; - reg->indirect[0] = reg->indirect[1] = -1; - reg->buf_index = (type == P_CONST) ? 1 : 0; -} +#include "tgsi/tgsi_dump.h" static INLINE unsigned -popcnt4(uint32_t val) -{ - static const unsigned cnt[16] - = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; - return cnt[val & 0xf]; -} - -static void -terminate_mbb(struct nv50_pc *pc) +bitcount4(const uint32_t val) { - int i; - - /* remove records of temporary address register values */ - for (i = 0; i < NV50_SU_MAX_ADDR; ++i) - if (pc->r_addr[i].index < 0) - pc->r_addr[i].acc = 0; + static const unsigned cnt[16] + = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; + return cnt[val & 0xf]; } -static void -alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) -{ - int i = 0; - - if (reg->type == P_RESULT) { - if (pc->p->cfg.high_result < (reg->hw + 1)) - pc->p->cfg.high_result = reg->hw + 1; - } - - if (reg->type != P_TEMP) - return; - - if (reg->hw >= 0) { - /*XXX: do this here too to catch FP temp-as-attr usage.. - * not clean, but works */ - if (pc->p->cfg.high_temp < (reg->hw + 1)) - pc->p->cfg.high_temp = reg->hw + 1; - return; - } - - if (reg->rhw != -1) { - /* try to allocate temporary with index rhw first */ - if (!(pc->r_temp[reg->rhw])) { - pc->r_temp[reg->rhw] = reg; - reg->hw = reg->rhw; - if (pc->p->cfg.high_temp < (reg->rhw + 1)) - pc->p->cfg.high_temp = reg->rhw + 1; - return; - } - /* make sure we don't get things like $r0 needs to go - * in $r1 and $r1 in $r0 - */ - i = pc->result_nr * 4; - } - - for (; i < NV50_SU_MAX_TEMP; i++) { - if (!(pc->r_temp[i])) { - pc->r_temp[i] = reg; - reg->hw = i; - if (pc->p->cfg.high_temp < (i + 1)) - pc->p->cfg.high_temp = i + 1; - return; - } - } - - NOUVEAU_ERR("out of registers\n"); - abort(); -} - -static INLINE struct nv50_reg * -reg_instance(struct nv50_pc *pc, struct nv50_reg *reg) -{ - struct nv50_reg *ri; - - assert(pc->reg_instance_nr < 16); - ri = &pc->reg_instances[pc->reg_instance_nr++]; - if (reg) { - alloc_reg(pc, reg); - *ri = *reg; - reg->indirect[0] = reg->indirect[1] = -1; - reg->mod = 0; - } - return ri; +static unsigned +nv50_tgsi_src_mask(const struct tgsi_full_instruction *inst, int c) +{ + unsigned mask = inst->Dst[0].Register.WriteMask; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_COS: + case TGSI_OPCODE_SIN: + return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); + case TGSI_OPCODE_DP3: + return 0x7; + case TGSI_OPCODE_DP4: + case TGSI_OPCODE_DPH: + case TGSI_OPCODE_KIL: /* WriteMask ignored */ + return 0xf; + case TGSI_OPCODE_DST: + return mask & (c ? 0xa : 0x6); + case TGSI_OPCODE_EX2: + case TGSI_OPCODE_EXP: + case TGSI_OPCODE_LG2: + case TGSI_OPCODE_LOG: + case TGSI_OPCODE_POW: + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_RSQ: + case TGSI_OPCODE_SCS: + return 0x1; + case TGSI_OPCODE_IF: + return 0x1; + case TGSI_OPCODE_LIT: + return 0xb; + case TGSI_OPCODE_TEX: + case TGSI_OPCODE_TXB: + case TGSI_OPCODE_TXL: + case TGSI_OPCODE_TXP: + { + const struct tgsi_instruction_texture *tex; + + assert(inst->Instruction.Texture); + tex = &inst->Texture; + + mask = 0x7; + if (inst->Instruction.Opcode != TGSI_OPCODE_TEX && + inst->Instruction.Opcode != TGSI_OPCODE_TXD) + mask |= 0x8; /* bias, lod or proj */ + + switch (tex->Texture) { + case TGSI_TEXTURE_1D: + mask &= 0x9; + break; + case TGSI_TEXTURE_SHADOW1D: + mask &= 0x5; + break; + case TGSI_TEXTURE_2D: + mask &= 0xb; + break; + default: + break; + } + } + return mask; + case TGSI_OPCODE_XPD: + { + unsigned x = 0; + if (mask & 1) x |= 0x6; + if (mask & 2) x |= 0x5; + if (mask & 4) x |= 0x3; + return x; + } + default: + break; + } + + return mask; +} + +static void +nv50_indirect_inputs(struct nv50_translation_info *ti, int id) +{ + int i, c; + + for (i = 0; i < PIPE_MAX_SHADER_INPUTS; ++i) + for (c = 0; c < 4; ++c) + ti->input_access[i][c] = id; + + ti->indirect_inputs = TRUE; +} + +static void +nv50_indirect_outputs(struct nv50_translation_info *ti, int id) +{ + int i, c; + + for (i = 0; i < PIPE_MAX_SHADER_OUTPUTS; ++i) + for (c = 0; c < 4; ++c) + ti->output_access[i][c] = id; + + ti->indirect_outputs = TRUE; +} + +static void +prog_inst(struct nv50_translation_info *ti, + const struct tgsi_full_instruction *inst, int id) +{ + const struct tgsi_dst_register *dst; + const struct tgsi_src_register *src; + int s, c, k; + unsigned mask; + + if (inst->Instruction.Opcode == TGSI_OPCODE_BGNSUB) { + ti->subr[ti->subr_nr].pos = id - 1; + ti->subr[ti->subr_nr].id = ti->subr_nr + 1; /* id 0 is main program */ + ++ti->subr_nr; + } + + if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) { + dst = &inst->Dst[0].Register; + + for (c = 0; c < 4; ++c) { + if (dst->Indirect) + nv50_indirect_outputs(ti, id); + if (!(dst->WriteMask & (1 << c))) + continue; + ti->output_access[dst->Index][c] = id; + } + + if (inst->Instruction.Opcode == TGSI_OPCODE_MOV && + inst->Src[0].Register.File == TGSI_FILE_INPUT && + dst->Index == ti->edgeflag_out) + ti->p->vp.edgeflag = inst->Src[0].Register.Index; + } else + if (inst->Dst[0].Register.File == TGSI_FILE_TEMPORARY) { + if (inst->Dst[0].Register.Indirect) + ti->store_to_memory = TRUE; + } + + for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) { + src = &inst->Src[s].Register; + if (src->File == TGSI_FILE_TEMPORARY) + if (inst->Src[s].Register.Indirect) + ti->store_to_memory = TRUE; + if (src->File != TGSI_FILE_INPUT) + continue; + mask = nv50_tgsi_src_mask(inst, s); + + if (inst->Src[s].Register.Indirect) + nv50_indirect_inputs(ti, id); + + for (c = 0; c < 4; ++c) { + if (!(mask & (1 << c))) + continue; + k = tgsi_util_get_full_src_register_swizzle(&inst->Src[s], c); + if (k <= TGSI_SWIZZLE_W) + ti->input_access[src->Index][k] = id; + } + } } -/* XXX: For shaders that aren't executed linearly (e.g. shaders that - * contain loops), we need to assign all hw regs to TGSI TEMPs early, - * lest we risk temp_temps overwriting regs alloc'd "later". +/* Probably should introduce something like struct tgsi_function_declaration + * instead of trying to guess inputs/outputs. */ -static struct nv50_reg * -alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) -{ - struct nv50_reg *r; - int i; - - if (dst && dst->type == P_TEMP && dst->hw == -1) - return dst; - - for (i = 0; i < NV50_SU_MAX_TEMP; i++) { - if (!pc->r_temp[i]) { - r = MALLOC_STRUCT(nv50_reg); - ctor_reg(r, P_TEMP, -1, i); - pc->r_temp[i] = r; - return r; - } - } - - NOUVEAU_ERR("out of registers\n"); - abort(); - return NULL; -} - -#if 0 -/* release the hardware resource held by r */ -static void -release_hw(struct nv50_pc *pc, struct nv50_reg *r) -{ - assert(r->type == P_TEMP); - if (r->hw == -1) - return; - - assert(pc->r_temp[r->hw] == r); - pc->r_temp[r->hw] = NULL; - - r->acc = 0; - if (r->index == -1) - FREE(r); -} -#endif - static void -free_temp(struct nv50_pc *pc, struct nv50_reg *r) +prog_subroutine_inst(struct nv50_subroutine *subr, + const struct tgsi_full_instruction *inst) { - if (r->index == -1) { - unsigned hw = r->hw; + const struct tgsi_dst_register *dst; + const struct tgsi_src_register *src; + int s, c, k; + unsigned mask; - FREE(pc->r_temp[hw]); - pc->r_temp[hw] = NULL; - } -} + for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) { + src = &inst->Src[s].Register; + if (src->File != TGSI_FILE_TEMPORARY) + continue; + mask = nv50_tgsi_src_mask(inst, s); -static int -alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) -{ - int i; + assert(!inst->Src[s].Register.Indirect); - if ((idx + 4) >= NV50_SU_MAX_TEMP) - return 1; + for (c = 0; c < 4; ++c) { + k = tgsi_util_get_full_src_register_swizzle(&inst->Src[s], c); - if (pc->r_temp[idx] || pc->r_temp[idx + 1] || - pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) - return alloc_temp4(pc, dst, idx + 4); + if ((mask & (1 << c)) && k < TGSI_SWIZZLE_W) + if (!(subr->retv[src->Index / 32][k] & (1 << (src->Index % 32)))) + subr->argv[src->Index / 32][k] |= 1 << (src->Index % 32); + } + } - for (i = 0; i < 4; i++) { - dst[i] = MALLOC_STRUCT(nv50_reg); - ctor_reg(dst[i], P_TEMP, -1, idx + i); - pc->r_temp[idx + i] = dst[i]; - } + if (inst->Dst[0].Register.File == TGSI_FILE_TEMPORARY) { + dst = &inst->Dst[0].Register; - return 0; + for (c = 0; c < 4; ++c) + if (dst->WriteMask & (1 << c)) + subr->retv[dst->Index / 32][c] |= 1 << (dst->Index % 32); + } } static void -free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) +prog_immediate(struct nv50_translation_info *ti, + const struct tgsi_full_immediate *imm) { - int i; + int c; + unsigned n = ti->immd32_nr++; - for (i = 0; i < 4; i++) - free_temp(pc, reg[i]); -} + assert(ti->immd32_nr <= ti->scan.immediate_count); -static struct nv50_reg * -temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e) -{ - if (pc->temp_temp_nr >= 16) - assert(0); + for (c = 0; c < 4; ++c) + ti->immd32[n * 4 + c] = imm->u[c].Uint; - pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL); - pc->temp_temp_exec[pc->temp_temp_nr] = e; - return pc->temp_temp[pc->temp_temp_nr++]; + ti->immd32_ty[n] = imm->Immediate.DataType; } -/* This *must* be called for all nv50_program_exec that have been - * given as argument to temp_temp, or the temps will be leaked ! - */ -static void -kill_temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e) -{ - int i; - - for (i = 0; i < pc->temp_temp_nr; i++) - if (pc->temp_temp_exec[i] == e) - free_temp(pc, pc->temp_temp[i]); - if (!e) - pc->temp_temp_nr = 0; +static INLINE unsigned +translate_interpolate(const struct tgsi_full_declaration *decl) +{ + unsigned mode; + + if (decl->Declaration.Interpolate == TGSI_INTERPOLATE_CONSTANT) + mode = NV50_INTERP_FLAT; + else + if (decl->Declaration.Interpolate == TGSI_INTERPOLATE_PERSPECTIVE) + mode = 0; + else + mode = NV50_INTERP_LINEAR; + + if (decl->Declaration.Centroid) + mode |= NV50_INTERP_CENTROID; + + return mode; +} + +static void +prog_decl(struct nv50_translation_info *ti, + const struct tgsi_full_declaration *decl) +{ + unsigned i, first, last, sn = 0, si = 0; + + first = decl->Range.First; + last = decl->Range.Last; + + if (decl->Declaration.Semantic) { + sn = decl->Semantic.Name; + si = decl->Semantic.Index; + } + + switch (decl->Declaration.File) { + case TGSI_FILE_INPUT: + for (i = first; i <= last; ++i) + ti->interp_mode[i] = translate_interpolate(decl); + + if (!decl->Declaration.Semantic) + break; + + for (i = first; i <= last; ++i) { + ti->p->in[i].sn = sn; + ti->p->in[i].si = si; + } + + switch (sn) { + case TGSI_SEMANTIC_FACE: + break; + case TGSI_SEMANTIC_COLOR: + if (ti->p->type == PIPE_SHADER_FRAGMENT) + ti->p->vp.bfc[si] = first; + break; + } + break; + case TGSI_FILE_OUTPUT: + if (!decl->Declaration.Semantic) + break; + + for (i = first; i <= last; ++i) { + ti->p->out[i].sn = sn; + ti->p->out[i].si = si; + } + + switch (sn) { + case TGSI_SEMANTIC_BCOLOR: + ti->p->vp.bfc[si] = first; + break; + case TGSI_SEMANTIC_PSIZE: + ti->p->vp.psiz = first; + break; + case TGSI_SEMANTIC_EDGEFLAG: + ti->edgeflag_out = first; + break; + default: + break; + } + break; + case TGSI_FILE_SYSTEM_VALUE: + switch (decl->Semantic.Name) { + case TGSI_SEMANTIC_FACE: + break; + case TGSI_SEMANTIC_INSTANCEID: + break; + case TGSI_SEMANTIC_PRIMID: + break; + /* + case TGSI_SEMANTIC_PRIMIDIN: + break; + case TGSI_SEMANTIC_VERTEXID: + break; + */ + default: + break; + } + break; + case TGSI_FILE_CONSTANT: + ti->p->parm_size = MAX2(ti->p->parm_size, (last + 1) * 16); + break; + case TGSI_FILE_ADDRESS: + case TGSI_FILE_SAMPLER: + case TGSI_FILE_TEMPORARY: + break; + default: + assert(0); + break; + } } static int -ctor_immd_4u32(struct nv50_pc *pc, - uint32_t x, uint32_t y, uint32_t z, uint32_t w) -{ - unsigned size = pc->immd_nr * 4 * sizeof(uint32_t); - - pc->immd_buf = REALLOC(pc->immd_buf, size, size + 4 * sizeof(uint32_t)); - - pc->immd_buf[(pc->immd_nr * 4) + 0] = x; - pc->immd_buf[(pc->immd_nr * 4) + 1] = y; - pc->immd_buf[(pc->immd_nr * 4) + 2] = z; - pc->immd_buf[(pc->immd_nr * 4) + 3] = w; - - return pc->immd_nr++; -} - -static INLINE int -ctor_immd_4f32(struct nv50_pc *pc, float x, float y, float z, float w) -{ - return ctor_immd_4u32(pc, fui(x), fui(y), fui(z), fui(w)); -} - -static struct nv50_reg * -alloc_immd(struct nv50_pc *pc, float f) -{ - struct nv50_reg *r = MALLOC_STRUCT(nv50_reg); - unsigned hw; - - for (hw = 0; hw < pc->immd_nr * 4; hw++) - if (pc->immd_buf[hw] == fui(f)) - break; - - if (hw == pc->immd_nr * 4) - hw = ctor_immd_4f32(pc, f, -f, 0.5 * f, 0) * 4; - - ctor_reg(r, P_IMMD, -1, hw); - return r; -} - -static struct nv50_program_exec * -exec(struct nv50_pc *pc) -{ - struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); - - e->param.index = -1; - return e; -} - -static void -emit(struct nv50_pc *pc, struct nv50_program_exec *e) -{ - struct nv50_program *p = pc->p; - - if (p->exec_tail) - p->exec_tail->next = e; - if (!p->exec_head) - p->exec_head = e; - p->exec_tail = e; - p->exec_size += (e->inst[0] & 1) ? 2 : 1; - - kill_temp_temp(pc, e); -} - -static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); - -static boolean -is_long(struct nv50_program_exec *e) -{ - if (e->inst[0] & 1) - return TRUE; - return FALSE; -} - -static boolean -is_immd(struct nv50_program_exec *e) -{ - if (is_long(e) && (e->inst[1] & 3) == 3) - return TRUE; - return FALSE; -} - -static boolean -is_join(struct nv50_program_exec *e) -{ - if (is_long(e) && (e->inst[1] & 3) == 2) - return TRUE; - return FALSE; -} - -static INLINE boolean -is_control_flow(struct nv50_program_exec *e) -{ - return (e->inst[0] & 2); -} - -static INLINE void -set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, - struct nv50_program_exec *e) -{ - assert(!is_immd(e)); - set_long(pc, e); - e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); - e->inst[1] |= (pred << 7) | (idx << 12); -} - -static INLINE void -set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, - struct nv50_program_exec *e) -{ - set_long(pc, e); - e->inst[1] &= ~((0x3 << 4) | (1 << 6)); - e->inst[1] |= (idx << 4) | (on << 6); -} - -static INLINE void -set_long(struct nv50_pc *pc, struct nv50_program_exec *e) -{ - if (is_long(e)) - return; - - e->inst[0] |= 1; - set_pred(pc, 0xf, 0, e); - set_pred_wr(pc, 0, 0, e); -} - -static INLINE void -set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) -{ - if (dst->type == P_RESULT) { - set_long(pc, e); - e->inst[1] |= 0x00000008; - } - - alloc_reg(pc, dst); - if (dst->hw > 63) - set_long(pc, e); - e->inst[0] |= (dst->hw << 2); -} - -static INLINE void -set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) -{ - set_long(pc, e); - /* XXX: can't be predicated - bits overlap; cases where both - * are required should be avoided by using pc->allow32 */ - set_pred(pc, 0, 0, e); - set_pred_wr(pc, 0, 0, e); - - e->inst[1] |= 0x00000002 | 0x00000001; - e->inst[0] |= (pc->immd_buf[imm->hw] & 0x3f) << 16; - e->inst[1] |= (pc->immd_buf[imm->hw] >> 6) << 2; -} - -static INLINE void -set_addr(struct nv50_program_exec *e, struct nv50_reg *a) -{ - assert(a->type == P_ADDR); - - assert(!(e->inst[0] & 0x0c000000)); - assert(!(e->inst[1] & 0x00000004)); - - e->inst[0] |= (a->hw & 3) << 26; - e->inst[1] |= a->hw & 4; -} - -static void -emit_arl(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, uint8_t); - -static void -emit_shl_imm(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, int); - -static void -emit_mov_from_addr(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[1] = 0x40000000; - set_long(pc, e); - set_dst(pc, dst, e); - set_addr(e, src); - - emit(pc, e); -} - -static void -emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, uint16_t src1_val) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xd0000000 | (src1_val << 9); - e->inst[1] = 0x20000000; - set_long(pc, e); - e->inst[0] |= dst->hw << 2; - if (src0) /* otherwise will add to $a0, which is always 0 */ - set_addr(e, src0); - - emit(pc, e); -} - -#define INTERP_LINEAR 0 -#define INTERP_FLAT 1 -#define INTERP_PERSPECTIVE 2 -#define INTERP_CENTROID 4 - -/* interpolant index has been stored in dst->rhw */ -static void -emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, - unsigned mode) -{ - struct nv50_program_exec *e = exec(pc); - assert(dst->rhw != -1); - - e->inst[0] |= 0x80000000; - set_dst(pc, dst, e); - e->inst[0] |= (dst->rhw << 16); - - if (mode & INTERP_FLAT) { - e->inst[0] |= (1 << 8); - } else { - if (mode & INTERP_PERSPECTIVE) { - e->inst[0] |= (1 << 25); - alloc_reg(pc, iv); - e->inst[0] |= (iv->hw << 9); - } - - if (mode & INTERP_CENTROID) - e->inst[0] |= (1 << 24); - } - - emit(pc, e); -} - -static void -set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, - struct nv50_program_exec *e) -{ - set_long(pc, e); - - e->param.index = src->hw & 127; - e->param.shift = s; - e->param.mask = m << (s % 32); - - if (src->hw < 0 || src->hw > 127) /* need (additional) address reg */ - set_addr(e, get_address_reg(pc, src)); - else - if (src->acc < 0) { - assert(src->type == P_CONST); - set_addr(e, pc->addr[src->indirect[0]]); - } - - e->inst[1] |= (src->buf_index << 22); -} - -/* Never apply nv50_reg::mod in emit_mov, or carefully check the code !!! */ -static void -emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x10000000; - if (!pc->allow32) - set_long(pc, e); - - set_dst(pc, dst, e); - - if (!is_long(e) && src->type == P_IMMD) { - set_immd(pc, src, e); - /*XXX: 32-bit, but steals part of "half" reg space - need to - * catch and handle this case if/when we do half-regs - */ - } else - if (src->type == P_IMMD || src->type == P_CONST) { - set_long(pc, e); - set_data(pc, src, 0x7f, 9, e); - e->inst[1] |= 0x20000000; /* mov from c[] */ - } else { - if (src->type == P_ATTR) { - set_long(pc, e); - e->inst[1] |= 0x00200000; - - if (src->vtx >= 0) { - /* indirect (vertex base + c) load from p[] */ - e->inst[0] |= 0x01800000; - set_addr(e, get_address_reg(pc, src)); - } - } - - alloc_reg(pc, src); - if (src->hw > 63) - set_long(pc, e); - e->inst[0] |= (src->hw << 9); - } - - if (is_long(e) && !is_immd(e)) { - e->inst[1] |= 0x04000000; /* 32-bit */ - e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */ - if (!(e->inst[1] & 0x20000000)) - e->inst[1] |= 0x00030000; /* lane mask 2:3 */ - } else - e->inst[0] |= 0x00008000; - - emit(pc, e); -} - -static INLINE void -emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f) -{ - struct nv50_reg *imm = alloc_immd(pc, f); - emit_mov(pc, dst, imm); - FREE(imm); -} - -/* Assign the hw of the discarded temporary register src - * to the tgsi register dst and free src. - */ -static void -assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - assert(src->index == -1 && src->hw != -1); - - if (pc->if_lvl || pc->loop_lvl || - (dst->type != P_TEMP) || - (src->hw < pc->result_nr * 4 && - pc->p->type == PIPE_SHADER_FRAGMENT) || - pc->p->info.opcode_count[TGSI_OPCODE_CAL] || - pc->p->info.opcode_count[TGSI_OPCODE_BRA]) { - - emit_mov(pc, dst, src); - free_temp(pc, src); - return; - } - - if (dst->hw != -1) - pc->r_temp[dst->hw] = NULL; - pc->r_temp[src->hw] = dst; - dst->hw = src->hw; - - FREE(src); -} - -static void -emit_nop(struct nv50_pc *pc) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xf0000000; - set_long(pc, e); - e->inst[1] = 0xe0000000; - emit(pc, e); -} - -static boolean -check_swap_src_0_1(struct nv50_pc *pc, - struct nv50_reg **s0, struct nv50_reg **s1) -{ - struct nv50_reg *src0 = *s0, *src1 = *s1; - - if (src0->type == P_CONST) { - if (src1->type != P_CONST) { - *s0 = src1; - *s1 = src0; - return TRUE; - } - } else - if (src1->type == P_ATTR) { - if (src0->type != P_ATTR) { - *s0 = src1; - *s1 = src0; - return TRUE; - } - } - - return FALSE; -} - -static void -set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src, - struct nv50_program_exec *e) -{ - struct nv50_reg *temp; - - if (src->type != P_TEMP) { - temp = temp_temp(pc, e); - emit_mov(pc, temp, src); - src = temp; - } - - alloc_reg(pc, src); - if (src->hw > 63) - set_long(pc, e); - e->inst[0] |= (src->hw << 9); -} - -static void -set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) -{ - if (src->type == P_ATTR) { - set_long(pc, e); - e->inst[1] |= 0x00200000; - - if (src->vtx >= 0) { - e->inst[0] |= 0x01800000; /* src from p[] */ - set_addr(e, get_address_reg(pc, src)); - } - } else - if (src->type == P_CONST || src->type == P_IMMD) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } - - alloc_reg(pc, src); - if (src->hw > 63) - set_long(pc, e); - e->inst[0] |= (src->hw << 9); -} - -static void -set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) -{ - if (src->type == P_ATTR) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } else - if (src->type == P_CONST || src->type == P_IMMD) { - if (e->inst[0] & 0x01800000) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } else { - assert(!(e->inst[0] & 0x00800000)); - set_data(pc, src, 0x7f, 16, e); - e->inst[0] |= 0x00800000; - } - } - - alloc_reg(pc, src); - if (src->hw > 63) - set_long(pc, e); - e->inst[0] |= ((src->hw & 127) << 16); -} - -static void -set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) -{ - set_long(pc, e); - - if (src->type == P_ATTR) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } else - if (src->type == P_CONST || src->type == P_IMMD) { - if (e->inst[0] & 0x01800000) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } else { - assert(!(e->inst[0] & 0x01000000)); - set_data(pc, src, 0x7f, 32+14, e); - e->inst[0] |= 0x01000000; - } - } - - alloc_reg(pc, src); - e->inst[1] |= ((src->hw & 127) << 14); -} - -static void -set_half_src(struct nv50_pc *pc, struct nv50_reg *src, int lh, - struct nv50_program_exec *e, int pos) -{ - struct nv50_reg *r = src; - - alloc_reg(pc, r); - if (r->type != P_TEMP) { - r = temp_temp(pc, e); - emit_mov(pc, r, src); - } - - if (r->hw > (NV50_SU_MAX_TEMP / 2)) { - NOUVEAU_ERR("out of low GPRs\n"); - abort(); - } - - e->inst[pos / 32] |= ((src->hw * 2) + lh) << (pos % 32); -} - -#if 0 -static void -emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred) -{ - struct nv50_program_exec *e = exec(pc); - - assert(dst->type == P_TEMP); - e->inst[1] = 0x20000000 | (pred << 12); - set_long(pc, e); - set_dst(pc, dst, e); - - emit(pc, e); -} -#endif - -static void -emit_mov_to_pred(struct nv50_pc *pc, int pred, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x000001fc; - e->inst[1] = 0xa0000008; - set_long(pc, e); - set_pred_wr(pc, 1, pred, e); - set_src_0_restricted(pc, src, e); - - emit(pc, e); -} - -static void -emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] |= 0xc0000000; - - if (!pc->allow32) - set_long(pc, e); - - check_swap_src_0_1(pc, &src0, &src1); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - if (src1->type == P_IMMD && !is_long(e)) { - if (src0->mod ^ src1->mod) - e->inst[0] |= 0x00008000; - set_immd(pc, src1, e); - } else { - set_src_1(pc, src1, e); - if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) { - if (is_long(e)) - e->inst[1] |= 0x08000000; - else - e->inst[0] |= 0x00008000; - } - } - - emit(pc, e); -} - -static void -emit_add(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, struct nv50_reg *src1) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xb0000000; - - alloc_reg(pc, src1); - check_swap_src_0_1(pc, &src0, &src1); - - if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) { - set_long(pc, e); - e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) | - ((src1->mod & NV50_MOD_NEG) << 27); - } - - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e)) - set_src_2(pc, src1, e); - else - if (src1->type == P_IMMD) - set_immd(pc, src1, e); - else - set_src_1(pc, src1, e); - - emit(pc, e); -} - -static void -emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, - uint8_t s) -{ - struct nv50_program_exec *e = exec(pc); - - set_long(pc, e); - e->inst[1] |= 0xc0000000; - - e->inst[0] |= dst->hw << 2; - e->inst[0] |= s << 16; /* shift left */ - set_src_0(pc, src, e); - - emit(pc, e); -} - -static boolean -address_reg_suitable(struct nv50_reg *a, struct nv50_reg *r) -{ - if (!r) - return FALSE; - - if (r->vtx != a->vtx) - return FALSE; - if (r->vtx >= 0) - return (r->indirect[1] == a->indirect[1]); - - if (r->hw < a->rhw || (r->hw - a->rhw) >= 128) - return FALSE; - - if (a->index >= 0) - return (a->index == r->indirect[0]); - return (a->indirect[0] == r->indirect[0]); -} - -static void -load_vertex_base(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *a, int shift) -{ - struct nv50_reg mem, *temp; - - ctor_reg(&mem, P_ATTR, -1, dst->vtx); - - assert(dst->type == P_ADDR); - if (!a) { - emit_arl(pc, dst, &mem, 0); - return; - } - temp = alloc_temp(pc, NULL); - - if (shift) { - emit_mov_from_addr(pc, temp, a); - if (shift < 0) - emit_shl_imm(pc, temp, temp, shift); - emit_arl(pc, dst, temp, MAX2(shift, 0)); - } - emit_mov(pc, temp, &mem); - set_addr(pc->p->exec_tail, dst); - - emit_arl(pc, dst, temp, 0); - free_temp(pc, temp); -} - -/* case (ref == NULL): allocate address register for TGSI_FILE_ADDRESS - * case (vtx >= 0, acc >= 0): load vertex base from a[vtx * 4] to $aX - * case (vtx >= 0, acc < 0): load vertex base from s[$aY + vtx * 4] to $aX - * case (vtx < 0, acc >= 0): memory address too high to encode - * case (vtx < 0, acc < 0): get source register for TGSI_FILE_ADDRESS - */ -static struct nv50_reg * -get_address_reg(struct nv50_pc *pc, struct nv50_reg *ref) -{ - int i; - struct nv50_reg *a_ref, *a = NULL; - - for (i = 0; i < NV50_SU_MAX_ADDR; ++i) { - if (pc->r_addr[i].acc == 0) - a = &pc->r_addr[i]; /* an unused address reg */ - else - if (address_reg_suitable(&pc->r_addr[i], ref)) { - pc->r_addr[i].acc = pc->insn_cur; - return &pc->r_addr[i]; - } else - if (!a && pc->r_addr[i].index < 0 && - pc->r_addr[i].acc < pc->insn_cur) - a = &pc->r_addr[i]; - } - if (!a) { - /* We'll be able to spill address regs when this - * mess is replaced with a proper compiler ... - */ - NOUVEAU_ERR("out of address regs\n"); - abort(); - return NULL; - } - - /* initialize and reserve for this TGSI instruction */ - a->rhw = 0; - a->index = a->indirect[0] = a->indirect[1] = -1; - a->acc = pc->insn_cur; - - if (!ref) { - a->vtx = -1; - return a; - } - a->vtx = ref->vtx; - - /* now put in the correct value ... */ - - if (ref->vtx >= 0) { - a->indirect[1] = ref->indirect[1]; - - /* For an indirect vertex index, we need to shift address right - * by 2, the address register will contain vtx * 16, we need to - * load from a[vtx * 4]. - */ - load_vertex_base(pc, a, (ref->acc < 0) ? - pc->addr[ref->indirect[1]] : NULL, -2); - } else { - assert(ref->acc < 0 || ref->indirect[0] < 0); - - a->rhw = ref->hw & ~0x7f; - a->indirect[0] = ref->indirect[0]; - a_ref = (ref->acc < 0) ? pc->addr[ref->indirect[0]] : NULL; - - emit_add_addr_imm(pc, a, a_ref, a->rhw * 4); - } - return a; -} - -#define NV50_MAX_F32 0x880 -#define NV50_MAX_S32 0x08c -#define NV50_MAX_U32 0x084 -#define NV50_MIN_F32 0x8a0 -#define NV50_MIN_S32 0x0ac -#define NV50_MIN_U32 0x0a4 - -static void -emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, - struct nv50_reg *src0, struct nv50_reg *src1) -{ - struct nv50_program_exec *e = exec(pc); - - set_long(pc, e); - e->inst[0] |= 0x30000000 | ((sub & 0x800) << 20); - e->inst[1] |= (sub << 24); - - check_swap_src_0_1(pc, &src0, &src1); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - set_src_1(pc, src1, e); - - if (src0->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00100000; - if (src1->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00080000; - - emit(pc, e); -} - -static INLINE void -emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1) -{ - src1->mod ^= NV50_MOD_NEG; - emit_add(pc, dst, src0, src1); - src1->mod ^= NV50_MOD_NEG; -} - -static void -emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1, unsigned op) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xd0000000; - set_long(pc, e); - - check_swap_src_0_1(pc, &src0, &src1); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - - if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR && - op != TGSI_OPCODE_XOR) - assert(!"invalid bit op"); - - assert(!(src0->mod | src1->mod)); - - if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) { - set_immd(pc, src1, e); - if (op == TGSI_OPCODE_OR) - e->inst[0] |= 0x0100; - else - if (op == TGSI_OPCODE_XOR) - e->inst[0] |= 0x8000; - } else { - set_src_1(pc, src1, e); - e->inst[1] |= 0x04000000; /* 32 bit */ - if (op == TGSI_OPCODE_OR) - e->inst[1] |= 0x4000; - else - if (op == TGSI_OPCODE_XOR) - e->inst[1] |= 0x8000; - } - - emit(pc, e); -} - -static void -emit_not(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xd0000000; - e->inst[1] = 0x0402c000; - set_long(pc, e); - set_dst(pc, dst, e); - set_src_1(pc, src, e); - - emit(pc, e); -} - -static void -emit_shift(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, struct nv50_reg *src1, unsigned dir) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x30000000; - e->inst[1] = 0xc4000000; - - set_long(pc, e); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - - if (src1->type == P_IMMD) { - e->inst[1] |= (1 << 20); - e->inst[0] |= (pc->immd_buf[src1->hw] & 0x7f) << 16; - } else - set_src_1(pc, src1, e); - - if (dir != TGSI_OPCODE_SHL) - e->inst[1] |= (1 << 29); - - if (dir == TGSI_OPCODE_ISHR) - e->inst[1] |= (1 << 27); - - emit(pc, e); -} - -static void -emit_shl_imm(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src, int s) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x30000000; - e->inst[1] = 0xc4100000; - if (s < 0) { - e->inst[1] |= 1 << 29; - s = -s; - } - e->inst[1] |= ((s & 0x7f) << 16); - - set_long(pc, e); - set_dst(pc, dst, e); - set_src_0(pc, src, e); - - emit(pc, e); -} - -static void -emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1, struct nv50_reg *src2) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] |= 0xe0000000; - - check_swap_src_0_1(pc, &src0, &src1); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - set_src_1(pc, src1, e); - set_src_2(pc, src2, e); - - if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) - e->inst[1] |= 0x04000000; - if (src2->mod & NV50_MOD_NEG) - e->inst[1] |= 0x08000000; - - emit(pc, e); -} - -static INLINE void -emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1, struct nv50_reg *src2) -{ - src2->mod ^= NV50_MOD_NEG; - emit_mad(pc, dst, src0, src1, src2); - src2->mod ^= NV50_MOD_NEG; -} - -#define NV50_FLOP_RCP 0 -#define NV50_FLOP_RSQ 2 -#define NV50_FLOP_LG2 3 -#define NV50_FLOP_SIN 4 -#define NV50_FLOP_COS 5 -#define NV50_FLOP_EX2 6 - -/* rcp, rsqrt, lg2 support neg and abs */ -static void -emit_flop(struct nv50_pc *pc, unsigned sub, - struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] |= 0x90000000; - if (sub || src->mod) { - set_long(pc, e); - e->inst[1] |= (sub << 29); - } - - set_dst(pc, dst, e); - set_src_0_restricted(pc, src, e); - - assert(!src->mod || sub < 4); - - if (src->mod & NV50_MOD_NEG) - e->inst[1] |= 0x04000000; - if (src->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00100000; - - emit(pc, e); -} - -static void -emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] |= 0xb0000000; - - set_dst(pc, dst, e); - set_src_0(pc, src, e); - set_long(pc, e); - e->inst[1] |= (6 << 29) | 0x00004000; - - if (src->mod & NV50_MOD_NEG) - e->inst[1] |= 0x04000000; - if (src->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00100000; - - emit(pc, e); -} - -static void -emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] |= 0xb0000000; - - set_dst(pc, dst, e); - set_src_0(pc, src, e); - set_long(pc, e); - e->inst[1] |= (6 << 29); - - if (src->mod & NV50_MOD_NEG) - e->inst[1] |= 0x04000000; - if (src->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00100000; - - emit(pc, e); -} - -#define CVT_RN (0x00 << 16) -#define CVT_FLOOR (0x02 << 16) -#define CVT_CEIL (0x04 << 16) -#define CVT_TRUNC (0x06 << 16) -#define CVT_SAT (0x08 << 16) -#define CVT_ABS (0x10 << 16) - -#define CVT_X32_X32 0x04004000 -#define CVT_X32_S32 0x04014000 -#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32) -#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32) -#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32) -#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32) -#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32) -#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32) -#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32) -#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32) - -#define CVT_NEG 0x20000000 -#define CVT_RI 0x08000000 - -static void -emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, - int wp, uint32_t cvn) -{ - struct nv50_program_exec *e; - - e = exec(pc); - - if (src->mod & NV50_MOD_NEG) cvn |= CVT_NEG; - if (src->mod & NV50_MOD_ABS) cvn |= CVT_ABS; - - e->inst[0] = 0xa0000000; - e->inst[1] = cvn; - set_long(pc, e); - set_src_0(pc, src, e); - - if (wp >= 0) - set_pred_wr(pc, 1, wp, e); - - if (dst) - set_dst(pc, dst, e); - else { - e->inst[0] |= 0x000001fc; - e->inst[1] |= 0x00000008; - } - - emit(pc, e); -} - -/* nv50 Condition codes: - * 0x1 = LT - * 0x2 = EQ - * 0x3 = LE - * 0x4 = GT - * 0x5 = NE - * 0x6 = GE - * 0x7 = set condition code ? (used before bra.lt/le/gt/ge) - * 0x8 = unordered bit (allows NaN) - * - * mode = 0x04 (u32), 0x0c (s32), 0x80 (f32) - */ -static void -emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp, - struct nv50_reg *src0, struct nv50_reg *src1, uint8_t mode) -{ - static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; - - struct nv50_program_exec *e = exec(pc); - struct nv50_reg *rdst; - - assert(ccode < 16); - if (check_swap_src_0_1(pc, &src0, &src1)) - ccode = cc_swapped[ccode & 7] | (ccode & 8); - - rdst = dst; - if (dst && dst->type != P_TEMP) - dst = alloc_temp(pc, NULL); - - set_long(pc, e); - e->inst[0] |= 0x30000000 | (mode << 24); - e->inst[1] |= 0x60000000 | (ccode << 14); - - if (wp >= 0) - set_pred_wr(pc, 1, wp, e); - if (dst) - set_dst(pc, dst, e); - else { - e->inst[0] |= 0x000001fc; - e->inst[1] |= 0x00000008; - } - - set_src_0(pc, src0, e); - set_src_1(pc, src1, e); - - emit(pc, e); - - if (rdst && mode == 0x80) /* convert to float ? */ - emit_cvt(pc, rdst, dst, -1, CVT_ABS | CVT_F32_S32); - if (rdst && rdst != dst) - free_temp(pc, dst); -} - -static INLINE void -map_tgsi_setop_hw(unsigned op, uint8_t *cc, uint8_t *ty) -{ - switch (op) { - case TGSI_OPCODE_SLT: *cc = 0x1; *ty = 0x80; break; - case TGSI_OPCODE_SGE: *cc = 0x6; *ty = 0x80; break; - case TGSI_OPCODE_SEQ: *cc = 0x2; *ty = 0x80; break; - case TGSI_OPCODE_SGT: *cc = 0x4; *ty = 0x80; break; - case TGSI_OPCODE_SLE: *cc = 0x3; *ty = 0x80; break; - case TGSI_OPCODE_SNE: *cc = 0xd; *ty = 0x80; break; - - case TGSI_OPCODE_ISLT: *cc = 0x1; *ty = 0x0c; break; - case TGSI_OPCODE_ISGE: *cc = 0x6; *ty = 0x0c; break; - case TGSI_OPCODE_USEQ: *cc = 0x2; *ty = 0x04; break; - case TGSI_OPCODE_USGE: *cc = 0x6; *ty = 0x04; break; - case TGSI_OPCODE_USLT: *cc = 0x1; *ty = 0x04; break; - case TGSI_OPCODE_USNE: *cc = 0x5; *ty = 0x04; break; - default: - assert(0); - return; - } -} - -static void -emit_add_b32(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, struct nv50_reg *rsrc1) -{ - struct nv50_program_exec *e = exec(pc); - struct nv50_reg *src1; - - e->inst[0] = 0x20000000; - - alloc_reg(pc, rsrc1); - check_swap_src_0_1(pc, &src0, &rsrc1); - - src1 = rsrc1; - if (src0->mod & rsrc1->mod & NV50_MOD_NEG) { - src1 = temp_temp(pc, e); - emit_cvt(pc, src1, rsrc1, -1, CVT_S32_S32); - } - - if (!pc->allow32 || src1->hw > 63 || - (src1->type != P_TEMP && src1->type != P_IMMD)) - set_long(pc, e); - - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - - if (is_long(e)) { - e->inst[1] |= 1 << 26; - set_src_2(pc, src1, e); - } else { - e->inst[0] |= 0x8000; - if (src1->type == P_IMMD) - set_immd(pc, src1, e); - else - set_src_1(pc, src1, e); - } - - if (src0->mod & NV50_MOD_NEG) - e->inst[0] |= 1 << 28; - else - if (src1->mod & NV50_MOD_NEG) - e->inst[0] |= 1 << 22; - - emit(pc, e); -} - -static void -emit_mad_u16(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1, - struct nv50_reg *src2) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x60000000; - if (!pc->allow32) - set_long(pc, e); - set_dst(pc, dst, e); - - set_half_src(pc, src0, lh_0, e, 9); - set_half_src(pc, src1, lh_1, e, 16); - alloc_reg(pc, src2); - if (is_long(e) || (src2->type != P_TEMP) || (src2->hw != dst->hw)) - set_src_2(pc, src2, e); - - emit(pc, e); -} - -static void -emit_mul_u16(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x40000000; - set_long(pc, e); - set_dst(pc, dst, e); - - set_half_src(pc, src0, lh_0, e, 9); - set_half_src(pc, src1, lh_1, e, 16); - - emit(pc, e); -} - -static void -emit_sad(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, struct nv50_reg *src1, struct nv50_reg *src2) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x50000000; - if (!pc->allow32) - set_long(pc, e); - check_swap_src_0_1(pc, &src0, &src1); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - set_src_1(pc, src1, e); - alloc_reg(pc, src2); - if (is_long(e) || (src2->type != dst->type) || (src2->hw != dst->hw)) - set_src_2(pc, src2, e); - - if (is_long(e)) - e->inst[1] |= 0x0c << 24; - else - e->inst[0] |= 0x81 << 8; - - emit(pc, e); -} - -static INLINE void -emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - emit_cvt(pc, dst, src, -1, CVT_FLOOR | CVT_F32_F32 | CVT_RI); -} - -static void -emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *v, struct nv50_reg *e) -{ - struct nv50_reg *temp = alloc_temp(pc, NULL); - - emit_flop(pc, NV50_FLOP_LG2, temp, v); - emit_mul(pc, temp, temp, e); - emit_preex2(pc, temp, temp); - emit_flop(pc, NV50_FLOP_EX2, dst, temp); - - free_temp(pc, temp); -} - -static INLINE void -emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - emit_cvt(pc, dst, src, -1, CVT_SAT | CVT_F32_F32); -} - -static void -emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, - struct nv50_reg **src) -{ - struct nv50_reg *one = alloc_immd(pc, 1.0); - struct nv50_reg *zero = alloc_immd(pc, 0.0); - struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); - struct nv50_reg *pos128 = alloc_immd(pc, 127.999999); - struct nv50_reg *tmp[4] = { 0 }; - boolean allow32 = pc->allow32; - - pc->allow32 = FALSE; - - if (mask & (3 << 1)) { - tmp[0] = alloc_temp(pc, NULL); - emit_minmax(pc, NV50_MAX_F32, tmp[0], src[0], zero); - } - - if (mask & (1 << 2)) { - set_pred_wr(pc, 1, 0, pc->p->exec_tail); - - tmp[1] = temp_temp(pc, NULL); - emit_minmax(pc, NV50_MAX_F32, tmp[1], src[1], zero); - - tmp[3] = temp_temp(pc, NULL); - emit_minmax(pc, NV50_MAX_F32, tmp[3], src[3], neg128); - emit_minmax(pc, NV50_MIN_F32, tmp[3], tmp[3], pos128); - - emit_pow(pc, dst[2], tmp[1], tmp[3]); - emit_mov(pc, dst[2], zero); - set_pred(pc, 3, 0, pc->p->exec_tail); - } - - if (mask & (1 << 1)) - assimilate_temp(pc, dst[1], tmp[0]); - else - if (mask & (1 << 2)) - free_temp(pc, tmp[0]); - - pc->allow32 = allow32; - - /* do this last, in case src[i,j] == dst[0,3] */ - if (mask & (1 << 0)) - emit_mov(pc, dst[0], one); - - if (mask & (1 << 3)) - emit_mov(pc, dst[3], one); - - FREE(pos128); - FREE(neg128); - FREE(zero); - FREE(one); -} - -static void -emit_kil(struct nv50_pc *pc, struct nv50_reg *src) -{ - struct nv50_program_exec *e; - const int r_pred = 1; - - e = exec(pc); - e->inst[0] = 0x00000002; /* discard */ - set_long(pc, e); /* sets cond code to ALWAYS */ - - if (src) { - set_pred(pc, 0x1 /* cc = LT */, r_pred, e); - /* write to predicate reg */ - emit_cvt(pc, NULL, src, r_pred, CVT_F32_F32); - } - - emit(pc, e); -} - -static struct nv50_program_exec * -emit_control_flow(struct nv50_pc *pc, unsigned op, int pred, unsigned cc) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = (op << 28) | 2; - set_long(pc, e); - if (pred >= 0) - set_pred(pc, cc, pred, e); - - emit(pc, e); - return e; -} - -static INLINE struct nv50_program_exec * -emit_breakaddr(struct nv50_pc *pc) -{ - return emit_control_flow(pc, 0x4, -1, 0); -} - -static INLINE void -emit_break(struct nv50_pc *pc, int pred, unsigned cc) -{ - emit_control_flow(pc, 0x5, pred, cc); -} - -static INLINE struct nv50_program_exec * -emit_joinat(struct nv50_pc *pc) -{ - return emit_control_flow(pc, 0xa, -1, 0); -} - -static INLINE struct nv50_program_exec * -emit_branch(struct nv50_pc *pc, int pred, unsigned cc) -{ - return emit_control_flow(pc, 0x1, pred, cc); -} - -static INLINE struct nv50_program_exec * -emit_call(struct nv50_pc *pc, int pred, unsigned cc) -{ - return emit_control_flow(pc, 0x2, pred, cc); -} - -static INLINE void -emit_ret(struct nv50_pc *pc, int pred, unsigned cc) -{ - emit_control_flow(pc, 0x3, pred, cc); -} - -static void -emit_prim_cmd(struct nv50_pc *pc, unsigned cmd) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xf0000000 | (cmd << 9); - e->inst[1] = 0xc0000000; - set_long(pc, e); - - emit(pc, e); -} - -#define QOP_ADD 0 -#define QOP_SUBR 1 -#define QOP_SUB 2 -#define QOP_MOV_SRC1 3 - -/* For a quad of threads / top left, top right, bottom left, bottom right - * pixels, do a different operation, and take src0 from a specific thread. - */ -static void -emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0, - struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xc0000000; - e->inst[1] = 0x80000000; - set_long(pc, e); - e->inst[0] |= lane_src0 << 16; - set_src_0(pc, src0, e); - set_src_2(pc, src1, e); - - if (wp >= 0) - set_pred_wr(pc, 1, wp, e); - - if (dst) - set_dst(pc, dst, e); - else { - e->inst[0] |= 0x000001fc; - e->inst[1] |= 0x00000008; - } - - e->inst[0] |= (qop & 3) << 20; - e->inst[1] |= (qop >> 2) << 22; - - emit(pc, e); -} - -static void -load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4], - struct nv50_reg **src, unsigned arg, boolean proj) +nv50_vertprog_prepare(struct nv50_translation_info *ti) { - int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod }; - - src[0]->mod |= NV50_MOD_ABS; - src[1]->mod |= NV50_MOD_ABS; - src[2]->mod |= NV50_MOD_ABS; - - emit_minmax(pc, NV50_MAX_F32, t[2], src[0], src[1]); - emit_minmax(pc, NV50_MAX_F32, t[2], src[2], t[2]); + struct nv50_program *p = ti->p; + int i, c; + unsigned num_inputs = 0; - src[0]->mod = mod[0]; - src[1]->mod = mod[1]; - src[2]->mod = mod[2]; + ti->input_file = NV_FILE_MEM_S; + ti->output_file = NV_FILE_OUT; - if (proj && 0 /* looks more correct without this */) - emit_mul(pc, t[2], t[2], src[3]); - else - if (arg == 4) /* there is no textureProj(samplerCubeShadow) */ - emit_mov(pc, t[3], src[3]); + for (i = 0; i <= ti->scan.file_max[TGSI_FILE_INPUT]; ++i) { + p->in[i].id = i; + p->in[i].hw = num_inputs; - emit_flop(pc, NV50_FLOP_RCP, t[2], t[2]); + for (c = 0; c < 4; ++c) { + if (!ti->input_access[i][c]) + continue; + ti->input_map[i][c] = num_inputs++; + p->vp.attrs[(4 * i + c) / 32] |= 1 << ((i * 4 + c) % 32); + } + } - emit_mul(pc, t[0], src[0], t[2]); - emit_mul(pc, t[1], src[1], t[2]); - emit_mul(pc, t[2], src[2], t[2]); -} - -static void -load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4], - struct nv50_reg **src, unsigned dim, unsigned arg) -{ - unsigned c, mode; + for (i = 0; i <= ti->scan.file_max[TGSI_FILE_OUTPUT]; ++i) { + p->out[i].id = i; + p->out[i].hw = p->max_out; - if (src[0]->type == P_TEMP && src[0]->rhw != -1) { - mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE; + for (c = 0; c < 4; ++c) { + if (!ti->output_access[i][c]) + continue; + ti->output_map[i][c] = p->max_out++; + p->out[i].mask |= 1 << c; + } + } - t[3]->rhw = src[3]->rhw; - emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID)); - emit_flop(pc, NV50_FLOP_RCP, t[3], t[3]); + if (p->vp.psiz < 0x40) + p->vp.psiz = p->out[p->vp.psiz].hw; - for (c = 0; c < dim; ++c) { - t[c]->rhw = src[c]->rhw; - emit_interp(pc, t[c], t[3], mode); - } - if (arg != dim) { /* depth reference value */ - t[dim]->rhw = src[2]->rhw; - emit_interp(pc, t[dim], t[3], mode); - } - } else { - /* XXX: for some reason the blob sometimes uses MAD - * (mad f32 $rX $rY $rZ neg $r63) - */ - emit_flop(pc, NV50_FLOP_RCP, t[3], src[3]); - for (c = 0; c < dim; ++c) - emit_mul(pc, t[c], src[c], t[3]); - if (arg != dim) /* depth reference value */ - emit_mul(pc, t[dim], src[2], t[3]); - } + return 0; } -static INLINE void -get_tex_dim(unsigned type, unsigned *dim, unsigned *arg) -{ - switch (type) { - case TGSI_TEXTURE_1D: - *arg = *dim = 1; - break; - case TGSI_TEXTURE_SHADOW1D: - *dim = 1; - *arg = 2; - break; - case TGSI_TEXTURE_UNKNOWN: - case TGSI_TEXTURE_2D: - case TGSI_TEXTURE_RECT: - *arg = *dim = 2; - break; - case TGSI_TEXTURE_SHADOW2D: - case TGSI_TEXTURE_SHADOWRECT: - *dim = 2; - *arg = 3; - break; - case TGSI_TEXTURE_3D: - case TGSI_TEXTURE_CUBE: - *dim = *arg = 3; - break; - default: - assert(0); - break; - } -} - -/* We shouldn't execute TEXLOD if any of the pixels in a quad have - * different LOD values, so branch off groups of equal LOD. - */ -static void -emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod, - struct nv50_reg *src, struct nv50_program_exec *tex) -{ - struct nv50_program_exec *join_at; - unsigned i, target = pc->p->exec_size + 9 * 2; - - if (pc->p->type != PIPE_SHADER_FRAGMENT) { - emit(pc, tex); - return; - } - pc->allow32 = FALSE; - - /* Subtract lod of each pixel from lod of top left pixel, jump - * texlod insn if result is 0, then repeat for 2 other pixels. - */ - join_at = emit_joinat(pc); - emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55); - emit_branch(pc, 0, 2)->param.index = target; - - for (i = 1; i < 4; ++i) { - emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55); - emit_branch(pc, 0, 2)->param.index = target; - } - - emit_mov(pc, tlod, src); /* target */ - emit(pc, tex); /* texlod */ - - join_at->param.index = target + 2 * 2; - JOIN_ON(emit_nop(pc)); /* join _after_ tex */ -} - -static void -emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg, - struct nv50_program_exec *tex) -{ - struct nv50_program_exec *e; - struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL); - int r_pred = 0; - unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 }; - - pc->allow32 = FALSE; - ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4); - - /* Subtract bias value of thread i from bias values of each thread, - * store result in r_pred, and set bit i in r_bits if result was 0. - */ - assert(arg < 4); - for (i = 0; i < 4; ++i, ++imm_1248.hw) { - emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55); - emit_mov(pc, r_bits, &imm_1248); - set_pred(pc, 2, r_pred, pc->p->exec_tail); - } - emit_mov_to_pred(pc, r_pred, r_bits); - - /* The lanes of a quad are now grouped by the bit in r_pred they have - * set. Put the input values for TEX into a new register set for each - * group and execute TEX only for a specific group. - * We cannot use the same register set for each group because we need - * the derivatives, which are implicitly calculated, to be correct. - */ - for (i = 1; i < 4; ++i) { - alloc_temp4(pc, t123[i], 0); - - for (c = 0; c <= arg; ++c) - emit_mov(pc, t123[i][c], t[c]); - - *(e = exec(pc)) = *(tex); - e->inst[0] &= ~0x01fc; - set_dst(pc, t123[i][0], e); - set_pred(pc, cc[i], r_pred, e); - emit(pc, e); - } - /* finally TEX on the original regs (where we kept the input) */ - set_pred(pc, cc[0], r_pred, tex); - emit(pc, tex); - - /* put the 3 * n other results into regs for lane 0 */ - n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc)); - for (i = 1; i < 4; ++i) { - for (c = 0; c < n; ++c) { - emit_mov(pc, t[c], t123[i][c]); - set_pred(pc, cc[i], r_pred, pc->p->exec_tail); - } - free_temp4(pc, t123[i]); - } - - emit_nop(pc); - free_temp(pc, r_bits); -} - -static void -emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, - struct nv50_reg **src, unsigned unit, unsigned type, - boolean proj, int bias_lod) -{ - struct nv50_reg *t[4]; - struct nv50_program_exec *e; - unsigned c, dim, arg; - - /* t[i] must be within a single 128 bit super-reg */ - alloc_temp4(pc, t, 0); - - e = exec(pc); - e->inst[0] = 0xf0000000; - set_long(pc, e); - set_dst(pc, t[0], e); - - /* TIC and TSC binding indices (TSC is ignored as TSC_LINKED = TRUE): */ - e->inst[0] |= (unit << 9) /* | (unit << 17) */; - - /* live flag (don't set if TEX results affect input to another TEX): */ - /* e->inst[0] |= 0x00000004; */ - - get_tex_dim(type, &dim, &arg); - - if (type == TGSI_TEXTURE_CUBE) { - e->inst[0] |= 0x08000000; - load_cube_tex_coords(pc, t, src, arg, proj); - } else - if (proj) - load_proj_tex_coords(pc, t, src, dim, arg); - else { - for (c = 0; c < dim; c++) - emit_mov(pc, t[c], src[c]); - if (arg != dim) /* depth reference value (always src.z here) */ - emit_mov(pc, t[dim], src[2]); - } - - e->inst[0] |= (mask & 0x3) << 25; - e->inst[1] |= (mask & 0xc) << 12; - - if (!bias_lod) { - e->inst[0] |= (arg - 1) << 22; - emit(pc, e); - } else - if (bias_lod < 0) { - assert(pc->p->type == PIPE_SHADER_FRAGMENT); - e->inst[0] |= arg << 22; - e->inst[1] |= 0x20000000; /* texbias */ - emit_mov(pc, t[arg], src[3]); - emit_texbias_sequence(pc, t, arg, e); - } else { - e->inst[0] |= arg << 22; - e->inst[1] |= 0x40000000; /* texlod */ - emit_mov(pc, t[arg], src[3]); - emit_texlod_sequence(pc, t[arg], src[3], e); - } - -#if 1 - c = 0; - if (mask & 1) emit_mov(pc, dst[0], t[c++]); - if (mask & 2) emit_mov(pc, dst[1], t[c++]); - if (mask & 4) emit_mov(pc, dst[2], t[c++]); - if (mask & 8) emit_mov(pc, dst[3], t[c]); - - free_temp4(pc, t); -#else - /* XXX: if p.e. MUL is used directly after TEX, it would still use - * the texture coordinates, not the fetched values: latency ? */ - - for (c = 0; c < 4; c++) { - if (mask & (1 << c)) - assimilate_temp(pc, dst[c], t[c]); - else - free_temp(pc, t[c]); - } -#endif -} - -static void -emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - assert(src->type == P_TEMP); - - e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0240000 : 0xc0140000; - e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x86400000 : 0x89800000; - set_long(pc, e); - set_dst(pc, dst, e); - set_src_0(pc, src, e); - set_src_2(pc, src, e); - - emit(pc, e); -} - -static void -emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - assert(src->type == P_TEMP); - - e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0250000 : 0xc0150000; - e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x85800000 : 0x8a400000; - set_long(pc, e); - set_dst(pc, dst, e); - set_src_0(pc, src, e); - set_src_2(pc, src, e); - - emit(pc, e); -} - -static void -convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) -{ - unsigned q = 0, m = ~0; - - assert(!is_long(e)); - - switch (e->inst[0] >> 28) { - case 0x1: - /* MOV */ - q = 0x0403c000; - m = 0xffff7fff; - break; - case 0x2: - case 0x3: - /* ADD, SUB, SUBR b32 */ - m = ~(0x8000 | (127 << 16)); - q = ((e->inst[0] & (~m)) >> 2) | (1 << 26); - break; - case 0x5: - /* SAD */ - m = ~(0x81 << 8); - q = (0x0c << 24) | ((e->inst[0] & (0x7f << 2)) << 12); - break; - case 0x6: - /* MAD u16 */ - q = (e->inst[0] & (0x7f << 2)) << 12; - break; - case 0x8: - /* INTERP (move centroid, perspective and flat bits) */ - m = ~0x03000100; - q = (e->inst[0] & (3 << 24)) >> (24 - 16); - q |= (e->inst[0] & (1 << 8)) << (18 - 8); - break; - case 0x9: - /* RCP */ - break; - case 0xB: - /* ADD */ - m = ~(127 << 16); - q = ((e->inst[0] & (~m)) >> 2); - break; - case 0xC: - /* MUL */ - m = ~0x00008000; - q = ((e->inst[0] & (~m)) << 12); - break; - case 0xE: - /* MAD (if src2 == dst) */ - q = ((e->inst[0] & 0x1fc) << 12); - break; - default: - assert(0); - break; - } - - set_long(pc, e); - pc->p->exec_size++; - - e->inst[0] &= m; - e->inst[1] |= q; -} - -/* Some operations support an optional negation flag. */ static int -get_supported_mods(const struct tgsi_full_instruction *insn, int i) -{ - switch (insn->Instruction.Opcode) { - case TGSI_OPCODE_ADD: - case TGSI_OPCODE_COS: - case TGSI_OPCODE_DDX: - case TGSI_OPCODE_DDY: - case TGSI_OPCODE_DP3: - case TGSI_OPCODE_DP4: - case TGSI_OPCODE_EX2: - case TGSI_OPCODE_KIL: - case TGSI_OPCODE_LG2: - case TGSI_OPCODE_MAD: - case TGSI_OPCODE_MUL: - case TGSI_OPCODE_POW: - case TGSI_OPCODE_RCP: - case TGSI_OPCODE_RSQ: /* ignored, RSQ = rsqrt(abs(src.x)) */ - case TGSI_OPCODE_SCS: - case TGSI_OPCODE_SIN: - case TGSI_OPCODE_SUB: - return NV50_MOD_NEG; - case TGSI_OPCODE_MAX: - case TGSI_OPCODE_MIN: - case TGSI_OPCODE_INEG: /* tgsi src sign toggle/set would be stupid */ - return NV50_MOD_ABS; - case TGSI_OPCODE_CEIL: - case TGSI_OPCODE_FLR: - case TGSI_OPCODE_TRUNC: - return NV50_MOD_NEG | NV50_MOD_ABS; - case TGSI_OPCODE_F2I: - case TGSI_OPCODE_F2U: - case TGSI_OPCODE_I2F: - case TGSI_OPCODE_U2F: - return NV50_MOD_NEG | NV50_MOD_ABS | NV50_MOD_I32; - case TGSI_OPCODE_UADD: - return NV50_MOD_NEG | NV50_MOD_I32; - case TGSI_OPCODE_SAD: - case TGSI_OPCODE_SHL: - case TGSI_OPCODE_IMAX: - case TGSI_OPCODE_IMIN: - case TGSI_OPCODE_ISHR: - case TGSI_OPCODE_NOT: - case TGSI_OPCODE_UMAD: - case TGSI_OPCODE_UMAX: - case TGSI_OPCODE_UMIN: - case TGSI_OPCODE_UMUL: - case TGSI_OPCODE_USHR: - return NV50_MOD_I32; - default: - return 0; - } -} - -/* Return a read mask for source registers deduced from opcode & write mask. */ -static unsigned -nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) -{ - unsigned x, mask = insn->Dst[0].Register.WriteMask; - - switch (insn->Instruction.Opcode) { - case TGSI_OPCODE_COS: - case TGSI_OPCODE_SIN: - return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); - case TGSI_OPCODE_DP3: - return 0x7; - case TGSI_OPCODE_DP4: - case TGSI_OPCODE_DPH: - case TGSI_OPCODE_KIL: /* WriteMask ignored */ - return 0xf; - case TGSI_OPCODE_DST: - return mask & (c ? 0xa : 0x6); - case TGSI_OPCODE_EX2: - case TGSI_OPCODE_EXP: - case TGSI_OPCODE_LG2: - case TGSI_OPCODE_LOG: - case TGSI_OPCODE_POW: - case TGSI_OPCODE_RCP: - case TGSI_OPCODE_RSQ: - case TGSI_OPCODE_SCS: - return 0x1; - case TGSI_OPCODE_IF: - return 0x1; - case TGSI_OPCODE_LIT: - return 0xb; - case TGSI_OPCODE_TEX: - case TGSI_OPCODE_TXB: - case TGSI_OPCODE_TXL: - case TGSI_OPCODE_TXP: - { - const struct tgsi_instruction_texture *tex; - - assert(insn->Instruction.Texture); - tex = &insn->Texture; - - mask = 0x7; - if (insn->Instruction.Opcode != TGSI_OPCODE_TEX && - insn->Instruction.Opcode != TGSI_OPCODE_TXD) - mask |= 0x8; /* bias, lod or proj */ - - switch (tex->Texture) { - case TGSI_TEXTURE_1D: - mask &= 0x9; - break; - case TGSI_TEXTURE_SHADOW1D: - mask &= 0x5; - break; - case TGSI_TEXTURE_2D: - mask &= 0xb; - break; - default: - break; - } - } - return mask; - case TGSI_OPCODE_XPD: - x = 0; - if (mask & 1) x |= 0x6; - if (mask & 2) x |= 0x5; - if (mask & 4) x |= 0x3; - return x; - default: - break; - } - - return mask; -} - -static struct nv50_reg * -tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) -{ - switch (dst->Register.File) { - case TGSI_FILE_TEMPORARY: - return &pc->temp[dst->Register.Index * 4 + c]; - case TGSI_FILE_OUTPUT: - return &pc->result[dst->Register.Index * 4 + c]; - case TGSI_FILE_ADDRESS: - { - struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c]; - if (!r) { - r = get_address_reg(pc, NULL); - r->index = dst->Register.Index * 4 + c; - pc->addr[r->index] = r; - } - assert(r); - return r; - } - case TGSI_FILE_NULL: - return NULL; - case TGSI_FILE_SYSTEM_VALUE: - assert(pc->sysval[dst->Register.Index].type == P_RESULT); - assert(c == 0); - return &pc->sysval[dst->Register.Index]; - default: - break; - } - - return NULL; -} - -static struct nv50_reg * -tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, - int mod) -{ - struct nv50_reg *r = NULL; - struct nv50_reg *temp = NULL; - unsigned sgn, c, swz, cvn; - - if (src->Register.File != TGSI_FILE_CONSTANT) - assert(!src->Register.Indirect); - - sgn = tgsi_util_get_full_src_register_sign_mode(src, chan); - - c = tgsi_util_get_full_src_register_swizzle(src, chan); - switch (c) { - case TGSI_SWIZZLE_X: - case TGSI_SWIZZLE_Y: - case TGSI_SWIZZLE_Z: - case TGSI_SWIZZLE_W: - switch (src->Register.File) { - case TGSI_FILE_INPUT: - r = &pc->attr[src->Register.Index * 4 + c]; - - if (!src->Dimension.Dimension) - break; - r = reg_instance(pc, r); - r->vtx = src->Dimension.Index; - - if (!src->Dimension.Indirect) - break; - swz = tgsi_util_get_src_register_swizzle( - &src->DimIndirect, 0); - r->acc = -1; - r->indirect[1] = src->DimIndirect.Index * 4 + swz; - break; - case TGSI_FILE_TEMPORARY: - r = &pc->temp[src->Register.Index * 4 + c]; - break; - case TGSI_FILE_CONSTANT: - if (!src->Register.Indirect) { - r = &pc->param[src->Register.Index * 4 + c]; - break; - } - /* Indicate indirection by setting r->acc < 0 and - * use the index field to select the address reg. - */ - r = reg_instance(pc, NULL); - ctor_reg(r, P_CONST, -1, src->Register.Index * 4 + c); - - swz = tgsi_util_get_src_register_swizzle( - &src->Indirect, 0); - r->acc = -1; - r->indirect[0] = src->Indirect.Index * 4 + swz; - break; - case TGSI_FILE_IMMEDIATE: - r = &pc->immd[src->Register.Index * 4 + c]; - break; - case TGSI_FILE_SAMPLER: - return NULL; - case TGSI_FILE_ADDRESS: - r = pc->addr[src->Register.Index * 4 + c]; - assert(r); - break; - case TGSI_FILE_SYSTEM_VALUE: - assert(c == 0); - r = &pc->sysval[src->Register.Index]; - break; - default: - assert(0); - break; - } - break; - default: - assert(0); - break; - } - - cvn = (mod & NV50_MOD_I32) ? CVT_S32_S32 : CVT_F32_F32; - - switch (sgn) { - case TGSI_UTIL_SIGN_CLEAR: - r->mod = NV50_MOD_ABS; - break; - case TGSI_UTIL_SIGN_SET: - r->mod = NV50_MOD_NEG_ABS; - break; - case TGSI_UTIL_SIGN_TOGGLE: - r->mod = NV50_MOD_NEG; - break; - default: - assert(!r->mod && sgn == TGSI_UTIL_SIGN_KEEP); - break; - } - - if ((r->mod & mod) != r->mod) { - temp = temp_temp(pc, NULL); - emit_cvt(pc, temp, r, -1, cvn); - r->mod = 0; - r = temp; - } else - r->mod |= mod & NV50_MOD_I32; - - assert(r); - if (r->acc >= 0 && r->vtx < 0 && r != temp) - return reg_instance(pc, r); /* will clear r->mod */ - return r; -} - -/* return TRUE for ops that produce only a single result */ -static boolean -is_scalar_op(unsigned op) -{ - switch (op) { - case TGSI_OPCODE_COS: - case TGSI_OPCODE_DP2: - case TGSI_OPCODE_DP3: - case TGSI_OPCODE_DP4: - case TGSI_OPCODE_DPH: - case TGSI_OPCODE_EX2: - case TGSI_OPCODE_LG2: - case TGSI_OPCODE_POW: - case TGSI_OPCODE_RCP: - case TGSI_OPCODE_RSQ: - case TGSI_OPCODE_SIN: - /* - case TGSI_OPCODE_KIL: - case TGSI_OPCODE_LIT: - case TGSI_OPCODE_SCS: - */ - return TRUE; - default: - return FALSE; - } -} - -/* Returns a bitmask indicating which dst components depend - * on source s, component c (reverse of nv50_tgsi_src_mask). - */ -static unsigned -nv50_tgsi_dst_revdep(unsigned op, int s, int c) -{ - if (is_scalar_op(op)) - return 0x1; - - switch (op) { - case TGSI_OPCODE_DST: - return (1 << c) & (s ? 0xa : 0x6); - case TGSI_OPCODE_XPD: - switch (c) { - case 0: return 0x6; - case 1: return 0x5; - case 2: return 0x3; - case 3: return 0x0; - default: - assert(0); - return 0x0; - } - case TGSI_OPCODE_EXP: - case TGSI_OPCODE_LOG: - case TGSI_OPCODE_LIT: - case TGSI_OPCODE_SCS: - case TGSI_OPCODE_TEX: - case TGSI_OPCODE_TXB: - case TGSI_OPCODE_TXL: - case TGSI_OPCODE_TXP: - /* these take care of dangerous swizzles themselves */ - return 0x0; - case TGSI_OPCODE_IF: - case TGSI_OPCODE_KIL: - /* don't call this function for these ops */ - assert(0); - return 0; - default: - /* linear vector instruction */ - return (1 << c); - } -} - -static INLINE boolean -has_pred(struct nv50_program_exec *e, unsigned cc) -{ - if (!is_long(e) || is_immd(e)) - return FALSE; - return ((e->inst[1] & 0x780) == (cc << 7)); -} - -/* on ENDIF see if we can do "@p0.neu single_op" instead of: - * join_at ENDIF - * @p0.eq bra ENDIF - * single_op - * ENDIF: nop.join - */ -static boolean -nv50_kill_branch(struct nv50_pc *pc) -{ - int lvl = pc->if_lvl; - - if (pc->if_insn[lvl]->next != pc->p->exec_tail) - return FALSE; - if (is_immd(pc->p->exec_tail)) - return FALSE; - - /* if ccode == 'true', the BRA is from an ELSE and the predicate - * reg may no longer be valid, since we currently always use $p0 - */ - if (has_pred(pc->if_insn[lvl], 0xf)) - return FALSE; - assert(pc->if_insn[lvl] && pc->if_join[lvl]); - - /* We'll use the exec allocated for JOIN_AT (we can't easily - * access nv50_program_exec's prev). - */ - pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */ - - *pc->if_join[lvl] = *pc->p->exec_tail; - - FREE(pc->if_insn[lvl]); - FREE(pc->p->exec_tail); - - pc->p->exec_tail = pc->if_join[lvl]; - pc->p->exec_tail->next = NULL; - set_pred(pc, 0xd, 0, pc->p->exec_tail); - - return TRUE; -} - -static void -nv50_fp_move_results(struct nv50_pc *pc) -{ - struct nv50_reg reg; - unsigned i; - - ctor_reg(®, P_TEMP, -1, -1); - - for (i = 0; i < pc->result_nr * 4; ++i) { - if (pc->result[i].rhw < 0 || pc->result[i].hw < 0) - continue; - if (pc->result[i].rhw != pc->result[i].hw) { - reg.hw = pc->result[i].rhw; - emit_mov(pc, ®, &pc->result[i]); - } - } -} - -static boolean -nv50_program_tx_insn(struct nv50_pc *pc, - const struct tgsi_full_instruction *inst) -{ - struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp; - unsigned mask, sat, unit = 0; - int i, c; - - mask = inst->Dst[0].Register.WriteMask; - sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; - - memset(src, 0, sizeof(src)); - - for (c = 0; c < 4; c++) { - if ((mask & (1 << c)) && !pc->r_dst[c]) - dst[c] = tgsi_dst(pc, c, &inst->Dst[0]); - else - dst[c] = pc->r_dst[c]; - rdst[c] = dst[c]; - } - - for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { - const struct tgsi_full_src_register *fs = &inst->Src[i]; - unsigned src_mask; - int mod_supp; - - src_mask = nv50_tgsi_src_mask(inst, i); - mod_supp = get_supported_mods(inst, i); - - if (fs->Register.File == TGSI_FILE_SAMPLER) - unit = fs->Register.Index; - - for (c = 0; c < 4; c++) - if (src_mask & (1 << c)) - src[i][c] = tgsi_src(pc, c, fs, mod_supp); - } - - brdc = temp = pc->r_brdc; - if (brdc && brdc->type != P_TEMP) { - temp = temp_temp(pc, NULL); - if (sat) - brdc = temp; - } else - if (sat) { - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c)) || dst[c]->type == P_TEMP) - continue; - /* rdst[c] = dst[c]; */ /* done above */ - dst[c] = temp_temp(pc, NULL); - } - } - - assert(brdc || !is_scalar_op(inst->Instruction.Opcode)); - - switch (inst->Instruction.Opcode) { - case TGSI_OPCODE_ABS: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_ABS | CVT_F32_F32); - } - break; - case TGSI_OPCODE_ADD: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_add(pc, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_AND: - case TGSI_OPCODE_XOR: - case TGSI_OPCODE_OR: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_bitop2(pc, dst[c], src[0][c], src[1][c], - inst->Instruction.Opcode); - } - break; - case TGSI_OPCODE_ARL: - temp = temp_temp(pc, NULL); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, temp, src[0][c], -1, - CVT_FLOOR | CVT_S32_F32); - emit_arl(pc, dst[c], temp, 4); - } - break; - case TGSI_OPCODE_BGNLOOP: - pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc); - pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size; - terminate_mbb(pc); - break; - case TGSI_OPCODE_BGNSUB: - assert(!pc->in_subroutine); - pc->in_subroutine = TRUE; - /* probably not necessary, but align to 8 byte boundary */ - if (!is_long(pc->p->exec_tail)) - convert_to_long(pc, pc->p->exec_tail); - break; - case TGSI_OPCODE_BRK: - assert(pc->loop_lvl > 0); - emit_break(pc, -1, 0); - break; - case TGSI_OPCODE_CAL: - assert(inst->Label.Label < pc->insn_nr); - emit_call(pc, -1, 0)->param.index = inst->Label.Label; - /* replaced by actual offset in nv50_program_fixup_insns */ - break; - case TGSI_OPCODE_CEIL: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_CEIL | CVT_F32_F32 | CVT_RI); - } - break; - case TGSI_OPCODE_CMP: - pc->allow32 = FALSE; - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, NULL, src[0][c], 1, CVT_F32_F32); - emit_mov(pc, dst[c], src[1][c]); - set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */ - emit_mov(pc, dst[c], src[2][c]); - set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */ - } - break; - case TGSI_OPCODE_CONT: - assert(pc->loop_lvl > 0); - emit_branch(pc, -1, 0)->param.index = - pc->loop_pos[pc->loop_lvl - 1]; - break; - case TGSI_OPCODE_COS: - if (mask & 8) { - emit_precossin(pc, temp, src[0][3]); - emit_flop(pc, NV50_FLOP_COS, dst[3], temp); - if (!(mask &= 7)) - break; - if (temp == dst[3]) - temp = brdc = temp_temp(pc, NULL); - } - emit_precossin(pc, temp, src[0][0]); - emit_flop(pc, NV50_FLOP_COS, brdc, temp); - break; - case TGSI_OPCODE_DDX: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_ddx(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_DDY: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_ddy(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_DP3: - emit_mul(pc, temp, src[0][0], src[1][0]); - emit_mad(pc, temp, src[0][1], src[1][1], temp); - emit_mad(pc, brdc, src[0][2], src[1][2], temp); - break; - case TGSI_OPCODE_DP4: - emit_mul(pc, temp, src[0][0], src[1][0]); - emit_mad(pc, temp, src[0][1], src[1][1], temp); - emit_mad(pc, temp, src[0][2], src[1][2], temp); - emit_mad(pc, brdc, src[0][3], src[1][3], temp); - break; - case TGSI_OPCODE_DPH: - emit_mul(pc, temp, src[0][0], src[1][0]); - emit_mad(pc, temp, src[0][1], src[1][1], temp); - emit_mad(pc, temp, src[0][2], src[1][2], temp); - emit_add(pc, brdc, src[1][3], temp); - break; - case TGSI_OPCODE_DST: - if (mask & (1 << 1)) - emit_mul(pc, dst[1], src[0][1], src[1][1]); - if (mask & (1 << 2)) - emit_mov(pc, dst[2], src[0][2]); - if (mask & (1 << 3)) - emit_mov(pc, dst[3], src[1][3]); - if (mask & (1 << 0)) - emit_mov_immdval(pc, dst[0], 1.0f); - break; - case TGSI_OPCODE_ELSE: - emit_branch(pc, -1, 0); - pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; - pc->if_insn[pc->if_lvl++] = pc->p->exec_tail; - terminate_mbb(pc); - break; - case TGSI_OPCODE_EMIT: - emit_prim_cmd(pc, 1); - break; - case TGSI_OPCODE_ENDIF: - pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; - - /* try to replace branch over 1 insn with a predicated insn */ - if (nv50_kill_branch(pc) == TRUE) - break; - - if (pc->if_join[pc->if_lvl]) { - pc->if_join[pc->if_lvl]->param.index = pc->p->exec_size; - pc->if_join[pc->if_lvl] = NULL; - } - terminate_mbb(pc); - /* emit a NOP as join point, we could set it on the next - * one, but would have to make sure it is long and !immd - */ - JOIN_ON(emit_nop(pc)); - break; - case TGSI_OPCODE_ENDLOOP: - emit_branch(pc, -1, 0)->param.index = - pc->loop_pos[--pc->loop_lvl]; - pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size; - terminate_mbb(pc); - break; - case TGSI_OPCODE_ENDPRIM: - emit_prim_cmd(pc, 2); - break; - case TGSI_OPCODE_ENDSUB: - assert(pc->in_subroutine); - terminate_mbb(pc); - pc->in_subroutine = FALSE; - break; - case TGSI_OPCODE_EX2: - emit_preex2(pc, temp, src[0][0]); - emit_flop(pc, NV50_FLOP_EX2, brdc, temp); - break; - case TGSI_OPCODE_EXP: - { - struct nv50_reg *t[2]; - - assert(!temp); - t[0] = temp_temp(pc, NULL); - t[1] = temp_temp(pc, NULL); - - if (mask & 0x6) - emit_mov(pc, t[0], src[0][0]); - if (mask & 0x3) - emit_flr(pc, t[1], src[0][0]); - - if (mask & (1 << 1)) - emit_sub(pc, dst[1], t[0], t[1]); - if (mask & (1 << 0)) { - emit_preex2(pc, t[1], t[1]); - emit_flop(pc, NV50_FLOP_EX2, dst[0], t[1]); - } - if (mask & (1 << 2)) { - emit_preex2(pc, t[0], t[0]); - emit_flop(pc, NV50_FLOP_EX2, dst[2], t[0]); - } - if (mask & (1 << 3)) - emit_mov_immdval(pc, dst[3], 1.0f); - } - break; - case TGSI_OPCODE_F2I: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_TRUNC | CVT_S32_F32); - } - break; - case TGSI_OPCODE_F2U: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_TRUNC | CVT_U32_F32); - } - break; - case TGSI_OPCODE_FLR: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_flr(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_FRC: - temp = temp_temp(pc, NULL); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_flr(pc, temp, src[0][c]); - emit_sub(pc, dst[c], src[0][c], temp); - } - break; - case TGSI_OPCODE_I2F: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_S32); - } - break; - case TGSI_OPCODE_IF: - assert(pc->if_lvl < NV50_MAX_COND_NESTING); - emit_cvt(pc, NULL, src[0][0], 0, CVT_ABS | CVT_F32_F32); - pc->if_join[pc->if_lvl] = emit_joinat(pc); - pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);; - terminate_mbb(pc); - break; - case TGSI_OPCODE_IMAX: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x08c, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_IMIN: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x0ac, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_INEG: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_S32_S32 | CVT_NEG); - } - break; - case TGSI_OPCODE_KIL: - assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]); - emit_kil(pc, src[0][0]); - emit_kil(pc, src[0][1]); - emit_kil(pc, src[0][2]); - emit_kil(pc, src[0][3]); - break; - case TGSI_OPCODE_KILP: - emit_kil(pc, NULL); - break; - case TGSI_OPCODE_LIT: - emit_lit(pc, &dst[0], mask, &src[0][0]); - break; - case TGSI_OPCODE_LG2: - emit_flop(pc, NV50_FLOP_LG2, brdc, src[0][0]); - break; - case TGSI_OPCODE_LOG: - { - struct nv50_reg *t[2]; - - t[0] = temp_temp(pc, NULL); - if (mask & (1 << 1)) - t[1] = temp_temp(pc, NULL); - else - t[1] = t[0]; - - emit_cvt(pc, t[0], src[0][0], -1, CVT_ABS | CVT_F32_F32); - emit_flop(pc, NV50_FLOP_LG2, t[1], t[0]); - if (mask & (1 << 2)) - emit_mov(pc, dst[2], t[1]); - emit_flr(pc, t[1], t[1]); - if (mask & (1 << 0)) - emit_mov(pc, dst[0], t[1]); - if (mask & (1 << 1)) { - t[1]->mod = NV50_MOD_NEG; - emit_preex2(pc, t[1], t[1]); - t[1]->mod = 0; - emit_flop(pc, NV50_FLOP_EX2, t[1], t[1]); - emit_mul(pc, dst[1], t[0], t[1]); - } - if (mask & (1 << 3)) - emit_mov_immdval(pc, dst[3], 1.0f); - } - break; - case TGSI_OPCODE_LRP: - temp = temp_temp(pc, NULL); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_sub(pc, temp, src[1][c], src[2][c]); - emit_mad(pc, dst[c], temp, src[0][c], src[2][c]); - } - break; - case TGSI_OPCODE_MAD: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); - } - break; - case TGSI_OPCODE_MAX: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x880, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_MIN: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x8a0, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_MOV: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_MUL: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mul(pc, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_NOT: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_not(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_POW: - emit_pow(pc, brdc, src[0][0], src[1][0]); - break; - case TGSI_OPCODE_RCP: - if (!sat && popcnt4(mask) == 1) - brdc = dst[ffs(mask) - 1]; - emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]); - break; - case TGSI_OPCODE_RET: - if (pc->p->type == PIPE_SHADER_FRAGMENT && !pc->in_subroutine) - nv50_fp_move_results(pc); - emit_ret(pc, -1, 0); - break; - case TGSI_OPCODE_RSQ: - if (!sat && popcnt4(mask) == 1) - brdc = dst[ffs(mask) - 1]; - src[0][0]->mod |= NV50_MOD_ABS; - emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]); - break; - case TGSI_OPCODE_SAD: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_sad(pc, dst[c], src[0][c], src[1][c], src[2][c]); - } - break; - case TGSI_OPCODE_SCS: - temp = temp_temp(pc, NULL); - if (mask & 3) - emit_precossin(pc, temp, src[0][0]); - if (mask & (1 << 0)) - emit_flop(pc, NV50_FLOP_COS, dst[0], temp); - if (mask & (1 << 1)) - emit_flop(pc, NV50_FLOP_SIN, dst[1], temp); - if (mask & (1 << 2)) - emit_mov_immdval(pc, dst[2], 0.0); - if (mask & (1 << 3)) - emit_mov_immdval(pc, dst[3], 1.0); - break; - case TGSI_OPCODE_SHL: - case TGSI_OPCODE_ISHR: - case TGSI_OPCODE_USHR: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_shift(pc, dst[c], src[0][c], src[1][c], - inst->Instruction.Opcode); - } - break; - case TGSI_OPCODE_SIN: - if (mask & 8) { - emit_precossin(pc, temp, src[0][3]); - emit_flop(pc, NV50_FLOP_SIN, dst[3], temp); - if (!(mask &= 7)) - break; - if (temp == dst[3]) - temp = brdc = temp_temp(pc, NULL); - } - emit_precossin(pc, temp, src[0][0]); - emit_flop(pc, NV50_FLOP_SIN, brdc, temp); - break; - case TGSI_OPCODE_SLT: - case TGSI_OPCODE_SGE: - case TGSI_OPCODE_SEQ: - case TGSI_OPCODE_SGT: - case TGSI_OPCODE_SLE: - case TGSI_OPCODE_SNE: - case TGSI_OPCODE_ISLT: - case TGSI_OPCODE_ISGE: - case TGSI_OPCODE_USEQ: - case TGSI_OPCODE_USGE: - case TGSI_OPCODE_USLT: - case TGSI_OPCODE_USNE: - { - uint8_t cc, ty; - - map_tgsi_setop_hw(inst->Instruction.Opcode, &cc, &ty); - - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_set(pc, cc, dst[c], -1, src[0][c], src[1][c], ty); - } - } - break; - case TGSI_OPCODE_SUB: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_sub(pc, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_TEX: - emit_tex(pc, dst, mask, src[0], unit, - inst->Texture.Texture, FALSE, 0); - break; - case TGSI_OPCODE_TXB: - emit_tex(pc, dst, mask, src[0], unit, - inst->Texture.Texture, FALSE, -1); - break; - case TGSI_OPCODE_TXL: - emit_tex(pc, dst, mask, src[0], unit, - inst->Texture.Texture, FALSE, 1); - break; - case TGSI_OPCODE_TXP: - emit_tex(pc, dst, mask, src[0], unit, - inst->Texture.Texture, TRUE, 0); - break; - case TGSI_OPCODE_TRUNC: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_TRUNC | CVT_F32_F32 | CVT_RI); - } - break; - case TGSI_OPCODE_U2F: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_U32); - } - break; - case TGSI_OPCODE_UADD: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_add_b32(pc, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_UMAX: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x084, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_UMIN: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x0a4, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_UMAD: - { - assert(!temp); - temp = temp_temp(pc, NULL); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1); - emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0, - temp); - emit_shl_imm(pc, temp, temp, 16); - emit_mad_u16(pc, temp, src[0][c], 0, src[1][c], 0, - temp); - emit_add_b32(pc, dst[c], temp, src[2][c]); - } - } - break; - case TGSI_OPCODE_UMUL: - { - assert(!temp); - temp = temp_temp(pc, NULL); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1); - emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0, - temp); - emit_shl_imm(pc, temp, temp, 16); - emit_mad_u16(pc, dst[c], src[0][c], 0, src[1][c], 0, - temp); - } - } - break; - case TGSI_OPCODE_XPD: - temp = temp_temp(pc, NULL); - if (mask & (1 << 0)) { - emit_mul(pc, temp, src[0][2], src[1][1]); - emit_msb(pc, dst[0], src[0][1], src[1][2], temp); - } - if (mask & (1 << 1)) { - emit_mul(pc, temp, src[0][0], src[1][2]); - emit_msb(pc, dst[1], src[0][2], src[1][0], temp); - } - if (mask & (1 << 2)) { - emit_mul(pc, temp, src[0][1], src[1][0]); - emit_msb(pc, dst[2], src[0][0], src[1][1], temp); - } - if (mask & (1 << 3)) - emit_mov_immdval(pc, dst[3], 1.0); - break; - case TGSI_OPCODE_END: - if (pc->p->type == PIPE_SHADER_FRAGMENT) - nv50_fp_move_results(pc); - - if (!pc->p->exec_tail || - is_immd(pc->p->exec_tail) || - is_join(pc->p->exec_tail) || - is_control_flow(pc->p->exec_tail)) - emit_nop(pc); - - /* last insn must be long so it can have the exit bit set */ - if (!is_long(pc->p->exec_tail)) - convert_to_long(pc, pc->p->exec_tail); - - pc->p->exec_tail->inst[1] |= 1; /* set exit bit */ - - terminate_mbb(pc); - break; - default: - NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); - return FALSE; - } - - if (brdc) { - if (sat) - emit_sat(pc, brdc, brdc); - for (c = 0; c < 4; c++) - if ((mask & (1 << c)) && dst[c] != brdc) - emit_mov(pc, dst[c], brdc); - } else - if (sat) { - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - /* In this case we saturate later, and dst[c] won't - * be another temp_temp (and thus lost), since rdst - * already is TEMP (see above). */ - if (rdst[c]->type == P_TEMP && rdst[c]->index < 0) - continue; - emit_sat(pc, rdst[c], dst[c]); - } - } - - kill_temp_temp(pc, NULL); - pc->reg_instance_nr = 0; - - return TRUE; -} - -static void -prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn) -{ - struct nv50_reg *r, *reg = NULL; - const struct tgsi_full_src_register *src; - const struct tgsi_dst_register *dst; - unsigned i, c, k, mask; - - dst = &insn->Dst[0].Register; - mask = dst->WriteMask; - - if (dst->File == TGSI_FILE_TEMPORARY) - reg = pc->temp; - else - if (dst->File == TGSI_FILE_OUTPUT) { - reg = pc->result; - - if (insn->Instruction.Opcode == TGSI_OPCODE_MOV && - dst->Index == pc->edgeflag_out && - insn->Src[0].Register.File == TGSI_FILE_INPUT) - pc->p->cfg.edgeflag_in = insn->Src[0].Register.Index; - } - - if (reg) { - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - reg[dst->Index * 4 + c].acc = pc->insn_nr; - } - } - - for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { - src = &insn->Src[i]; - - if (src->Register.File == TGSI_FILE_TEMPORARY) - reg = pc->temp; - else - if (src->Register.File == TGSI_FILE_INPUT) - reg = pc->attr; - else - continue; - - mask = nv50_tgsi_src_mask(insn, i); - - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - k = tgsi_util_get_full_src_register_swizzle(src, c); - - r = ®[src->Register.Index * 4 + k]; - - /* If used before written, pre-allocate the reg, - * lest we overwrite results from a subroutine. - */ - if (!r->acc && r->type == P_TEMP) - alloc_reg(pc, r); - - r->acc = pc->insn_nr; - } - } -} - -/* Returns a bitmask indicating which dst components need to be - * written to temporaries first to avoid 'corrupting' sources. - * - * m[i] (out) indicate component to write in the i-th position - * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source - */ -static unsigned -nv50_revdep_reorder(unsigned m[4], unsigned rdep[4]) -{ - unsigned i, c, x, unsafe = 0; - - for (c = 0; c < 4; c++) - m[c] = c; - - /* Swap as long as a dst component written earlier is depended on - * by one written later, but the next one isn't depended on by it. - */ - for (c = 0; c < 3; c++) { - if (rdep[m[c + 1]] & (1 << m[c])) - continue; /* if next one is depended on by us */ - for (i = c + 1; i < 4; i++) - /* if we are depended on by a later one */ - if (rdep[m[c]] & (1 << m[i])) - break; - if (i == 4) - continue; - /* now, swap */ - x = m[c]; - m[c] = m[c + 1]; - m[c + 1] = x; - - /* restart */ - c = 0; - } - - /* mark dependencies that could not be resolved by reordering */ - for (i = 0; i < 3; ++i) - for (c = i + 1; c < 4; ++c) - if (rdep[m[i]] & (1 << m[c])) - unsafe |= (1 << i); - - /* NOTE: $unsafe is with respect to order, not component */ - return unsafe; -} - -/* Select a suitable dst register for broadcasting scalar results, - * or return NULL if we have to allocate an extra TEMP. - * - * If e.g. only 1 component is written, we may also emit the final - * result to a write-only register. - */ -static struct nv50_reg * -tgsi_broadcast_dst(struct nv50_pc *pc, - const struct tgsi_full_dst_register *fd, unsigned mask) -{ - if (fd->Register.File == TGSI_FILE_TEMPORARY) { - int c = ffs(~mask & fd->Register.WriteMask); - if (c) - return tgsi_dst(pc, c - 1, fd); - } else { - int c = ffs(fd->Register.WriteMask) - 1; - if ((1 << c) == fd->Register.WriteMask) - return tgsi_dst(pc, c, fd); - } - - return NULL; -} - -/* Scan source swizzles and return a bitmask indicating dst regs that - * also occur among the src regs, and fill rdep for nv50_revdep_reoder. - */ -static unsigned -nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn, - unsigned rdep[4]) -{ - const struct tgsi_full_dst_register *fd = &insn->Dst[0]; - const struct tgsi_full_src_register *fs; - unsigned i, deqs = 0; - - for (i = 0; i < 4; ++i) - rdep[i] = 0; - - for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { - unsigned chn, mask = nv50_tgsi_src_mask(insn, i); - int ms = get_supported_mods(insn, i); - - fs = &insn->Src[i]; - if (fs->Register.File != fd->Register.File || - fs->Register.Index != fd->Register.Index) - continue; - - for (chn = 0; chn < 4; ++chn) { - unsigned s, c; - - if (!(mask & (1 << chn))) /* src is not read */ - continue; - c = tgsi_util_get_full_src_register_swizzle(fs, chn); - s = tgsi_util_get_full_src_register_sign_mode(fs, chn); - - if (!(fd->Register.WriteMask & (1 << c))) - continue; - - if (s == TGSI_UTIL_SIGN_TOGGLE && !(ms & NV50_MOD_NEG)) - continue; - if (s == TGSI_UTIL_SIGN_CLEAR && !(ms & NV50_MOD_ABS)) - continue; - if ((s == TGSI_UTIL_SIGN_SET) && ((ms & 3) != 3)) - continue; - - rdep[c] |= nv50_tgsi_dst_revdep( - insn->Instruction.Opcode, i, chn); - deqs |= (1 << c); - } - } - - return deqs; -} - -static boolean -nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) -{ - struct tgsi_full_instruction insn = tok->FullInstruction; - const struct tgsi_full_dst_register *fd; - unsigned i, deqs, rdep[4], m[4]; - - fd = &tok->FullInstruction.Dst[0]; - deqs = nv50_tgsi_scan_swizzle(&insn, rdep); - - if (is_scalar_op(insn.Instruction.Opcode)) { - pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs); - if (!pc->r_brdc) - pc->r_brdc = temp_temp(pc, NULL); - return nv50_program_tx_insn(pc, &insn); - } - pc->r_brdc = NULL; - - if (!deqs || (!rdep[0] && !rdep[1] && !rdep[2] && !rdep[3])) - return nv50_program_tx_insn(pc, &insn); - - deqs = nv50_revdep_reorder(m, rdep); - - for (i = 0; i < 4; ++i) { - assert(pc->r_dst[m[i]] == NULL); - - insn.Dst[0].Register.WriteMask = - fd->Register.WriteMask & (1 << m[i]); - - if (!insn.Dst[0].Register.WriteMask) - continue; - - if (deqs & (1 << i)) - pc->r_dst[m[i]] = alloc_temp(pc, NULL); - - if (!nv50_program_tx_insn(pc, &insn)) - return FALSE; - } - - for (i = 0; i < 4; i++) { - struct nv50_reg *reg = pc->r_dst[i]; - if (!reg) - continue; - pc->r_dst[i] = NULL; - - if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE) - emit_sat(pc, tgsi_dst(pc, i, fd), reg); - else - emit_mov(pc, tgsi_dst(pc, i, fd), reg); - free_temp(pc, reg); - } - - return TRUE; -} - -static void -load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg) -{ - struct nv50_reg *iv, **ppiv; - unsigned mode = pc->interp_mode[reg->index]; - - ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p; - iv = *ppiv; - - if ((mode & INTERP_PERSPECTIVE) && !iv) { - iv = *ppiv = alloc_temp(pc, NULL); - iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1; - - emit_interp(pc, iv, NULL, mode & INTERP_CENTROID); - emit_flop(pc, NV50_FLOP_RCP, iv, iv); - - /* XXX: when loading interpolants dynamically, move these - * to the program head, or make sure it can't be skipped. - */ - } - - emit_interp(pc, reg, iv, mode); -} - -/* The face input is always at v[255] (varying space), with a - * value of 0 for back-facing, and 0xffffffff for front-facing. - */ -static void -load_frontfacing(struct nv50_pc *pc, struct nv50_reg *sv) -{ - struct nv50_reg *temp = alloc_temp(pc, NULL); - int r_pred = 0; - - temp->rhw = 255; - emit_interp(pc, temp, NULL, INTERP_FLAT); - - emit_cvt(pc, sv, temp, r_pred, CVT_ABS | CVT_F32_S32); - - emit_not(pc, temp, temp); - set_pred(pc, 0x2, r_pred, pc->p->exec_tail); - emit_cvt(pc, sv, temp, -1, CVT_F32_S32); - set_pred(pc, 0x2, r_pred, pc->p->exec_tail); - - free_temp(pc, temp); -} - -static void -load_instance_id(struct nv50_pc *pc, unsigned index) -{ - struct nv50_reg reg, mem; - - ctor_reg(®, P_TEMP, -1, -1); - ctor_reg(&mem, P_CONST, -1, 24); /* startInstance */ - mem.buf_index = 2; - - emit_add_b32(pc, ®, &pc->sysval[index], &mem); - pc->sysval[index] = reg; -} - -static void -copy_semantic_info(struct nv50_program *p) -{ - unsigned i, id; - - for (i = 0; i < p->cfg.in_nr; ++i) { - id = p->cfg.in[i].id; - p->cfg.in[i].sn = p->info.input_semantic_name[id]; - p->cfg.in[i].si = p->info.input_semantic_index[id]; - } - - for (i = 0; i < p->cfg.out_nr; ++i) { - id = p->cfg.out[i].id; - p->cfg.out[i].sn = p->info.output_semantic_name[id]; - p->cfg.out[i].si = p->info.output_semantic_index[id]; - } -} - -static boolean -nv50_program_tx_prep(struct nv50_pc *pc) -{ - struct tgsi_parse_context tp; - struct nv50_program *p = pc->p; - boolean ret = FALSE; - unsigned i, c, instance_id = 0, vertex_id = 0, flat_nr = 0; - - tgsi_parse_init(&tp, pc->p->pipe.tokens); - while (!tgsi_parse_end_of_tokens(&tp)) { - const union tgsi_full_token *tok = &tp.FullToken; - - tgsi_parse_token(&tp); - switch (tok->Token.Type) { - case TGSI_TOKEN_TYPE_IMMEDIATE: - { - const struct tgsi_full_immediate *imm = - &tp.FullToken.FullImmediate; - - ctor_immd_4f32(pc, imm->u[0].Float, - imm->u[1].Float, - imm->u[2].Float, - imm->u[3].Float); - } - break; - case TGSI_TOKEN_TYPE_DECLARATION: - { - const struct tgsi_full_declaration *d; - unsigned si, last, first, mode; - - d = &tp.FullToken.FullDeclaration; - first = d->Range.First; - last = d->Range.Last; - - switch (d->Declaration.File) { - case TGSI_FILE_TEMPORARY: - break; - case TGSI_FILE_OUTPUT: - if (!d->Declaration.Semantic || - p->type == PIPE_SHADER_FRAGMENT) - break; - - si = d->Semantic.Index; - switch (d->Semantic.Name) { - case TGSI_SEMANTIC_BCOLOR: - p->cfg.two_side[si].hw = first; - if (p->cfg.out_nr > first) - p->cfg.out_nr = first; - break; - case TGSI_SEMANTIC_PSIZE: - p->cfg.psiz = first; - if (p->cfg.out_nr > first) - p->cfg.out_nr = first; - break; - case TGSI_SEMANTIC_EDGEFLAG: - pc->edgeflag_out = first; - break; - /* - case TGSI_SEMANTIC_CLIP_DISTANCE: - p->cfg.clpd = MIN2(p->cfg.clpd, first); - break; - */ - default: - break; - } - break; - case TGSI_FILE_INPUT: - { - if (p->type != PIPE_SHADER_FRAGMENT) - break; - - switch (d->Declaration.Interpolate) { - case TGSI_INTERPOLATE_CONSTANT: - mode = INTERP_FLAT; - flat_nr++; - break; - case TGSI_INTERPOLATE_PERSPECTIVE: - mode = INTERP_PERSPECTIVE; - p->cfg.regs[1] |= 0x08 << 24; - break; - default: - mode = INTERP_LINEAR; - break; - } - if (d->Declaration.Centroid) - mode |= INTERP_CENTROID; - - assert(last < 32); - for (i = first; i <= last; i++) - pc->interp_mode[i] = mode; - } - break; - case TGSI_FILE_SYSTEM_VALUE: - assert(d->Declaration.Semantic); - switch (d->Semantic.Name) { - case TGSI_SEMANTIC_FACE: - assert(p->type == PIPE_SHADER_FRAGMENT); - load_frontfacing(pc, - &pc->sysval[first]); - break; - case TGSI_SEMANTIC_INSTANCEID: - assert(p->type == PIPE_SHADER_VERTEX); - instance_id = first; - p->cfg.regs[0] |= (1 << 4); - break; - case TGSI_SEMANTIC_PRIMID: - assert(p->type != PIPE_SHADER_VERTEX); - p->cfg.prim_id = first; - break; - /* - case TGSI_SEMANTIC_PRIMIDIN: - assert(p->type == PIPE_SHADER_GEOMETRY); - pc->sysval[first].hw = 6; - p->cfg.regs[0] |= (1 << 8); - break; - case TGSI_SEMANTIC_VERTEXID: - assert(p->type == PIPE_SHADER_VERTEX); - vertex_id = first; - p->cfg.regs[0] |= (1 << 12) | (1 << 0); - break; - */ - } - break; - case TGSI_FILE_ADDRESS: - case TGSI_FILE_CONSTANT: - case TGSI_FILE_SAMPLER: - break; - default: - NOUVEAU_ERR("bad decl file %d\n", - d->Declaration.File); - goto out_err; - } - } - break; - case TGSI_TOKEN_TYPE_INSTRUCTION: - pc->insn_nr++; - prep_inspect_insn(pc, &tok->FullInstruction); - break; - default: - break; - } - } - - if (p->type == PIPE_SHADER_VERTEX || p->type == PIPE_SHADER_GEOMETRY) { - int rid = 0; - - if (p->type == PIPE_SHADER_GEOMETRY) { - for (i = 0; i < pc->attr_nr; ++i) { - p->cfg.in[i].hw = rid; - p->cfg.in[i].id = i; - - for (c = 0; c < 4; ++c) { - int n = i * 4 + c; - if (!pc->attr[n].acc) - continue; - pc->attr[n].hw = rid++; - p->cfg.in[i].mask |= 1 << c; - } - } - } else { - for (i = 0; i < pc->attr_nr * 4; ++i) { - if (pc->attr[i].acc) { - pc->attr[i].hw = rid++; - p->cfg.attr[i / 32] |= 1 << (i % 32); - } - } - if (p->cfg.regs[0] & (1 << 0)) - pc->sysval[vertex_id].hw = rid++; - if (p->cfg.regs[0] & (1 << 4)) { - pc->sysval[instance_id].hw = rid++; - load_instance_id(pc, instance_id); - } - } - - for (i = 0, rid = 0; i < pc->result_nr; ++i) { - p->cfg.out[i].hw = rid; - p->cfg.out[i].id = i; - - for (c = 0; c < 4; ++c) { - int n = i * 4 + c; - if (!pc->result[n].acc) - continue; - pc->result[n].hw = rid++; - p->cfg.out[i].mask |= 1 << c; - } - } - if (p->cfg.prim_id < 0x40) { - /* GP has to write to PrimitiveID */ - ctor_reg(&pc->sysval[p->cfg.prim_id], - P_RESULT, p->cfg.prim_id, rid); - p->cfg.prim_id = rid++; - } - - for (c = 0; c < 2; ++c) - if (p->cfg.two_side[c].hw < 0x40) - p->cfg.two_side[c] = p->cfg.out[ - p->cfg.two_side[c].hw]; - - if (p->cfg.psiz < 0x40) - p->cfg.psiz = p->cfg.out[p->cfg.psiz].hw; - - copy_semantic_info(p); - } else - if (p->type == PIPE_SHADER_FRAGMENT) { - int rid = 0, aid; - unsigned n = 0, m = pc->attr_nr - flat_nr; - - pc->allow32 = TRUE; - - /* do we read FragCoord ? */ - if (pc->attr_nr && - p->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) { - /* select FCRD components we want accessible */ - for (c = 0; c < 4; ++c) - if (pc->attr[c].acc) - p->cfg.regs[1] |= 1 << (24 + c); - aid = 0; - } else /* offset by 1 if FCRD.w is needed for pinterp */ - aid = popcnt4(p->cfg.regs[1] >> 24); - - /* non-flat interpolants have to be mapped to - * the lower hardware IDs, so sort them: - */ - for (i = 0; i < pc->attr_nr; i++) { - if (pc->interp_mode[i] == INTERP_FLAT) - p->cfg.in[m++].id = i; - else { - if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE)) - p->cfg.in[n].linear = TRUE; - p->cfg.in[n++].id = i; - } - } - copy_semantic_info(p); - - for (n = 0; n < pc->attr_nr; ++n) { - p->cfg.in[n].hw = rid = aid; - i = p->cfg.in[n].id; - - if (p->info.input_semantic_name[i] == - TGSI_SEMANTIC_FACE) { - load_frontfacing(pc, &pc->attr[i * 4]); - continue; - } - - for (c = 0; c < 4; ++c) { - if (!pc->attr[i * 4 + c].acc) - continue; - pc->attr[i * 4 + c].rhw = rid++; - p->cfg.in[n].mask |= 1 << c; - - load_interpolant(pc, &pc->attr[i * 4 + c]); - } - aid += popcnt4(p->cfg.in[n].mask); - } - - m = popcnt4(p->cfg.regs[1] >> 24); - - /* set count of non-position inputs and of non-flat - * non-position inputs for FP_INTERPOLANT_CTRL - */ - p->cfg.regs[1] |= aid - m; - - if (flat_nr) { - i = p->cfg.in[pc->attr_nr - flat_nr].hw; - p->cfg.regs[1] |= (i - m) << 16; - } else - p->cfg.regs[1] |= p->cfg.regs[1] << 16; - - /* mark color semantic for light-twoside */ - n = 0x80; - for (i = 0; i < p->cfg.in_nr; i++) { - if (p->cfg.in[i].sn == TGSI_SEMANTIC_COLOR) { - n = MIN2(n, p->cfg.in[i].hw - m); - p->cfg.two_side[p->cfg.in[i].si] = p->cfg.in[i]; - - p->cfg.regs[0] += /* increase colour count */ - popcnt4(p->cfg.in[i].mask) << 16; - } - } - if (n < 0x80) - p->cfg.regs[0] += n; - - if (p->cfg.prim_id < 0x40) { - pc->sysval[p->cfg.prim_id].rhw = rid++; - emit_interp(pc, &pc->sysval[p->cfg.prim_id], NULL, - INTERP_FLAT); - /* increase FP_INTERPOLANT_CTRL_COUNT */ - p->cfg.regs[1] += 1; - } - - /* Initialize FP results: - * FragDepth is always first TGSI and last hw output - */ - i = p->info.writes_z ? 4 : 0; - for (rid = 0; i < pc->result_nr * 4; i++) - pc->result[i].rhw = rid++; - if (p->info.writes_z) - pc->result[2].rhw = rid++; - - p->cfg.high_result = rid; - - /* separate/different colour results for MRTs ? */ - if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1) - p->cfg.regs[2] |= 1; - } - - if (pc->immd_nr) { - int rid = 0; - - pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg)); - if (!pc->immd) - goto out_err; - - for (i = 0; i < pc->immd_nr; i++) { - for (c = 0; c < 4; c++, rid++) - ctor_reg(&pc->immd[rid], P_IMMD, i, rid); - } - } - - ret = TRUE; -out_err: - if (pc->iv_p) - free_temp(pc, pc->iv_p); - if (pc->iv_c) - free_temp(pc, pc->iv_c); - - tgsi_parse_free(&tp); - return ret; -} - -static void -free_nv50_pc(struct nv50_pc *pc) -{ - if (pc->immd) - FREE(pc->immd); - if (pc->param) - FREE(pc->param); - if (pc->result) - FREE(pc->result); - if (pc->attr) - FREE(pc->attr); - if (pc->temp) - FREE(pc->temp); - if (pc->sysval) - FREE(pc->sysval); - if (pc->insn_pos) - FREE(pc->insn_pos); - - FREE(pc); -} - -static INLINE uint32_t -nv50_map_gs_output_prim(unsigned pprim) -{ - switch (pprim) { - case PIPE_PRIM_POINTS: - return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_POINTS; - case PIPE_PRIM_LINE_STRIP: - return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP; - case PIPE_PRIM_TRIANGLE_STRIP: - return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP; - default: - NOUVEAU_ERR("invalid GS_OUTPUT_PRIMITIVE: %u\n", pprim); - abort(); - return 0; - } -} - -static boolean -ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p) -{ - int i, c; - unsigned rtype[2] = { P_ATTR, P_RESULT }; - - pc->p = p; - pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1; - pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1; - pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1; - pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1; - pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1; - assert(pc->addr_nr <= 2); - pc->sysval_nr = p->info.file_max[TGSI_FILE_SYSTEM_VALUE] + 1; - - p->cfg.high_temp = 4; - - p->cfg.two_side[0].hw = 0x40; - p->cfg.two_side[1].hw = 0x40; - p->cfg.prim_id = 0x40; - - p->cfg.edgeflag_in = pc->edgeflag_out = 0xff; - - for (i = 0; i < p->info.num_properties; ++i) { - unsigned *data = &p->info.properties[i].data[0]; - - switch (p->info.properties[i].name) { - case TGSI_PROPERTY_GS_OUTPUT_PRIM: - p->cfg.prim_type = nv50_map_gs_output_prim(data[0]); - break; - case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES: - p->cfg.vert_count = data[0]; - break; - default: - break; - } - } - - switch (p->type) { - case PIPE_SHADER_VERTEX: - p->cfg.psiz = 0x40; - p->cfg.clpd = 0x40; - p->cfg.out_nr = pc->result_nr; - break; - case PIPE_SHADER_GEOMETRY: - assert(p->cfg.prim_type); - assert(p->cfg.vert_count); - - p->cfg.psiz = 0x80; - p->cfg.clpd = 0x80; - p->cfg.prim_id = 0x80; - p->cfg.out_nr = pc->result_nr; - p->cfg.in_nr = pc->attr_nr; - - p->cfg.two_side[0].hw = 0x80; - p->cfg.two_side[1].hw = 0x80; - break; - case PIPE_SHADER_FRAGMENT: - rtype[0] = rtype[1] = P_TEMP; - - p->cfg.regs[0] = 0x01000004; - p->cfg.in_nr = pc->attr_nr; - - if (p->info.writes_z) { - p->cfg.regs[2] |= 0x00000100; - p->cfg.regs[3] |= 0x00000011; - } - if (p->info.uses_kill) - p->cfg.regs[2] |= 0x00100000; - break; - } - - if (pc->temp_nr) { - pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg)); - if (!pc->temp) - return FALSE; - - for (i = 0; i < pc->temp_nr * 4; ++i) - ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1); - } - - if (pc->attr_nr) { - pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg)); - if (!pc->attr) - return FALSE; - - for (i = 0; i < pc->attr_nr * 4; ++i) - ctor_reg(&pc->attr[i], rtype[0], i / 4, -1); - } - - if (pc->result_nr) { - unsigned nr = pc->result_nr * 4; - - pc->result = MALLOC(nr * sizeof(struct nv50_reg)); - if (!pc->result) - return FALSE; - - for (i = 0; i < nr; ++i) - ctor_reg(&pc->result[i], rtype[1], i / 4, -1); - } - - if (pc->param_nr) { - int rid = 0; - - pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg)); - if (!pc->param) - return FALSE; - - for (i = 0; i < pc->param_nr; ++i) - for (c = 0; c < 4; ++c, ++rid) - ctor_reg(&pc->param[rid], P_CONST, i, rid); - } - - if (pc->addr_nr) { - pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *)); - if (!pc->addr) - return FALSE; - } - for (i = 0; i < NV50_SU_MAX_ADDR; ++i) - ctor_reg(&pc->r_addr[i], P_ADDR, -1, i + 1); - - if (pc->sysval_nr) { - pc->sysval = CALLOC(pc->sysval_nr, sizeof(struct nv50_reg *)); - if (!pc->sysval) - return FALSE; - /* will only ever use SYSTEM_VALUE[i].x (hopefully) */ - for (i = 0; i < pc->sysval_nr; ++i) - ctor_reg(&pc->sysval[i], rtype[0], i, -1); - } - - return TRUE; -} - -static void -nv50_program_fixup_insns(struct nv50_pc *pc) -{ - struct nv50_program_exec *e, **bra_list; - unsigned i, n, pos; - - bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *)); - - /* Collect branch instructions, we need to adjust their offsets - * when converting 32 bit instructions to 64 bit ones - */ - for (n = 0, e = pc->p->exec_head; e; e = e->next) - if (e->param.index >= 0 && !e->param.mask) - bra_list[n++] = e; - - /* Make sure we don't have any single 32 bit instructions. */ - for (e = pc->p->exec_head, pos = 0; e; e = e->next) { - pos += is_long(e) ? 2 : 1; - - if ((pos & 1) && (!e->next || is_long(e->next))) { - for (i = 0; i < n; ++i) - if (bra_list[i]->param.index >= pos) - bra_list[i]->param.index += 1; - for (i = 0; i < pc->insn_nr; ++i) - if (pc->insn_pos[i] >= pos) - pc->insn_pos[i] += 1; - convert_to_long(pc, e); - ++pos; - } - } - - FREE(bra_list); - - if (!pc->p->info.opcode_count[TGSI_OPCODE_CAL]) - return; - - /* fill in CALL offsets */ - for (e = pc->p->exec_head; e; e = e->next) { - if ((e->inst[0] & 2) && (e->inst[0] >> 28) == 0x2) - e->param.index = pc->insn_pos[e->param.index]; - } -} - -static boolean -nv50_program_tx(struct nv50_program *p) -{ - struct tgsi_parse_context parse; - struct nv50_pc *pc; - boolean ret; - - pc = CALLOC_STRUCT(nv50_pc); - if (!pc) - return FALSE; - - ret = ctor_nv50_pc(pc, p); - if (ret == FALSE) - goto out_cleanup; - - ret = nv50_program_tx_prep(pc); - if (ret == FALSE) - goto out_cleanup; - - pc->insn_pos = MALLOC(pc->insn_nr * sizeof(unsigned)); - - tgsi_parse_init(&parse, pc->p->pipe.tokens); - while (!tgsi_parse_end_of_tokens(&parse)) { - const union tgsi_full_token *tok = &parse.FullToken; - - /* previously allow32 was FALSE for first & last instruction */ - pc->allow32 = TRUE; - - tgsi_parse_token(&parse); - - switch (tok->Token.Type) { - case TGSI_TOKEN_TYPE_INSTRUCTION: - pc->insn_pos[pc->insn_cur] = pc->p->exec_size; - ++pc->insn_cur; - ret = nv50_tgsi_insn(pc, tok); - if (ret == FALSE) - goto out_err; - break; - default: - break; - } - } - - nv50_program_fixup_insns(pc); - - p->param_nr = pc->param_nr * 4; - p->immd_nr = pc->immd_nr * 4; - p->immd = pc->immd_buf; - -out_err: - tgsi_parse_free(&parse); - -out_cleanup: - free_nv50_pc(pc); - return ret; -} - -static void -nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p) -{ - if (nv50_program_tx(p) == FALSE) - assert(0); - p->translated = TRUE; -} - -static void -nv50_program_upload_data(struct nv50_context *nv50, uint32_t *map, - unsigned start, unsigned count, unsigned cbuf) -{ - struct nouveau_channel *chan = nv50->screen->base.channel; - struct nouveau_grobj *tesla = nv50->screen->tesla; - - while (count) { - unsigned nr = count > 2047 ? 2047 : count; - - BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); - OUT_RING (chan, (cbuf << 0) | (start << 8)); - BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); - OUT_RINGp (chan, map, nr); - - map += nr; - start += nr; - count -= nr; - } -} - -static void -nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) -{ - struct pipe_context *pipe = &nv50->pipe; - struct pipe_transfer *transfer; - - if (!p->data[0] && p->immd_nr) { - struct nouveau_resource *heap = nv50->screen->immd_heap; - - if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) { - while (heap->next && heap->size < p->immd_nr) { - struct nv50_program *evict = heap->next->priv; - nouveau_resource_free(&evict->data[0]); - } - - if (nouveau_resource_alloc(heap, p->immd_nr, p, - &p->data[0])) - assert(0); - } - - /* immediates only need to be uploaded again when freed */ - nv50_program_upload_data(nv50, p->immd, p->data[0]->start, - p->immd_nr, NV50_CB_PMISC); - } - - assert(p->param_nr <= 16384); - - if (p->param_nr) { - unsigned cb; - uint32_t *map = pipe_buffer_map(pipe, - nv50->constbuf[p->type], - PIPE_TRANSFER_READ, - &transfer); - switch (p->type) { - case PIPE_SHADER_GEOMETRY: cb = NV50_CB_PGP; break; - case PIPE_SHADER_FRAGMENT: cb = NV50_CB_PFP; break; - default: - cb = NV50_CB_PVP; - assert(p->type == PIPE_SHADER_VERTEX); - break; - } - - nv50_program_upload_data(nv50, map, 0, p->param_nr, cb); - pipe_buffer_unmap(pipe, nv50->constbuf[p->type], - transfer); - } -} - -static void -nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) -{ - struct nouveau_channel *chan = nv50->screen->base.channel; - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program_exec *e; - uint32_t *up, i; - boolean upload = FALSE; - unsigned offset; - int width; - - if (!p->bo) { - nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, - p->exec_size * 4, &p->bo); - upload = TRUE; - } - - if (p->data[0] && p->data[0]->start != p->data_start[0]) - upload = TRUE; - - if (!upload) - return; - - up = MALLOC(p->exec_size * 4); - - for (i = 0, e = p->exec_head; e; e = e->next) { - unsigned ei, ci, bs; - - if (e->param.index >= 0 && e->param.mask) { - bs = (e->inst[1] >> 22) & 0x07; - assert(bs < 2); - ei = e->param.shift >> 5; - ci = e->param.index; - if (bs == 0) - ci += p->data[bs]->start; - - e->inst[ei] &= ~e->param.mask; - e->inst[ei] |= (ci << e->param.shift); - } else - if (e->param.index >= 0) { - /* zero mask means param is a jump/branch offset */ - assert(!(e->param.index & 1)); - /* seem to be 8 byte steps */ - ei = (e->param.index >> 1) + 0 /* START_ID */; - - e->inst[0] &= 0xf0000fff; - e->inst[0] |= ei << 12; - } - - up[i++] = e->inst[0]; - if (is_long(e)) - up[i++] = e->inst[1]; - } - assert(i == p->exec_size); - - if (p->data[0]) - p->data_start[0] = p->data[0]->start; - -#ifdef NV50_PROGRAM_DUMP - NOUVEAU_ERR("-------\n"); - for (e = p->exec_head; e; e = e->next) { - NOUVEAU_ERR("0x%08x\n", e->inst[0]); - if (is_long(e)) - NOUVEAU_ERR("0x%08x\n", e->inst[1]); - } -#endif - - /* SIFC_HEIGHT/SIFC_WIDTH of 65536 do not work, and are not reported - * as data error either. hw bug ? */ -#define SIFC_MAX_WIDTH (65536 - 256) - offset = 0; - width = p->exec_size * 4; - while (width > 0) { - nv50_upload_sifc(nv50, p->bo, offset, NOUVEAU_BO_VRAM, - NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144, - &up[offset / 4], NV50_2D_SIFC_FORMAT_R8_UNORM, - 0, 0, 0, MIN2(SIFC_MAX_WIDTH, width), 1, 1); - width -= SIFC_MAX_WIDTH; - offset += SIFC_MAX_WIDTH; - } - BEGIN_RING(chan, tesla, NV50TCL_CODE_CB_FLUSH, 1); - OUT_RING (chan, 0); - - FREE(up); -} - -struct nouveau_stateobj * -nv50_vertprog_validate(struct nv50_context *nv50) -{ - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program *p = nv50->vertprog; - struct nouveau_stateobj *so; - - if (!p->translated) { - nv50_program_validate(nv50, p); - if (!p->translated) - assert(0); - } - - nv50_program_validate_data(nv50, p); - nv50_program_validate_code(nv50, p); - - if (!(nv50->dirty & NV50_NEW_VERTPROG)) - return NULL; - - so = so_new(5, 7, 2); - so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_LOW, 0, 0); - so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); - so_data (so, p->cfg.attr[0]); - so_data (so, p->cfg.attr[1]); - so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); - so_data (so, p->cfg.high_result); - so_method(so, tesla, NV50TCL_VP_REG_ALLOC_TEMP, 1); - so_data (so, p->cfg.high_temp); - so_method(so, tesla, NV50TCL_VP_START_ID, 1); - so_data (so, 0); /* program start offset */ - return so; -} - -struct nouveau_stateobj * -nv50_fragprog_validate(struct nv50_context *nv50) -{ - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program *p = nv50->fragprog; - struct nouveau_stateobj *so; - - if (!p->translated) { - nv50_program_validate(nv50, p); - if (!p->translated) - assert(0); - } - - nv50_program_validate_data(nv50, p); - nv50_program_validate_code(nv50, p); - - if (!(nv50->dirty & NV50_NEW_FRAGPROG)) - return NULL; - - so = so_new(6, 7, 2); - so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_LOW, 0, 0); - so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); - so_data (so, p->cfg.high_temp); - so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); - so_data (so, p->cfg.high_result); - so_method(so, tesla, NV50TCL_FP_CONTROL, 1); - so_data (so, p->cfg.regs[2]); - so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); - so_data (so, p->cfg.regs[3]); - so_method(so, tesla, NV50TCL_FP_START_ID, 1); - so_data (so, 0); /* program start offset */ - return so; -} - -struct nouveau_stateobj * -nv50_geomprog_validate(struct nv50_context *nv50) -{ - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program *p = nv50->geomprog; - struct nouveau_stateobj *so; - - if (!p->translated) { - nv50_program_validate(nv50, p); - if (!p->translated) - assert(0); - } - - nv50_program_validate_data(nv50, p); - nv50_program_validate_code(nv50, p); - - if (!(nv50->dirty & NV50_NEW_GEOMPROG)) - return NULL; - - so = so_new(6, 7, 2); - so_method(so, tesla, NV50TCL_GP_ADDRESS_HIGH, 2); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_LOW, 0, 0); - so_method(so, tesla, NV50TCL_GP_REG_ALLOC_TEMP, 1); - so_data (so, p->cfg.high_temp); - so_method(so, tesla, NV50TCL_GP_REG_ALLOC_RESULT, 1); - so_data (so, p->cfg.high_result); - so_method(so, tesla, NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE, 1); - so_data (so, p->cfg.prim_type); - so_method(so, tesla, NV50TCL_GP_VERTEX_OUTPUT_COUNT, 1); - so_data (so, p->cfg.vert_count); - so_method(so, tesla, NV50TCL_GP_START_ID, 1); - so_data (so, 0); - return so; -} - -static uint32_t -nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base) -{ - struct nv50_program *vp; - struct nv50_program *fp = nv50->fragprog; - unsigned i, c, m = base; - uint32_t origin = 0x00000010; - - vp = nv50->geomprog ? nv50->geomprog : nv50->vertprog; - - /* XXX: this might not work correctly in all cases yet - we'll - * just assume that an FP generic input that is not written in - * the VP is PointCoord. - */ - memset(pntc, 0, 8 * sizeof(uint32_t)); - - for (i = 0; i < fp->cfg.in_nr; i++) { - unsigned j, n = popcnt4(fp->cfg.in[i].mask); - - if (fp->cfg.in[i].sn != TGSI_SEMANTIC_GENERIC) { - m += n; - continue; - } - - for (j = 0; j < vp->cfg.out_nr; ++j) - if (vp->cfg.out[j].sn == fp->cfg.in[i].sn && - vp->cfg.out[j].si == fp->cfg.in[i].si) - break; - - if (j < vp->info.num_outputs) { - ubyte enable = - (nv50->rasterizer->pipe.sprite_coord_enable >> vp->cfg.out[j].si) & 1; - - if (enable == 0) { - m += n; - continue; - } - } - - /* this is either PointCoord or replaced by sprite coords */ - for (c = 0; c < 4; c++) { - if (!(fp->cfg.in[i].mask & (1 << c))) - continue; - pntc[m / 8] |= (c + 1) << ((m % 8) * 4); - ++m; - } - } - return (nv50->rasterizer->pipe.sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT ? 0 : origin); +nv50_fragprog_prepare(struct nv50_translation_info *ti) +{ + struct nv50_program *p = ti->p; + int i, j, c; + unsigned nvary, nintp, depr; + unsigned n = 0, m = 0, skip = 0; + ubyte sn[16], si[16]; + + /* FP flags */ + + if (ti->scan.writes_z) { + p->fp.flags[1] = 0x11; + p->fp.flags[0] |= NV50TCL_FP_CONTROL_EXPORTS_Z; + } + + if (ti->scan.uses_kill) + p->fp.flags[0] |= NV50TCL_FP_CONTROL_USES_KIL; + + /* FP inputs */ + + ti->input_file = NV_FILE_MEM_V; + ti->output_file = NV_FILE_GPR; + + /* count non-flat inputs, save semantic info */ + for (i = 0; i < p->in_nr; ++i) { + m += (ti->interp_mode[i] & NV50_INTERP_FLAT) ? 0 : 1; + sn[i] = p->in[i].sn; + si[i] = p->in[i].si; + } + + /* reorder p->in[] so that non-flat inputs are first and + * kick out special inputs that don't use VP/GP_RESULT_MAP + */ + nintp = 0; + for (i = 0; i < p->in_nr; ++i) { + if (sn[i] == TGSI_SEMANTIC_POSITION) { + for (c = 0; c < 4; ++c) { + ti->input_map[i][c] = nintp; + if (ti->input_access[i][c]) { + p->fp.interp |= 1 << (24 + c); + ++nintp; + } + } + skip++; + continue; + } else + if (sn[i] == TGSI_SEMANTIC_FACE) { + ti->input_map[i][0] = 255; + skip++; + continue; + } + + j = (ti->interp_mode[i] & NV50_INTERP_FLAT) ? m++ : n++; + + if (sn[i] == TGSI_SEMANTIC_COLOR) + p->vp.bfc[si[i]] = j; + + p->in[j].linear = (ti->interp_mode[i] & NV50_INTERP_LINEAR) ? 1 : 0; + p->in[j].id = i; + p->in[j].sn = sn[i]; + p->in[j].si = si[i]; + } + assert(n <= m); + p->in_nr -= skip; + + if (!(p->fp.interp & (8 << 24))) { + p->fp.interp |= (8 << 24); + ++nintp; + } + + p->fp.colors = (1 << 24) | 4; /* CLAMP, FFC0_ID = 4 */ + + for (i = 0; i < p->in_nr; ++i) { + int j = p->in[i].id; + p->in[i].hw = nintp; + + for (c = 0; c < 4; ++c) { + if (!ti->input_access[j][c]) + continue; + p->in[i].mask |= 1 << c; + ti->input_map[j][c] = nintp++; + } + /* count color inputs */ + if (i == p->vp.bfc[0] || i == p->vp.bfc[1]) + p->fp.colors += bitcount4(p->in[i].mask) << 16; + } + nintp -= bitcount4(p->fp.interp >> 24); /* subtract position inputs */ + nvary = nintp; + if (n < m) + nvary -= p->in[n].hw; + + p->fp.interp |= nvary << NV50TCL_FP_INTERPOLANT_CTRL_COUNT_NONFLAT_SHIFT; + p->fp.interp |= nintp << NV50TCL_FP_INTERPOLANT_CTRL_COUNT_SHIFT; + + /* FP outputs */ + + if (p->out_nr > (1 + (ti->scan.writes_z ? 1 : 0))) + p->fp.flags[0] |= NV50TCL_FP_CONTROL_MULTIPLE_RESULTS; + + depr = p->out_nr; + for (i = 0; i < p->out_nr; ++i) { + p->out[i].id = i; + if (p->out[i].sn == TGSI_SEMANTIC_POSITION) { + depr = i; + continue; + } + p->out[i].hw = p->max_out; + p->out[i].mask = 0xf; + + for (c = 0; c < 4; ++c) + ti->output_map[i][c] = p->max_out++; + } + if (depr < p->out_nr) { + p->out[depr].mask = 0x4; + p->out[depr].hw = ti->output_map[depr][2] = p->max_out++; + } else { + /* allowed values are 1, 4, 5, 8, 9, ... */ + p->max_out = MAX2(4, p->max_out); + } + + return 0; } static int -nv50_vec4_map(uint32_t *map32, int mid, uint8_t zval, uint32_t lin[4], - struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo) +nv50_geomprog_prepare(struct nv50_translation_info *ti) { - int c; - uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw; - uint8_t *map = (uint8_t *)map32; - - for (c = 0; c < 4; ++c) { - if (mf & 1) { - if (fpi->linear == TRUE) - lin[mid / 32] |= 1 << (mid % 32); - if (mv & 1) - map[mid] = oid; - else - map[mid] = (c == 3) ? (zval + 1) : zval; - ++mid; - } - - oid += mv & 1; - mf >>= 1; - mv >>= 1; - } + ti->input_file = NV_FILE_MEM_S; + ti->output_file = NV_FILE_OUT; - return mid; -} - -struct nouveau_stateobj * -nv50_fp_linkage_validate(struct nv50_context *nv50) -{ - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program *vp = nv50->vertprog; - struct nv50_program *fp = nv50->fragprog; - struct nouveau_stateobj *so; - struct nv50_sreg4 dummy; - int i, n, c, m = 0; - uint32_t map[16], lin[4], reg[6], pcrd[8]; - uint8_t zval = 0x40; - - if (nv50->geomprog) { - vp = nv50->geomprog; - zval = 0x80; - } - memset(map, 0, sizeof(map)); - memset(lin, 0, sizeof(lin)); - - reg[1] = 0x00000004; /* low and high clip distance map ids */ - reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */ - reg[3] = 0x00000000; /* point size map id & enable */ - reg[5] = 0x00000000; /* primitive ID map slot */ - reg[0] = fp->cfg.regs[0]; /* colour semantic reg */ - reg[4] = fp->cfg.regs[1]; /* interpolant info */ - - dummy.linear = FALSE; - dummy.mask = 0xf; /* map all components of HPOS */ - m = nv50_vec4_map(map, m, zval, lin, &dummy, &vp->cfg.out[0]); - - dummy.mask = 0x0; - - if (vp->cfg.clpd < 0x40) { - for (c = 0; c < vp->cfg.clpd_nr; ++c) { - map[m / 4] |= (vp->cfg.clpd + c) << ((m % 4) * 8); - ++m; - } - reg[1] = (m << 8); - } - - reg[0] |= m << 8; /* adjust BFC0 id */ - - /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ - if (nv50->rasterizer->pipe.light_twoside) { - struct nv50_sreg4 *vpo = &vp->cfg.two_side[0]; - struct nv50_sreg4 *fpi = &fp->cfg.two_side[0]; - - m = nv50_vec4_map(map, m, zval, lin, &fpi[0], &vpo[0]); - m = nv50_vec4_map(map, m, zval, lin, &fpi[1], &vpo[1]); - } - - reg[0] += m - 4; /* adjust FFC0 id */ - reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */ - - for (i = 0; i < fp->cfg.in_nr; i++) { - /* maybe even remove these from cfg.io */ - if (fp->cfg.in[i].sn == TGSI_SEMANTIC_POSITION || - fp->cfg.in[i].sn == TGSI_SEMANTIC_FACE) - continue; - - for (n = 0; n < vp->cfg.out_nr; ++n) - if (vp->cfg.out[n].sn == fp->cfg.in[i].sn && - vp->cfg.out[n].si == fp->cfg.in[i].si) - break; - - m = nv50_vec4_map(map, m, zval, lin, &fp->cfg.in[i], - (n < vp->cfg.out_nr) ? - &vp->cfg.out[n] : &dummy); - } - /* PrimitiveID either is replaced by the system value, or - * written by the geometry shader into an output register - */ - if (fp->cfg.prim_id < 0x40) { - map[m / 4] |= vp->cfg.prim_id << ((m % 4) * 8); - reg[5] = m++; - } - - if (nv50->rasterizer->pipe.point_size_per_vertex) { - map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8); - reg[3] = (m++ << 4) | 1; - } - - /* now fill the stateobj (at most 28 so_data) */ - so = so_new(10, 54, 0); - - n = (m + 3) / 4; - assert(m <= 64); - if (vp->type == PIPE_SHADER_GEOMETRY) { - so_method(so, tesla, NV50TCL_GP_RESULT_MAP_SIZE, 1); - so_data (so, m); - so_method(so, tesla, NV50TCL_GP_RESULT_MAP(0), n); - so_datap (so, map, n); - } else { - so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); - so_data (so, vp->cfg.regs[0]); - - so_method(so, tesla, NV50TCL_MAP_SEMANTIC_4, 1); - so_data (so, reg[5]); - - so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); - so_data (so, m); - so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); - so_datap (so, map, n); - } - - so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); - so_datap (so, reg, 4); - - so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); - so_data (so, reg[4]); - - so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4); - so_datap (so, lin, 4); - - if (nv50->rasterizer->pipe.sprite_coord_enable) { - so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1); - so_data (so, - nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff)); - - so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); - so_datap (so, pcrd, 8); - } - - so_method(so, tesla, NV50TCL_GP_ENABLE, 1); - so_data (so, (vp->type == PIPE_SHADER_GEOMETRY) ? 1 : 0); - - return so; + assert(0); + return 1; } static int -construct_vp_gp_mapping(uint32_t *map32, int m, - struct nv50_program *vp, struct nv50_program *gp) +nv50_prog_scan(struct nv50_translation_info *ti) { - uint8_t *map = (uint8_t *)map32; - int i, j, c; + struct nv50_program *p = ti->p; + struct tgsi_parse_context parse; + int ret, i; - for (i = 0; i < gp->cfg.in_nr; ++i) { - uint8_t oid = 0, mv = 0, mg = gp->cfg.in[i].mask; + p->vp.edgeflag = 0x40; + p->vp.psiz = 0x40; + p->vp.bfc[0] = 0x40; + p->vp.bfc[1] = 0x40; + p->gp.primid = 0x80; - for (j = 0; j < vp->cfg.out_nr; ++j) { - if (vp->cfg.out[j].sn == gp->cfg.in[i].sn && - vp->cfg.out[j].si == gp->cfg.in[i].si) { - mv = vp->cfg.out[j].mask; - oid = vp->cfg.out[j].hw; - break; - } - } + tgsi_scan_shader(p->pipe.tokens, &ti->scan); - for (c = 0; c < 4; ++c, mv >>= 1, mg >>= 1) { - if (mg & mv & 1) - map[m++] = oid; - else - if (mg & 1) - map[m++] = (c == 3) ? 0x41 : 0x40; - oid += mv & 1; - } - } - return m; -} +#ifdef NV50_PROGRAM_DEBUG + tgsi_dump(p->pipe.tokens, 0); +#endif -struct nouveau_stateobj * -nv50_gp_linkage_validate(struct nv50_context *nv50) + ti->subr = + CALLOC(ti->scan.opcode_count[TGSI_OPCODE_BGNSUB], sizeof(ti->subr[0])); + + ti->immd32 = (uint32_t *)MALLOC(ti->scan.immediate_count * 16); + ti->immd32_ty = (ubyte *)MALLOC(ti->scan.immediate_count * sizeof(ubyte)); + + ti->insns = MALLOC(ti->scan.num_instructions * sizeof(ti->insns[0])); + + tgsi_parse_init(&parse, p->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + prog_immediate(ti, &parse.FullToken.FullImmediate); + break; + case TGSI_TOKEN_TYPE_DECLARATION: + prog_decl(ti, &parse.FullToken.FullDeclaration); + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + ti->insns[ti->inst_nr] = parse.FullToken.FullInstruction; + prog_inst(ti, &parse.FullToken.FullInstruction, ++ti->inst_nr); + break; + } + } + + /* Scan to determine which registers are inputs/outputs of a subroutine. */ + for (i = 0; i < ti->subr_nr; ++i) { + int pc = ti->subr[i].id; + while (ti->insns[pc].Instruction.Opcode != TGSI_OPCODE_ENDSUB) + prog_subroutine_inst(&ti->subr[i], &ti->insns[pc++]); + } + + p->in_nr = ti->scan.file_max[TGSI_FILE_INPUT] + 1; + p->out_nr = ti->scan.file_max[TGSI_FILE_OUTPUT] + 1; + + switch (p->type) { + case PIPE_SHADER_VERTEX: + ret = nv50_vertprog_prepare(ti); + break; + case PIPE_SHADER_FRAGMENT: + ret = nv50_fragprog_prepare(ti); + break; + case PIPE_SHADER_GEOMETRY: + ret = nv50_geomprog_prepare(ti); + break; + default: + assert(!"unsupported program type"); + ret = -1; + break; + } + + assert(!ret); + return ret; +} + +boolean +nv50_program_tx(struct nv50_program *p) { - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nouveau_stateobj *so; - struct nv50_program *vp = nv50->vertprog; - struct nv50_program *gp = nv50->geomprog; - uint32_t map[16]; - int m = 0; + struct nv50_translation_info *ti; + int ret; - if (!gp) - return NULL; - memset(map, 0, sizeof(map)); + ti = CALLOC_STRUCT(nv50_translation_info); + ti->p = p; - m = construct_vp_gp_mapping(map, m, vp, gp); + ti->edgeflag_out = PIPE_MAX_SHADER_OUTPUTS; - so = so_new(3, 24 - 3, 0); + ret = nv50_prog_scan(ti); + if (ret) { + NOUVEAU_ERR("unsupported shader program\n"); + goto out; + } - so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); - so_data (so, vp->cfg.regs[0] | gp->cfg.regs[0]); + ret = nv50_generate_code(ti); + if (ret) { + NOUVEAU_ERR("error during shader translation\n"); + goto out; + } - assert(m <= 32); - so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); - so_data (so, m); - - m = (m + 3) / 4; - so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), m); - so_datap (so, map, m); - - return so; +out: + if (ti->immd32) + FREE(ti->immd32); + if (ti->immd32_ty) + FREE(ti->immd32_ty); + if (ti->insns) + FREE(ti->insns); + if (ti->subr) + FREE(ti->subr); + FREE(ti); + return ret ? FALSE : TRUE; } void nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) { - while (p->exec_head) { - struct nv50_program_exec *e = p->exec_head; - - p->exec_head = e->next; - FREE(e); - } - p->exec_tail = NULL; - p->exec_size = 0; + nouveau_bo_ref(NULL, &p->bo); - nouveau_bo_ref(NULL, &p->bo); + so_ref(NULL, &p->so); - FREE(p->immd); - nouveau_resource_free(&p->data[0]); + if (p->code) + FREE(p->code); - p->translated = 0; + p->translated = FALSE; } diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index 1e3ad6bff05..97d2933c3ee 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -1,75 +1,131 @@ -#ifndef __NV50_PROGRAM_H__ -#define __NV50_PROGRAM_H__ +/* + * Copyright 2010 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __NV50_PROG_H__ +#define __NV50_PROG_H__ #include "pipe/p_state.h" #include "tgsi/tgsi_scan.h" -struct nv50_program_exec { - struct nv50_program_exec *next; +#define NV50_CAP_MAX_PROGRAM_TEMPS 64 - unsigned inst[2]; - struct { - int index; - unsigned mask; - unsigned shift; - } param; -}; - -struct nv50_sreg4 { - uint8_t hw; /* hw index, nv50 wants flat FP inputs last */ - uint8_t id; /* tgsi index */ +struct nv50_varying { + uint8_t id; /* tgsi index */ + uint8_t hw; /* hw index, nv50 wants flat FP inputs last */ - uint8_t mask; - boolean linear; + uint8_t mask : 4; + uint8_t linear : 1; + uint8_t pad : 3; - ubyte sn, si; /* semantic name & index */ + ubyte sn; /* semantic name */ + ubyte si; /* semantic index */ }; struct nv50_program { - struct pipe_shader_state pipe; - struct tgsi_shader_info info; - boolean translated; - - unsigned type; - struct nv50_program_exec *exec_head; - struct nv50_program_exec *exec_tail; - unsigned exec_size; - struct nouveau_resource *data[1]; - unsigned data_start[1]; - - struct nouveau_bo *bo; - - uint32_t *immd; - unsigned immd_nr; - unsigned param_nr; - - struct { - unsigned high_temp; - unsigned high_result; - - uint32_t attr[2]; - uint32_t regs[4]; - - /* for VPs, io_nr doesn't count 'private' results (PSIZ etc.) */ - unsigned in_nr, out_nr; - struct nv50_sreg4 in[PIPE_MAX_SHADER_INPUTS]; - struct nv50_sreg4 out[PIPE_MAX_SHADER_OUTPUTS]; - - /* FP colour inputs, VP/GP back colour outputs */ - struct nv50_sreg4 two_side[2]; - - /* GP only */ - unsigned vert_count; - uint8_t prim_type; - - /* VP & GP only */ - uint8_t clpd, clpd_nr; - uint8_t psiz; - uint8_t edgeflag_in; - - /* FP & GP only */ - uint8_t prim_id; - } cfg; + struct pipe_shader_state pipe; + + ubyte type; + boolean translated; + + struct nouveau_bo *bo; + struct nouveau_stateobj *so; + + uint32_t *code; + unsigned code_size; + unsigned code_start; /* offset inside bo */ + uint32_t *immd; + unsigned immd_size; + unsigned parm_size; /* size limit of uniform buffer */ + + ubyte max_gpr; /* REG_ALLOC_TEMP */ + ubyte max_out; /* REG_ALLOC_RESULT or FP_RESULT_COUNT */ + + ubyte in_nr; + ubyte out_nr; + struct nv50_varying in[16]; + struct nv50_varying out[16]; + + struct { + uint32_t attrs[3]; /* VP_ATTR_EN_0,1 and VP_GP_BUILTIN_ATTR_EN */ + ubyte psiz; + ubyte bfc[2]; + ubyte edgeflag; + ubyte clpd; + ubyte clpd_nr; + } vp; + + struct { + uint32_t flags[2]; /* 0x19a8, 196c */ + uint32_t interp; /* 0x1988 */ + uint32_t colors; /* 0x1904 */ + } fp; + + struct { + ubyte primid; /* primitive id output register */ + uint8_t vert_count; + uint8_t prim_type; /* point, line strip or tri strip */ + } gp; + + void *fixups; + unsigned num_fixups; +}; + +#define NV50_INTERP_LINEAR (1 << 0) +#define NV50_INTERP_FLAT (1 << 1) +#define NV50_INTERP_CENTROID (1 << 2) + +/* analyze TGSI and see which TEMP[] are used as subroutine inputs/outputs */ +struct nv50_subroutine { + unsigned id; + unsigned pos; + /* function inputs and outputs */ + uint32_t argv[NV50_CAP_MAX_PROGRAM_TEMPS][4]; + uint32_t retv[NV50_CAP_MAX_PROGRAM_TEMPS][4]; }; -#endif +struct nv50_translation_info { + struct nv50_program *p; + unsigned inst_nr; + struct tgsi_full_instruction *insns; + ubyte input_file; + ubyte output_file; + ubyte input_map[PIPE_MAX_SHADER_INPUTS][4]; + ubyte output_map[PIPE_MAX_SHADER_OUTPUTS][4]; + ubyte interp_mode[PIPE_MAX_SHADER_INPUTS]; + int input_access[PIPE_MAX_SHADER_INPUTS][4]; + int output_access[PIPE_MAX_SHADER_OUTPUTS][4]; + boolean indirect_inputs; + boolean indirect_outputs; + boolean store_to_memory; + struct tgsi_shader_info scan; + uint32_t *immd32; + unsigned immd32_nr; + ubyte *immd32_ty; + ubyte edgeflag_out; + struct nv50_subroutine *subr; + unsigned subr_nr; +}; + +int nv50_generate_code(struct nv50_translation_info *ti); +boolean nv50_program_tx(struct nv50_program *p); + +#endif /* __NV50_PROG_H__ */ diff --git a/src/gallium/drivers/nv50/nv50_push.c b/src/gallium/drivers/nv50/nv50_push.c index 57c0010bf4d..380f69406a2 100644 --- a/src/gallium/drivers/nv50/nv50_push.c +++ b/src/gallium/drivers/nv50/nv50_push.c @@ -228,7 +228,7 @@ nv50_push_elements_instanced(struct pipe_context *pipe, ctx.idxbuf = NULL; ctx.vtx_size = 0; ctx.edgeflag = 0.5f; - ctx.edgeflag_attr = nv50->vertprog->cfg.edgeflag_in; + ctx.edgeflag_attr = nv50->vertprog->vp.edgeflag; /* map vertex buffers, determine vertex size */ for (i = 0; i < nv50->vtxelt->num_elements; i++) { diff --git a/src/gallium/drivers/nv50/nv50_reg.h b/src/gallium/drivers/nv50/nv50_reg.h index e2a8e513440..365576fdd07 100644 --- a/src/gallium/drivers/nv50/nv50_reg.h +++ b/src/gallium/drivers/nv50/nv50_reg.h @@ -448,7 +448,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_DMA_TIC 0x000001a0 #define NV50TCL_DMA_TEXTURE 0x000001a4 #define NV50TCL_DMA_STRMOUT 0x000001a8 -#define NV50TCL_DMA_UNK01AC 0x000001ac +#define NV50TCL_DMA_CLIPID 0x000001ac #define NV50TCL_DMA_COLOR(x) (0x000001c0+((x)*4)) #define NV50TCL_DMA_COLOR__SIZE 0x00000008 #define NV50TCL_RT_ADDRESS_HIGH(x) (0x00000200+((x)*32)) @@ -665,8 +665,20 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_DEPTH_RANGE_FAR__SIZE 0x00000010 #define NV50TCL_VIEWPORT_CLIP_HORIZ(x) (0x00000d00+((x)*8)) #define NV50TCL_VIEWPORT_CLIP_HORIZ__SIZE 0x00000008 +#define NV50TCL_VIEWPORT_CLIP_HORIZ_MIN_SHIFT 0 +#define NV50TCL_VIEWPORT_CLIP_HORIZ_MIN_MASK 0x0000ffff +#define NV50TCL_VIEWPORT_CLIP_HORIZ_MAX_SHIFT 16 +#define NV50TCL_VIEWPORT_CLIP_HORIZ_MAX_MASK 0xffff0000 #define NV50TCL_VIEWPORT_CLIP_VERT(x) (0x00000d04+((x)*8)) #define NV50TCL_VIEWPORT_CLIP_VERT__SIZE 0x00000008 +#define NV50TCL_VIEWPORT_CLIP_VERT_MIN_SHIFT 0 +#define NV50TCL_VIEWPORT_CLIP_VERT_MIN_MASK 0x0000ffff +#define NV50TCL_VIEWPORT_CLIP_VERT_MAX_SHIFT 16 +#define NV50TCL_VIEWPORT_CLIP_VERT_MAX_MASK 0xffff0000 +#define NV50TCL_CLIPID_REGION_HORIZ(x) (0x00000d40+((x)*8)) +#define NV50TCL_CLIPID_REGION_HORIZ__SIZE 0x00000004 +#define NV50TCL_CLIPID_REGION_VERT(x) (0x00000d44+((x)*8)) +#define NV50TCL_CLIPID_REGION_VERT__SIZE 0x00000004 #define NV50TCL_VERTEX_BUFFER_FIRST 0x00000d74 #define NV50TCL_VERTEX_BUFFER_COUNT 0x00000d78 #define NV50TCL_CLEAR_COLOR(x) (0x00000d80+((x)*4)) @@ -724,14 +736,16 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_GP_ADDRESS_LOW 0x00000f74 #define NV50TCL_VP_ADDRESS_HIGH 0x00000f7c #define NV50TCL_VP_ADDRESS_LOW 0x00000f80 -#define NV50TCL_UNK0F84_ADDRESS_HIGH 0x00000f84 -#define NV50TCL_UNK0F84_ADDRESS_LOW 0x00000f88 +#define NV50TCL_VERTEX_RUNOUT_HIGH 0x00000f84 +#define NV50TCL_VERTEX_RUNOUT_LOW 0x00000f88 #define NV50TCL_DEPTH_BOUNDS(x) (0x00000f9c+((x)*4)) #define NV50TCL_DEPTH_BOUNDS__SIZE 0x00000002 #define NV50TCL_FP_ADDRESS_HIGH 0x00000fa4 #define NV50TCL_FP_ADDRESS_LOW 0x00000fa8 #define NV50TCL_MSAA_MASK(x) (0x00000fbc+((x)*4)) #define NV50TCL_MSAA_MASK__SIZE 0x00000004 +#define NV50TCL_CLIPID_ADDRESS_HIGH 0x00000fcc +#define NV50TCL_CLIPID_ADDRESS_LOW 0x00000fd0 #define NV50TCL_ZETA_ADDRESS_HIGH 0x00000fe0 #define NV50TCL_ZETA_ADDRESS_LOW 0x00000fe4 #define NV50TCL_ZETA_FORMAT 0x00000fe8 @@ -861,37 +875,45 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_BLEND_EQUATION_RGB_FUNC_SUBTRACT 0x0000800a #define NV50TCL_BLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT 0x0000800b #define NV50TCL_BLEND_FUNC_SRC_RGB 0x00001344 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ZERO 0x00000000 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE 0x00000001 -#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR 0x00000300 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR 0x00000301 -#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA 0x00000302 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA 0x00000303 -#define NV50TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA 0x00000304 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA 0x00000305 -#define NV50TCL_BLEND_FUNC_SRC_RGB_DST_COLOR 0x00000306 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR 0x00000307 -#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE 0x00000308 -#define NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR 0x00008001 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR 0x00008002 -#define NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA 0x00008003 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ZERO 0x00004000 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE 0x00004001 +#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR 0x00004300 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR 0x00004301 +#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA 0x00004302 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NV50TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA 0x00004304 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA 0x00004305 +#define NV50TCL_BLEND_FUNC_SRC_RGB_DST_COLOR 0x00004306 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR 0x00004307 +#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE 0x00004308 +#define NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR 0x0000c001 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA 0x0000c003 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC1_COLOR 0x0000c900 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC1_ALPHA 0x0000c902 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC1_ALPHA 0x0000c903 #define NV50TCL_BLEND_FUNC_DST_RGB 0x00001348 -#define NV50TCL_BLEND_FUNC_DST_RGB_ZERO 0x00000000 -#define NV50TCL_BLEND_FUNC_DST_RGB_ONE 0x00000001 -#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_COLOR 0x00000300 -#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR 0x00000301 -#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA 0x00000302 -#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA 0x00000303 -#define NV50TCL_BLEND_FUNC_DST_RGB_DST_ALPHA 0x00000304 -#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA 0x00000305 -#define NV50TCL_BLEND_FUNC_DST_RGB_DST_COLOR 0x00000306 -#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR 0x00000307 -#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE 0x00000308 -#define NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR 0x00008001 -#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR 0x00008002 -#define NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA 0x00008003 -#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV50TCL_BLEND_FUNC_DST_RGB_ZERO 0x00004000 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE 0x00004001 +#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_COLOR 0x00004300 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR 0x00004301 +#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA 0x00004302 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NV50TCL_BLEND_FUNC_DST_RGB_DST_ALPHA 0x00004304 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA 0x00004305 +#define NV50TCL_BLEND_FUNC_DST_RGB_DST_COLOR 0x00004306 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR 0x00004307 +#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE 0x00004308 +#define NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR 0x0000c001 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA 0x0000c003 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NV50TCL_BLEND_FUNC_DST_RGB_SRC1_COLOR 0x0000c900 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NV50TCL_BLEND_FUNC_DST_RGB_SRC1_ALPHA 0x0000c902 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC1_ALPHA 0x0000c903 #define NV50TCL_BLEND_EQUATION_ALPHA 0x0000134c #define NV50TCL_BLEND_EQUATION_ALPHA_FUNC_ADD 0x00008006 #define NV50TCL_BLEND_EQUATION_ALPHA_MIN 0x00008007 @@ -899,37 +921,45 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_BLEND_EQUATION_ALPHA_FUNC_SUBTRACT 0x0000800a #define NV50TCL_BLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT 0x0000800b #define NV50TCL_BLEND_FUNC_SRC_ALPHA 0x00001350 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ZERO 0x00000000 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE 0x00000001 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR 0x00000300 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR 0x00000301 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA 0x00000302 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA 0x00000303 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA 0x00000304 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA 0x00000305 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR 0x00000306 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR 0x00000307 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE 0x00000308 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR 0x00008001 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x00008002 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA 0x00008003 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ZERO 0x00004000 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE 0x00004001 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR 0x00004300 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR 0x00004301 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA 0x00004302 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA 0x00004304 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA 0x00004305 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR 0x00004306 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR 0x00004307 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE 0x00004308 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR 0x0000c001 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA 0x0000c003 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC1_COLOR 0x0000c900 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC1_ALPHA 0x0000c902 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC1_ALPHA 0x0000c903 #define NV50TCL_BLEND_FUNC_DST_ALPHA 0x00001358 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ZERO 0x00000000 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE 0x00000001 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR 0x00000300 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR 0x00000301 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA 0x00000302 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA 0x00000303 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA 0x00000304 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA 0x00000305 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR 0x00000306 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR 0x00000307 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE 0x00000308 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR 0x00008001 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x00008002 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA 0x00008003 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ZERO 0x00004000 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE 0x00004001 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR 0x00004300 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR 0x00004301 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA 0x00004302 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA 0x00004304 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA 0x00004305 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR 0x00004306 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR 0x00004307 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE 0x00004308 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR 0x0000c001 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA 0x0000c003 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC1_COLOR 0x0000c900 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC1_ALPHA 0x0000c902 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC1_ALPHA 0x0000c903 #define NV50TCL_BLEND_ENABLE(x) (0x00001360+((x)*4)) #define NV50TCL_BLEND_ENABLE__SIZE 0x00000008 #define NV50TCL_STENCIL_FRONT_ENABLE 0x00001380 @@ -988,6 +1018,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_FP_START_ID 0x00001414 #define NV50TCL_GP_VERTEX_OUTPUT_COUNT 0x00001420 #define NV50TCL_VB_ELEMENT_BASE 0x00001434 +#define NV50TCL_INSTANCE_BASE 0x00001438 #define NV50TCL_CODE_CB_FLUSH 0x00001440 #define NV50TCL_BIND_TSC(x) (0x00001444+((x)*8)) #define NV50TCL_BIND_TSC__SIZE 0x00000003 @@ -1005,6 +1036,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_BIND_TIC_TIC_MASK 0x7ffffe00 #define NV50TCL_STRMOUT_MAP(x) (0x00001480+((x)*4)) #define NV50TCL_STRMOUT_MAP__SIZE 0x00000020 +#define NV50TCL_CLIPID_HEIGHT 0x00001504 #define NV50TCL_VP_CLIP_DISTANCE_ENABLE 0x00001510 #define NV50TCL_VP_CLIP_DISTANCE_ENABLE_0 (1 << 0) #define NV50TCL_VP_CLIP_DISTANCE_ENABLE_1 (1 << 1) @@ -1089,7 +1121,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_GP_BUILTIN_RESULT_EN 0x000015cc #define NV50TCL_GP_BUILTIN_RESULT_EN_VPORT_IDX (1 << 0) #define NV50TCL_GP_BUILTIN_RESULT_EN_LAYER_IDX (1 << 16) -#define NV50TCL_MULTISAMPLE_SAMPLES_LOG2 0x000015d0 +#define NV50TCL_MULTISAMPLE_MODE 0x000015d0 +#define NV50TCL_MULTISAMPLE_MODE_1X 0x00000000 +#define NV50TCL_MULTISAMPLE_MODE_2XMS 0x00000001 +#define NV50TCL_MULTISAMPLE_MODE_4XMS 0x00000002 +#define NV50TCL_MULTISAMPLE_MODE_8XMS 0x00000004 +#define NV50TCL_MULTISAMPLE_MODE_4XMS_4XCS 0x00000008 +#define NV50TCL_MULTISAMPLE_MODE_4XMS_12XCS 0x00000009 +#define NV50TCL_MULTISAMPLE_MODE_8XMS_8XCS 0x0000000a #define NV50TCL_VERTEX_BEGIN 0x000015dc #define NV50TCL_VERTEX_BEGIN_POINTS 0x00000000 #define NV50TCL_VERTEX_BEGIN_LINES 0x00000001 @@ -1105,6 +1144,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_VERTEX_BEGIN_LINE_STRIP_ADJACENCY 0x0000000b #define NV50TCL_VERTEX_BEGIN_TRIANGLES_ADJACENCY 0x0000000c #define NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP_ADJACENCY 0x0000000d +#define NV50TCL_VERTEX_BEGIN_PATCHES 0x0000000e #define NV50TCL_VERTEX_END 0x000015e0 #define NV50TCL_EDGEFLAG_ENABLE 0x000015e4 #define NV50TCL_VB_ELEMENT_U32 0x000015e8 @@ -1118,6 +1158,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_VB_ELEMENT_U16_I0_MASK 0x0000ffff #define NV50TCL_VB_ELEMENT_U16_I1_SHIFT 16 #define NV50TCL_VB_ELEMENT_U16_I1_MASK 0xffff0000 +#define NV50TCL_VERTEX_BASE_HIGH 0x000015f4 +#define NV50TCL_VERTEX_BASE_LOW 0x000015f8 #define NV50TCL_VERTEX_DATA 0x00001640 #define NV50TCL_PRIM_RESTART_ENABLE 0x00001644 #define NV50TCL_PRIM_RESTART_INDEX 0x00001648 @@ -1503,7 +1545,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_VIEWPORT_TRANSFORM_EN 0x0000192c #define NV50TCL_VIEW_VOLUME_CLIP_CTRL 0x0000193c #define NV50TCL_VIEWPORT_CLIP_RECTS_EN 0x0000194c +#define NV50TCL_VIEWPORT_CLIP_MODE 0x00001950 +#define NV50TCL_VIEWPORT_CLIP_MODE_INCLUDE 0x00000000 +#define NV50TCL_VIEWPORT_CLIP_MODE_EXCLUDE 0x00000001 +#define NV50TCL_VIEWPORT_CLIP_MODE_UNKNOWN 0x00000002 #define NV50TCL_FP_CTRL_UNK196C 0x0000196c +#define NV50TCL_CLIPID_ENABLE 0x0000197c +#define NV50TCL_CLIPID_WIDTH 0x00001980 +#define NV50TCL_CLIPID_ID 0x00001984 #define NV50TCL_FP_INTERPOLANT_CTRL 0x00001988 #define NV50TCL_FP_INTERPOLANT_CTRL_UMASK_SHIFT 24 #define NV50TCL_FP_INTERPOLANT_CTRL_UMASK_MASK 0xff000000 @@ -1604,19 +1653,20 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8 0x00c00000 #define NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16 0x00d80000 #define NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8 0x00e80000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_2_10_10_10 0x01800000 #define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SHIFT 25 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_MASK 0x7e000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT 0x7e000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM 0x24000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM 0x12000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED 0x5a000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED 0x6c000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT 0x48000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT 0x36000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_MASK 0x0e000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT 0x0e000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM 0x02000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM 0x04000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED 0x0a000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED 0x0c000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT 0x08000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT 0x06000000 #define NV50TCL_VERTEX_ARRAY_ATTRIB_BGRA (1 << 31) #define NV50TCL_QUERY_ADDRESS_HIGH 0x00001b00 #define NV50TCL_QUERY_ADDRESS_LOW 0x00001b04 -#define NV50TCL_QUERY_COUNTER 0x00001b08 +#define NV50TCL_QUERY_SEQUENCE 0x00001b08 #define NV50TCL_QUERY_GET 0x00001b0c diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c index f37dd079acb..49af9b59beb 100644 --- a/src/gallium/drivers/nv50/nv50_screen.c +++ b/src/gallium/drivers/nv50/nv50_screen.c @@ -26,6 +26,7 @@ #include "nv50_context.h" #include "nv50_screen.h" #include "nv50_resource.h" +#include "nv50_program.h" #include "nouveau/nouveau_stateobj.h" @@ -34,75 +35,38 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen, enum pipe_format format, enum pipe_texture_target target, unsigned sample_count, - unsigned tex_usage, unsigned geom_flags) + unsigned usage, unsigned geom_flags) { if (sample_count > 1) return FALSE; - if (tex_usage & PIPE_BIND_RENDER_TARGET) { + if (!util_format_s3tc_enabled) { switch (format) { - case PIPE_FORMAT_B8G8R8X8_UNORM: - case PIPE_FORMAT_B8G8R8A8_UNORM: - case PIPE_FORMAT_B5G6R5_UNORM: - case PIPE_FORMAT_R16G16B16A16_SNORM: - case PIPE_FORMAT_R16G16B16A16_UNORM: - case PIPE_FORMAT_R32G32B32A32_FLOAT: - case PIPE_FORMAT_R16G16_SNORM: - case PIPE_FORMAT_R16G16_UNORM: - return TRUE; - default: - break; - } - } else - if (tex_usage & PIPE_BIND_DEPTH_STENCIL) { - switch (format) { - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - return TRUE; - default: - break; - } - } else { - if (tex_usage & PIPE_BIND_SAMPLER_VIEW) { - switch (format) { - case PIPE_FORMAT_DXT1_RGB: - case PIPE_FORMAT_DXT1_RGBA: - case PIPE_FORMAT_DXT3_RGBA: - case PIPE_FORMAT_DXT5_RGBA: - return util_format_s3tc_enabled; - default: - break; - } - } - switch (format) { - case PIPE_FORMAT_B8G8R8A8_UNORM: - case PIPE_FORMAT_B8G8R8X8_UNORM: - case PIPE_FORMAT_B8G8R8A8_SRGB: - case PIPE_FORMAT_B8G8R8X8_SRGB: - case PIPE_FORMAT_B5G5R5A1_UNORM: - case PIPE_FORMAT_B4G4R4A4_UNORM: - case PIPE_FORMAT_B5G6R5_UNORM: - case PIPE_FORMAT_L8_UNORM: - case PIPE_FORMAT_A8_UNORM: - case PIPE_FORMAT_I8_UNORM: - case PIPE_FORMAT_L8A8_UNORM: - case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_R16G16B16A16_SNORM: - case PIPE_FORMAT_R16G16B16A16_UNORM: - case PIPE_FORMAT_R32G32B32A32_FLOAT: - case PIPE_FORMAT_R16G16_SNORM: - case PIPE_FORMAT_R16G16_UNORM: - return TRUE; + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT3_RGBA: + case PIPE_FORMAT_DXT5_RGBA: + return FALSE; default: break; } } - return FALSE; + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + if ((nouveau_screen(pscreen)->device->chipset & 0xf0) != 0xa0) + return FALSE; + break; + default: + break; + } + + /* transfers & shared are always supported */ + usage &= ~(PIPE_BIND_TRANSFER_READ | + PIPE_BIND_TRANSFER_WRITE | + PIPE_BIND_SHARED); + + return (nv50_format_table[format].usage & usage) == usage; } static int @@ -142,6 +106,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_MIRROR_CLAMP: case PIPE_CAP_TEXTURE_MIRROR_REPEAT: return 1; + case PIPE_CAP_TEXTURE_SWIZZLE: + return 1; case PIPE_CAP_BLEND_EQUATION_SEPARATE: return 1; case PIPE_CAP_INDEP_BLEND_ENABLE: @@ -165,10 +131,10 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) } static int -nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, enum pipe_shader_cap param) +nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, + enum pipe_shader_cap param) { - switch(shader) - { + switch(shader) { case PIPE_SHADER_FRAGMENT: case PIPE_SHADER_VERTEX: case PIPE_SHADER_GEOMETRY: @@ -186,7 +152,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, enum case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: /* need stack bo */ return 4; case PIPE_SHADER_CAP_MAX_INPUTS: /* 128 / 4 with GP */ - if(shader == PIPE_SHADER_GEOMETRY) + if (shader == PIPE_SHADER_GEOMETRY) return 128 / 4; else return 64 / 4; @@ -197,7 +163,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, enum case PIPE_SHADER_CAP_MAX_PREDS: /* not yet handled */ return 0; case PIPE_SHADER_CAP_MAX_TEMPS: /* no spilling atm */ - return 128 / 4; + return NV50_CAP_MAX_PROGRAM_TEMPS; case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 1; default: @@ -301,14 +267,23 @@ nv50_screen_relocs(struct nv50_screen *screen) } } +#ifndef NOUVEAU_GETPARAM_GRAPH_UNITS +# define NOUVEAU_GETPARAM_GRAPH_UNITS 13 +#endif + +extern int nouveau_device_get_param(struct nouveau_device *dev, + uint64_t param, uint64_t *value); + struct pipe_screen * nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) { struct nv50_screen *screen = CALLOC_STRUCT(nv50_screen); struct nouveau_channel *chan; struct pipe_screen *pscreen; + uint64_t value; unsigned chipset = dev->chipset; unsigned tesla_class = 0; + unsigned stack_size, local_size, max_warps; int ret, i; const unsigned rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; @@ -527,6 +502,41 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) OUT_RING (chan, 0x121 | (NV50_CB_PGP << 12)); OUT_RING (chan, 0x131 | (NV50_CB_PFP << 12)); + /* shader stack */ + nouveau_device_get_param(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value); + + max_warps = util_bitcount(value & 0xffff); + max_warps *= util_bitcount((value >> 24) & 0xf) * 32; + + stack_size = max_warps * 64 * 8; + + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, + stack_size, &screen->stack_bo); + if (ret) { + nv50_screen_destroy(pscreen); + return NULL; + } + BEGIN_RING(chan, screen->tesla, NV50TCL_STACK_ADDRESS_HIGH, 3); + OUT_RELOCh(chan, screen->stack_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCl(chan, screen->stack_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RING (chan, 4); + + local_size = (NV50_CAP_MAX_PROGRAM_TEMPS * 16) * max_warps * 32; + + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, + local_size, &screen->local_bo); + if (ret) { + nv50_screen_destroy(pscreen); + return NULL; + } + + local_size = NV50_CAP_MAX_PROGRAM_TEMPS * 16; + + BEGIN_RING(chan, screen->tesla, NV50TCL_LOCAL_ADDRESS_HIGH, 3); + OUT_RELOCh(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCl(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RING (chan, util_unsigned_logbase2(local_size / 8)); + /* Vertex array limits - max them out */ for (i = 0; i < 16; i++) { BEGIN_RING(chan, screen->tesla, diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h index fbf15a75967..ad6bdeb27c8 100644 --- a/src/gallium/drivers/nv50/nv50_screen.h +++ b/src/gallium/drivers/nv50/nv50_screen.h @@ -22,11 +22,12 @@ struct nv50_screen { struct nouveau_resource *immd_heap; - struct pipe_resource *strm_vbuf[16]; - struct nouveau_bo *tic; struct nouveau_bo *tsc; + struct nouveau_bo *stack_bo; /* control flow stack */ + struct nouveau_bo *local_bo; /* l[] memory */ + boolean force_push; }; @@ -38,4 +39,13 @@ nv50_screen(struct pipe_screen *screen) extern void nv50_screen_relocs(struct nv50_screen *); +struct nv50_format { + uint32_t rt; + uint32_t tic; + uint32_t vtx; + uint32_t usage; +}; + +extern const struct nv50_format nv50_format_table[]; + #endif diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c new file mode 100644 index 00000000000..564f7e53246 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_shader_state.c @@ -0,0 +1,626 @@ +/* + * Copyright 2008 Ben Skeggs + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "util/u_inlines.h" + +#include "nv50_context.h" +#include "nv50_transfer.h" + +static void +nv50_transfer_constbuf(struct nv50_context *nv50, + struct pipe_resource *buf, unsigned size, unsigned cbi) +{ + struct pipe_context *pipe = &nv50->pipe; + struct pipe_transfer *transfer; + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + uint32_t *map; + unsigned count, start; + + map = pipe_buffer_map(pipe, buf, PIPE_TRANSFER_READ, &transfer); + if (!map) + return; + + count = (buf->width0 + 3) / 4; + start = 0; + + while (count) { + unsigned nr = count; + nr = MIN2(nr, 2047); + + /* FIXME: emit relocs for unsuiTed MM */ + BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); + OUT_RING (chan, (start << 8) | cbi); + BEGIN_RING_NI(chan, tesla, NV50TCL_CB_DATA(0), nr); + OUT_RINGp (chan, map, nr); + + count -= nr; + start += nr; + map += nr; + } + + pipe_buffer_unmap(pipe, buf, transfer); +} + +static void +nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + unsigned cbi; + + if (p->immd_size) { + uint32_t *data = p->immd; + unsigned count = p->immd_size / 4; + unsigned start = 0; + + while (count) { + unsigned nr = count; + nr = MIN2(nr, 2047); + + BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); + OUT_RING (chan, (start << 8) | NV50_CB_PMISC); + BEGIN_RING_NI(chan, tesla, NV50TCL_CB_DATA(0), nr); + OUT_RINGp (chan, data, nr); + + count -= nr; + start += nr; + data += nr; + } + } + + /* If the state tracker doesn't change the constbuf, and it is first + * validated with a program that doesn't use it, this check prevents + * it from even being uploaded. */ + /* + if (p->parm_size == 0) + return; + */ + + switch (p->type) { + case PIPE_SHADER_VERTEX: + cbi = NV50_CB_PVP; + break; + case PIPE_SHADER_FRAGMENT: + cbi = NV50_CB_PFP; + break; + case PIPE_SHADER_GEOMETRY: + cbi = NV50_CB_PGP; + break; + default: + assert(0); + break; + } + + nv50_transfer_constbuf(nv50, nv50->constbuf[p->type], p->parm_size, cbi); +} + +static void +nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_grobj *eng2d = nv50->screen->eng2d; + int ret; + unsigned offset; + unsigned size = p->code_size; + uint32_t *data = p->code; + + assert(p->translated); + + /* TODO: use a single bo (for each type) for shader code */ + if (p->bo) + return; + ret = nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, size, &p->bo); + assert(!ret); + + offset = p->code_start = 0; + + BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 2); + OUT_RING (chan, NV50_2D_DST_FORMAT_R8_UNORM); + OUT_RING (chan, 1); + BEGIN_RING(chan, eng2d, NV50_2D_DST_PITCH, 1); + OUT_RING (chan, 0x40000); + BEGIN_RING(chan, eng2d, NV50_2D_DST_WIDTH, 2); + OUT_RING (chan, 0x10000); + OUT_RING (chan, 1); + + while (size) { + unsigned nr = size / 4; + + if (AVAIL_RING(chan) < 32) + FIRE_RING(chan); + + nr = MIN2(nr, AVAIL_RING(chan) - 18); + nr = MIN2(nr, 1792); + if (nr < (size / 4)) + nr &= ~0x3f; + assert(!(size & 3)); + + BEGIN_RING(chan, eng2d, NV50_2D_DST_ADDRESS_HIGH, 2); + OUT_RELOCh(chan, p->bo, offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCl(chan, p->bo, offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(chan, eng2d, NV50_2D_SIFC_BITMAP_ENABLE, 2); + OUT_RING (chan, 0); + OUT_RING (chan, NV50_2D_SIFC_FORMAT_R8_UNORM); + BEGIN_RING(chan, eng2d, NV50_2D_SIFC_WIDTH, 10); + OUT_RING (chan, nr * 4); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, 0); + OUT_RING (chan, 0); + OUT_RING (chan, 0); + + BEGIN_RING_NI(chan, eng2d, NV50_2D_SIFC_DATA, nr); + OUT_RINGp (chan, data, nr); + + data += nr; + offset += nr * 4; + size -= nr * 4; + } + + BEGIN_RING(chan, tesla, NV50TCL_CODE_CB_FLUSH, 1); + OUT_RING (chan, 0); +} + +static void +nv50_vp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so = so_new(5, 7, 2); + + nv50_program_validate_code(nv50, p); + + so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); + so_data (so, p->vp.attrs[0]); + so_data (so, p->vp.attrs[1]); + so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); + so_data (so, p->max_out); + so_method(so, tesla, NV50TCL_VP_REG_ALLOC_TEMP, 1); + so_data (so, p->max_gpr); + so_method(so, tesla, NV50TCL_VP_START_ID, 1); + so_data (so, p->code_start); + + so_ref(so, &p->so); + so_ref(NULL, &so); +} + +static void +nv50_fp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so = so_new(6, 7, 2); + + nv50_program_validate_code(nv50, p); + + so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); + so_data (so, p->max_gpr); + so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); + so_data (so, p->max_out); + so_method(so, tesla, NV50TCL_FP_CONTROL, 1); + so_data (so, p->fp.flags[0]); + so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); + so_data (so, p->fp.flags[1]); + so_method(so, tesla, NV50TCL_FP_START_ID, 1); + so_data (so, p->code_start); + + so_ref(so, &p->so); + so_ref(NULL, &so); +} + +static void +nv50_gp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so = so_new(6, 7, 2); + + nv50_program_validate_code(nv50, p); + + so_method(so, tesla, NV50TCL_GP_ADDRESS_HIGH, 2); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, tesla, NV50TCL_GP_REG_ALLOC_TEMP, 1); + so_data (so, p->max_gpr); + so_method(so, tesla, NV50TCL_GP_REG_ALLOC_RESULT, 1); + so_data (so, p->max_out); + so_method(so, tesla, NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE, 1); + so_data (so, p->gp.prim_type); + so_method(so, tesla, NV50TCL_GP_VERTEX_OUTPUT_COUNT, 1); + so_data (so, p->gp.vert_count); + so_method(so, tesla, NV50TCL_GP_START_ID, 1); + so_data (so, p->code_start); + + so_ref(so, &p->so); + so_ref(NULL, &so); +} + +static boolean +nv50_program_validate(struct nv50_program *p) +{ + p->translated = nv50_program_tx(p); + assert(p->translated); + return p->translated; +} + +struct nouveau_stateobj * +nv50_vertprog_validate(struct nv50_context *nv50) +{ + struct nv50_program *p = nv50->vertprog; + struct nouveau_stateobj *so = NULL; + + if (!p->translated) { + if (nv50_program_validate(p)) + nv50_vp_update_stateobj(nv50, p); + else + return NULL; + } + + if (nv50->dirty & NV50_NEW_VERTPROG_CB) + nv50_program_validate_data(nv50, p); + + if (!(nv50->dirty & NV50_NEW_VERTPROG)) + return NULL; + + nv50_program_validate_code(nv50, p); + + so_ref(p->so, &so); + return so; +} + +struct nouveau_stateobj * +nv50_fragprog_validate(struct nv50_context *nv50) +{ + struct nv50_program *p = nv50->fragprog; + struct nouveau_stateobj *so = NULL; + + if (!p->translated) { + if (nv50_program_validate(p)) + nv50_fp_update_stateobj(nv50, p); + else + return NULL; + } + + if (nv50->dirty & NV50_NEW_FRAGPROG_CB) + nv50_program_validate_data(nv50, p); + + if (!(nv50->dirty & NV50_NEW_FRAGPROG)) + return NULL; + + nv50_program_validate_code(nv50, p); + + so_ref(p->so, &so); + return so; +} + +struct nouveau_stateobj * +nv50_geomprog_validate(struct nv50_context *nv50) +{ + struct nv50_program *p = nv50->geomprog; + struct nouveau_stateobj *so = NULL; + + if (!p->translated) { + if (nv50_program_validate(p)) + nv50_gp_update_stateobj(nv50, p); + else + return NULL; + } + + if (nv50->dirty & NV50_NEW_GEOMPROG_CB) + nv50_program_validate_data(nv50, p); + + if (!(nv50->dirty & NV50_NEW_GEOMPROG)) + return NULL; + + nv50_program_validate_code(nv50, p); + + so_ref(p->so, &so); + return so; +} + +/* XXX: this might not work correctly in all cases yet: we assume that + * an FP generic input that is not written in the VP is gl_PointCoord. + */ +static uint32_t +nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned m) +{ + struct nv50_program *vp = nv50->vertprog; + struct nv50_program *fp = nv50->fragprog; + unsigned i, c; + + memset(pntc, 0, 8 * sizeof(uint32_t)); + + if (nv50->geomprog) + vp = nv50->geomprog; + + for (i = 0; i < fp->in_nr; i++) { + unsigned j, n = util_bitcount(fp->in[i].mask); + + if (fp->in[i].sn != TGSI_SEMANTIC_GENERIC) { + m += n; + continue; + } + + for (j = 0; j < vp->out_nr; ++j) + if (vp->out[j].sn == fp->in[i].sn && vp->out[j].si == fp->in[i].si) + break; + + if (j < vp->out_nr) { + uint32_t en = nv50->rasterizer->pipe.sprite_coord_enable; + + if (!(en & (1 << vp->out[j].si))) { + m += n; + continue; + } + } + + /* this is either PointCoord or replaced by sprite coords */ + for (c = 0; c < 4; c++) { + if (!(fp->in[i].mask & (1 << c))) + continue; + pntc[m / 8] |= (c + 1) << ((m % 8) * 4); + ++m; + } + } + if (nv50->rasterizer->pipe.sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) + return 0; + return (1 << 4); +} + +static int +nv50_vec4_map(uint32_t *map32, int mid, uint32_t lin[4], + struct nv50_varying *in, struct nv50_varying *out) +{ + int c; + uint8_t mv = out->mask, mf = in->mask, oid = out->hw; + uint8_t *map = (uint8_t *)map32; + + for (c = 0; c < 4; ++c) { + if (mf & 1) { + if (in->linear) + lin[mid / 32] |= 1 << (mid % 32); + if (mv & 1) + map[mid] = oid; + else + if (c == 3) + map[mid] |= 1; + ++mid; + } + + oid += mv & 1; + mf >>= 1; + mv >>= 1; + } + + return mid; +} + +struct nouveau_stateobj * +nv50_fp_linkage_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_program *vp; + struct nv50_program *fp = nv50->fragprog; + struct nouveau_stateobj *so; + struct nv50_varying dummy; + int i, n, c, m; + + uint32_t map[16], lin[4], pntc[8]; + + uint32_t interp = fp->fp.interp; + uint32_t colors = fp->fp.colors; + uint32_t clip = 0x04; + uint32_t psiz = 0x000; + uint32_t primid = 0; + uint32_t sysval = 0; + + if (nv50->geomprog) { + vp = nv50->geomprog; + memset(map, 0x80, sizeof(map)); + } else { + vp = nv50->vertprog; + memset(map, 0x40, sizeof(map)); + } + memset(lin, 0, sizeof(lin)); + + dummy.linear = 0; + dummy.mask = 0xf; /* map all components of HPOS */ + m = nv50_vec4_map(map, 0, lin, &dummy, &vp->out[0]); + + if (vp->vp.clpd < 0x40) { + for (c = 0; c < vp->vp.clpd_nr; ++c) { + map[m / 4] |= (vp->vp.clpd + c) << ((m % 4) * 8); + ++m; + } + clip |= vp->vp.clpd_nr << 8; + } + + colors |= m << 8; /* adjust BFC0 id */ + + /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ + if (nv50->rasterizer->pipe.light_twoside) { + for (i = 0; i < 2; ++i) + m = nv50_vec4_map(map, m, lin, + &fp->in[fp->vp.bfc[i]], + &vp->out[vp->vp.bfc[i]]); + } + + colors += m - 4; /* adjust FFC0 id */ + interp |= m << 8; /* set mid where 'normal' FP inputs start */ + + dummy.mask = 0x0; + for (i = 0; i < fp->in_nr; i++) { + for (n = 0; n < vp->out_nr; ++n) + if (vp->out[n].sn == fp->in[i].sn && + vp->out[n].si == fp->in[i].si) + break; + + m = nv50_vec4_map(map, m, lin, + &fp->in[i], (n < vp->out_nr) ? &vp->out[n] : &dummy); + } + + /* PrimitiveID either is replaced by the system value, or + * written by the geometry shader into an output register + */ + if (fp->gp.primid < 0x40) { + i = (m % 4) * 8; + map[m / 4] = (map[m / 4] & ~(0xff << i)) | (vp->gp.primid << i); + primid = m++; + } + + if (nv50->rasterizer->pipe.point_size_per_vertex) { + i = (m % 4) * 8; + map[m / 4] = (map[m / 4] & ~(0xff << i)) | (vp->vp.psiz << i); + psiz = (m++ << 4) | 1; + } + + /* now fill the stateobj (at most 28 so_data) */ + so = so_new(10, 54, 0); + + n = (m + 3) / 4; + assert(m <= 64); + if (vp->type == PIPE_SHADER_GEOMETRY) { + so_method(so, tesla, NV50TCL_GP_RESULT_MAP_SIZE, 1); + so_data (so, m); + so_method(so, tesla, NV50TCL_GP_RESULT_MAP(0), n); + so_datap (so, map, n); + } else { + so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); + so_data (so, vp->vp.attrs[2]); + + so_method(so, tesla, NV50TCL_MAP_SEMANTIC_4, 1); + so_data (so, primid); + + so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); + so_data (so, m); + so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); + so_datap (so, map, n); + } + + so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); + so_data (so, colors); + so_data (so, clip); + so_data (so, sysval); + so_data (so, psiz); + + so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); + so_data (so, interp); + + so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4); + so_datap (so, lin, 4); + + if (nv50->rasterizer->pipe.point_quad_rasterization) { + so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1); + so_data (so, + nv50_pntc_replace(nv50, pntc, (interp >> 8) & 0xff)); + + so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); + so_datap (so, pntc, 8); + } + + so_method(so, tesla, NV50TCL_GP_ENABLE, 1); + so_data (so, (vp->type == PIPE_SHADER_GEOMETRY) ? 1 : 0); + + return so; +} + +static int +nv50_vp_gp_mapping(uint32_t *map32, int m, + struct nv50_program *vp, struct nv50_program *gp) +{ + uint8_t *map = (uint8_t *)map32; + int i, j, c; + + for (i = 0; i < gp->in_nr; ++i) { + uint8_t oid = 0, mv = 0, mg = gp->in[i].mask; + + for (j = 0; j < vp->out_nr; ++j) { + if (vp->out[j].sn == gp->in[i].sn && + vp->out[j].si == gp->in[i].si) { + mv = vp->out[j].mask; + oid = vp->out[j].hw; + break; + } + } + + for (c = 0; c < 4; ++c, mv >>= 1, mg >>= 1) { + if (mg & mv & 1) + map[m++] = oid; + else + if (mg & 1) + map[m++] = (c == 3) ? 0x41 : 0x40; + oid += mv & 1; + } + } + return m; +} + +struct nouveau_stateobj * +nv50_gp_linkage_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so; + struct nv50_program *vp = nv50->vertprog; + struct nv50_program *gp = nv50->geomprog; + uint32_t map[16]; + int m = 0; + + if (!gp) + return NULL; + memset(map, 0, sizeof(map)); + + m = nv50_vp_gp_mapping(map, m, vp, gp); + + so = so_new(3, 24 - 3, 0); + + so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); + so_data (so, vp->vp.attrs[2] | gp->vp.attrs[2]); + + assert(m <= 32); + so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); + so_data (so, m); + + m = (m + 3) / 4; + so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), m); + so_datap (so, map, m); + + return so; +} diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c index ec0c0ff2838..3afce06557a 100644 --- a/src/gallium/drivers/nv50/nv50_state.c +++ b/src/gallium/drivers/nv50/nv50_state.c @@ -48,6 +48,53 @@ nv50_colormask(unsigned mask) return cmask; } +static INLINE uint32_t +nv50_blend_func(unsigned factor) +{ + switch (factor) { + case PIPE_BLENDFACTOR_ZERO: + return NV50TCL_BLEND_FUNC_SRC_RGB_ZERO; + case PIPE_BLENDFACTOR_ONE: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE; + case PIPE_BLENDFACTOR_SRC_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA; + case PIPE_BLENDFACTOR_DST_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA; + case PIPE_BLENDFACTOR_DST_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_DST_COLOR; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE; + case PIPE_BLENDFACTOR_CONST_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_SRC1_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_SRC1_COLOR; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC1_COLOR; + case PIPE_BLENDFACTOR_SRC1_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_SRC1_ALPHA; + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC1_ALPHA; + default: + return NV50TCL_BLEND_FUNC_SRC_RGB_ZERO; + } +} + static void * nv50_blend_state_create(struct pipe_context *pipe, const struct pipe_blend_state *cso) @@ -80,12 +127,12 @@ nv50_blend_state_create(struct pipe_context *pipe, if (blend_enabled) { so_method(so, tesla, NV50TCL_BLEND_EQUATION_RGB, 5); so_data (so, nvgl_blend_eqn(cso->rt[0].rgb_func)); - so_data (so, 0x4000 | nvgl_blend_func(cso->rt[0].rgb_src_factor)); - so_data (so, 0x4000 | nvgl_blend_func(cso->rt[0].rgb_dst_factor)); + so_data (so, nv50_blend_func(cso->rt[0].rgb_src_factor)); + so_data (so, nv50_blend_func(cso->rt[0].rgb_dst_factor)); so_data (so, nvgl_blend_eqn(cso->rt[0].alpha_func)); - so_data (so, 0x4000 | nvgl_blend_func(cso->rt[0].alpha_src_factor)); + so_data (so, nv50_blend_func(cso->rt[0].alpha_src_factor)); so_method(so, tesla, NV50TCL_BLEND_FUNC_DST_ALPHA, 1); - so_data (so, 0x4000 | nvgl_blend_func(cso->rt[0].alpha_dst_factor)); + so_data (so, nv50_blend_func(cso->rt[0].alpha_dst_factor)); } if (cso->logicop_enable == 0 ) { @@ -546,7 +593,6 @@ nv50_vp_state_create(struct pipe_context *pipe, p->pipe.tokens = tgsi_dup_tokens(cso->tokens); p->type = PIPE_SHADER_VERTEX; - tgsi_scan_shader(p->pipe.tokens, &p->info); return (void *)p; } @@ -578,7 +624,6 @@ nv50_fp_state_create(struct pipe_context *pipe, p->pipe.tokens = tgsi_dup_tokens(cso->tokens); p->type = PIPE_SHADER_FRAGMENT; - tgsi_scan_shader(p->pipe.tokens, &p->info); return (void *)p; } @@ -610,7 +655,6 @@ nv50_gp_state_create(struct pipe_context *pipe, p->pipe.tokens = tgsi_dup_tokens(cso->tokens); p->type = PIPE_SHADER_GEOMETRY; - tgsi_scan_shader(p->pipe.tokens, &p->info); return (void *)p; } diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c index 524696f35d1..f1d8202dffa 100644 --- a/src/gallium/drivers/nv50/nv50_state_validate.c +++ b/src/gallium/drivers/nv50/nv50_state_validate.c @@ -56,6 +56,8 @@ validate_fb(struct nv50_context *nv50) assert(h == fb->cbufs[i]->height); } + assert(nv50_format_table[fb->cbufs[i]->format].rt); + so_method(so, tesla, NV50TCL_RT_HORIZ(i), 2); so_data (so, fb->cbufs[i]->width); so_data (so, fb->cbufs[i]->height); @@ -65,39 +67,9 @@ validate_fb(struct nv50_context *nv50) NOUVEAU_BO_HIGH | NOUVEAU_BO_RDWR, 0, 0); so_reloc (so, bo, fb->cbufs[i]->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0); - switch (fb->cbufs[i]->format) { - case PIPE_FORMAT_B8G8R8A8_UNORM: - so_data(so, NV50TCL_RT_FORMAT_A8R8G8B8_UNORM); - break; - case PIPE_FORMAT_B8G8R8X8_UNORM: - so_data(so, NV50TCL_RT_FORMAT_X8R8G8B8_UNORM); - break; - case PIPE_FORMAT_B5G6R5_UNORM: - so_data(so, NV50TCL_RT_FORMAT_R5G6B5_UNORM); - break; - case PIPE_FORMAT_R16G16B16A16_SNORM: - so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_SNORM); - break; - case PIPE_FORMAT_R16G16B16A16_UNORM: - so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_UNORM); - break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - so_data(so, NV50TCL_RT_FORMAT_R32G32B32A32_FLOAT); - break; - case PIPE_FORMAT_R16G16_SNORM: - so_data(so, NV50TCL_RT_FORMAT_R16G16_SNORM); - break; - case PIPE_FORMAT_R16G16_UNORM: - so_data(so, NV50TCL_RT_FORMAT_R16G16_UNORM); - break; - default: - NOUVEAU_ERR("AIIII unknown format %s\n", - util_format_name(fb->cbufs[i]->format)); - so_data(so, NV50TCL_RT_FORMAT_X8R8G8B8_UNORM); - break; - } - so_data(so, nv50_miptree(pt)-> - level[fb->cbufs[i]->level].tile_mode << 4); + so_data (so, nv50_format_table[fb->cbufs[i]->format].rt); + so_data (so, nv50_miptree(pt)-> + level[fb->cbufs[i]->level].tile_mode << 4); so_data(so, 0x00000000); so_method(so, tesla, NV50TCL_RT_ARRAY_MODE, 1); @@ -117,33 +89,17 @@ validate_fb(struct nv50_context *nv50) assert(h == fb->zsbuf->height); } + assert(nv50_format_table[fb->zsbuf->format].rt); + so_method(so, tesla, NV50TCL_ZETA_ADDRESS_HIGH, 5); so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH | NOUVEAU_BO_RDWR, 0, 0); so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0); - switch (fb->zsbuf->format) { - case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM); - break; - case PIPE_FORMAT_Z24X8_UNORM: - so_data(so, NV50TCL_ZETA_FORMAT_X8Z24_UNORM); - break; - case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - so_data(so, NV50TCL_ZETA_FORMAT_Z24S8_UNORM); - break; - case PIPE_FORMAT_Z32_FLOAT: - so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT); - break; - default: - NOUVEAU_ERR("AIIII unknown format %s\n", - util_format_name(fb->zsbuf->format)); - so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM); - break; - } - so_data(so, nv50_miptree(pt)-> - level[fb->zsbuf->level].tile_mode << 4); - so_data(so, 0x00000000); + so_data (so, nv50_format_table[fb->zsbuf->format].rt); + so_data (so, nv50_miptree(pt)-> + level[fb->zsbuf->level].tile_mode << 4); + so_data (so, 0x00000000); so_method(so, tesla, NV50TCL_ZETA_ENABLE, 1); so_data (so, 1); diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c index d41d9c51029..658324ec5be 100644 --- a/src/gallium/drivers/nv50/nv50_tex.c +++ b/src/gallium/drivers/nv50/nv50_tex.c @@ -29,56 +29,6 @@ #include "util/u_format.h" -#define _MIXED(pf, t0, t1, t2, t3, cr, cg, cb, ca, f) \ -[PIPE_FORMAT_##pf] = ( \ - NV50TIC_0_0_MAPR_##cr | NV50TIC_0_0_TYPER_##t0 | \ - NV50TIC_0_0_MAPG_##cg | NV50TIC_0_0_TYPEG_##t1 | \ - NV50TIC_0_0_MAPB_##cb | NV50TIC_0_0_TYPEB_##t2 | \ - NV50TIC_0_0_MAPA_##ca | NV50TIC_0_0_TYPEA_##t3 | \ - NV50TIC_0_0_FMT_##f) - -#define _(pf, t, cr, cg, cb, ca, f) _MIXED(pf, t, t, t, t, cr, cg, cb, ca, f) - -static const uint32_t nv50_texture_formats[PIPE_FORMAT_COUNT] = -{ - _(B8G8R8A8_UNORM, UNORM, C2, C1, C0, C3, 8_8_8_8), - _(B8G8R8A8_SRGB, UNORM, C2, C1, C0, C3, 8_8_8_8), - _(B8G8R8X8_UNORM, UNORM, C2, C1, C0, ONE, 8_8_8_8), - _(B8G8R8X8_SRGB, UNORM, C2, C1, C0, ONE, 8_8_8_8), - _(B5G5R5A1_UNORM, UNORM, C2, C1, C0, C3, 1_5_5_5), - _(B4G4R4A4_UNORM, UNORM, C2, C1, C0, C3, 4_4_4_4), - - _(B5G6R5_UNORM, UNORM, C2, C1, C0, ONE, 5_6_5), - - _(L8_UNORM, UNORM, C0, C0, C0, ONE, 8), - _(L8_SRGB, UNORM, C0, C0, C0, ONE, 8), - _(A8_UNORM, UNORM, ZERO, ZERO, ZERO, C0, 8), - _(I8_UNORM, UNORM, C0, C0, C0, C0, 8), - - _(L8A8_UNORM, UNORM, C0, C0, C0, C1, 8_8), - _(L8A8_SRGB, UNORM, C0, C0, C0, C1, 8_8), - - _(DXT1_RGB, UNORM, C0, C1, C2, ONE, DXT1), - _(DXT1_RGBA, UNORM, C0, C1, C2, C3, DXT1), - _(DXT3_RGBA, UNORM, C0, C1, C2, C3, DXT3), - _(DXT5_RGBA, UNORM, C0, C1, C2, C3, DXT5), - - _MIXED(S8_USCALED_Z24_UNORM, UINT, UNORM, UINT, UINT, C1, C1, C1, ONE, 24_8), - _MIXED(Z24_UNORM_S8_USCALED, UNORM, UINT, UINT, UINT, C0, C0, C0, ONE, 8_24), - - _(R16G16B16A16_SNORM, UNORM, C0, C1, C2, C3, 16_16_16_16), - _(R16G16B16A16_UNORM, SNORM, C0, C1, C2, C3, 16_16_16_16), - _(R32G32B32A32_FLOAT, FLOAT, C0, C1, C2, C3, 32_32_32_32), - - _(R16G16_SNORM, SNORM, C0, C1, ZERO, ONE, 16_16), - _(R16G16_UNORM, UNORM, C0, C1, ZERO, ONE, 16_16), - - _MIXED(Z32_FLOAT, FLOAT, UINT, UINT, UINT, C0, C0, C0, ONE, 32_DEPTH) -}; - -#undef _ -#undef _MIXED - static INLINE uint32_t nv50_tic_swizzle(uint32_t tc, unsigned swz) { @@ -106,7 +56,7 @@ nv50_tex_construct(struct nv50_sampler_view *view) struct nv50_miptree *mt = nv50_miptree(view->pipe.texture); uint32_t swz[4], *tic = view->tic; - tic[0] = nv50_texture_formats[view->pipe.format]; + tic[0] = nv50_format_table[view->pipe.format].tic; swz[0] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_r); swz[1] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_g); diff --git a/src/gallium/drivers/nv50/nv50_texture.h b/src/gallium/drivers/nv50/nv50_texture.h index 3475d3e4326..b4939943e8a 100644 --- a/src/gallium/drivers/nv50/nv50_texture.h +++ b/src/gallium/drivers/nv50/nv50_texture.h @@ -45,24 +45,32 @@ #define NV50TIC_0_0_TYPEA_SNORM 0x00008000 #define NV50TIC_0_0_TYPEA_SINT 0x00018000 #define NV50TIC_0_0_TYPEA_UINT 0x00020000 +#define NV50TIC_0_0_TYPEA_SSCALED 0x00028000 +#define NV50TIC_0_0_TYPEA_USCALED 0x00030000 #define NV50TIC_0_0_TYPEA_FLOAT 0x00038000 #define NV50TIC_0_0_TYPEB_MASK 0x00007000 #define NV50TIC_0_0_TYPEB_UNORM 0x00002000 #define NV50TIC_0_0_TYPEB_SNORM 0x00001000 #define NV50TIC_0_0_TYPEB_SINT 0x00003000 #define NV50TIC_0_0_TYPEB_UINT 0x00004000 +#define NV50TIC_0_0_TYPEB_SSCALED 0x00005000 +#define NV50TIC_0_0_TYPEB_USCALED 0x00006000 #define NV50TIC_0_0_TYPEB_FLOAT 0x00007000 #define NV50TIC_0_0_TYPEG_MASK 0x00000e00 #define NV50TIC_0_0_TYPEG_UNORM 0x00000400 #define NV50TIC_0_0_TYPEG_SNORM 0x00000200 #define NV50TIC_0_0_TYPEG_SINT 0x00000600 #define NV50TIC_0_0_TYPEG_UINT 0x00000800 +#define NV50TIC_0_0_TYPEG_SSCALED 0x00000a00 +#define NV50TIC_0_0_TYPEG_USCALED 0x00000c00 #define NV50TIC_0_0_TYPEG_FLOAT 0x00000e00 #define NV50TIC_0_0_TYPER_MASK 0x000001c0 #define NV50TIC_0_0_TYPER_UNORM 0x00000080 #define NV50TIC_0_0_TYPER_SNORM 0x00000040 #define NV50TIC_0_0_TYPER_SINT 0x000000c0 #define NV50TIC_0_0_TYPER_UINT 0x00000100 +#define NV50TIC_0_0_TYPER_SSCALED 0x00000140 +#define NV50TIC_0_0_TYPER_USCALED 0x00000180 #define NV50TIC_0_0_TYPER_FLOAT 0x000001c0 #define NV50TIC_0_0_FMT_MASK 0x0000003f #define NV50TIC_0_0_FMT_32_32_32_32 0x00000001 @@ -90,6 +98,7 @@ #define NV50TIC_0_0_FMT_8_24 0x0000002a #define NV50TIC_0_0_FMT_32_DEPTH 0x0000002f #define NV50TIC_0_0_FMT_32_8 0x00000030 +#define NV50TIC_0_0_FMT_16_DEPTH 0x0000003a #define NV50TIC_0_1_OFFSET_LOW_MASK 0xffffffff #define NV50TIC_0_1_OFFSET_LOW_SHIFT 0 diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c new file mode 100644 index 00000000000..90d81d3e174 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -0,0 +1,2050 @@ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* #define NV50_TGSI2NC_DEBUG */ + +#include <unistd.h> + +#include "nv50_context.h" +#include "nv50_pc.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "util/u_simple_list.h" +#include "tgsi/tgsi_dump.h" + +#define BLD_MAX_TEMPS 64 +#define BLD_MAX_ADDRS 4 +#define BLD_MAX_PREDS 4 +#define BLD_MAX_IMMDS 128 + +#define BLD_MAX_COND_NESTING 8 +#define BLD_MAX_LOOP_NESTING 4 +#define BLD_MAX_CALL_NESTING 2 + +/* collects all values assigned to the same TGSI register */ +struct bld_value_stack { + struct nv_value *top; + struct nv_value **body; + unsigned size; + uint16_t loop_use; /* 1 bit per loop level, indicates if used/defd */ + uint16_t loop_def; +}; + +static INLINE void +bld_vals_push_val(struct bld_value_stack *stk, struct nv_value *val) +{ + assert(!stk->size || (stk->body[stk->size - 1] != val)); + + if (!(stk->size % 8)) { + unsigned old_sz = (stk->size + 0) * sizeof(struct nv_value *); + unsigned new_sz = (stk->size + 8) * sizeof(struct nv_value *); + stk->body = (struct nv_value **)REALLOC(stk->body, old_sz, new_sz); + } + stk->body[stk->size++] = val; +} + +static INLINE boolean +bld_vals_del_val(struct bld_value_stack *stk, struct nv_value *val) +{ + unsigned i; + + for (i = stk->size; i > 0; --i) + if (stk->body[i - 1] == val) + break; + if (!i) + return FALSE; + + if (i != stk->size) + stk->body[i - 1] = stk->body[stk->size - 1]; + + --stk->size; /* XXX: old size in REALLOC */ + return TRUE; +} + +static INLINE void +bld_vals_push(struct bld_value_stack *stk) +{ + bld_vals_push_val(stk, stk->top); + stk->top = NULL; +} + +static INLINE void +bld_push_values(struct bld_value_stack *stacks, int n) +{ + int i, c; + + for (i = 0; i < n; ++i) + for (c = 0; c < 4; ++c) + if (stacks[i * 4 + c].top) + bld_vals_push(&stacks[i * 4 + c]); +} + +struct bld_context { + struct nv50_translation_info *ti; + + struct nv_pc *pc; + struct nv_basic_block *b; + + struct tgsi_parse_context parse[BLD_MAX_CALL_NESTING]; + int call_lvl; + + struct nv_basic_block *cond_bb[BLD_MAX_COND_NESTING]; + struct nv_basic_block *join_bb[BLD_MAX_COND_NESTING]; + struct nv_basic_block *else_bb[BLD_MAX_COND_NESTING]; + int cond_lvl; + struct nv_basic_block *loop_bb[BLD_MAX_LOOP_NESTING]; + struct nv_basic_block *brkt_bb[BLD_MAX_LOOP_NESTING]; + int loop_lvl; + + ubyte out_kind; /* CFG_EDGE_FORWARD, or FAKE in case of BREAK/CONT */ + + struct bld_value_stack tvs[BLD_MAX_TEMPS][4]; /* TGSI_FILE_TEMPORARY */ + struct bld_value_stack avs[BLD_MAX_ADDRS][4]; /* TGSI_FILE_ADDRESS */ + struct bld_value_stack pvs[BLD_MAX_PREDS][4]; /* TGSI_FILE_PREDICATE */ + struct bld_value_stack ovs[PIPE_MAX_SHADER_OUTPUTS][4]; + + uint32_t outputs_written[(PIPE_MAX_SHADER_OUTPUTS + 7) / 8]; + + struct nv_value *frgcrd[4]; + struct nv_value *sysval[4]; + + /* wipe on new BB */ + struct nv_value *saved_addr[4][2]; + struct nv_value *saved_inputs[128]; + struct nv_value *saved_immd[BLD_MAX_IMMDS]; + uint num_immds; +}; + +static INLINE ubyte +bld_stack_file(struct bld_context *bld, struct bld_value_stack *stk) +{ + if (stk < &bld->avs[0][0]) + return NV_FILE_GPR; + else + if (stk < &bld->pvs[0][0]) + return NV_FILE_ADDR; + else + if (stk < &bld->ovs[0][0]) + return NV_FILE_FLAGS; + else + return NV_FILE_OUT; +} + +static INLINE struct nv_value * +bld_fetch(struct bld_context *bld, struct bld_value_stack *stk, int i, int c) +{ + stk[i * 4 + c].loop_use |= 1 << bld->loop_lvl; + + return stk[i * 4 + c].top; +} + +static struct nv_value * +bld_loop_phi(struct bld_context *, struct bld_value_stack *, struct nv_value *); + +/* If a variable is defined in a loop without prior use, we don't need + * a phi in the loop header to account for backwards flow. + * + * However, if this variable is then also used outside the loop, we do + * need a phi after all. But we must not use this phi's def inside the + * loop, so we can eliminate the phi if it is unused later. + */ +static INLINE void +bld_store(struct bld_context *bld, struct bld_value_stack *stk, int i, int c, + struct nv_value *val) +{ + const uint16_t m = 1 << bld->loop_lvl; + + stk = &stk[i * 4 + c]; + + if (bld->loop_lvl && !(m & (stk->loop_def | stk->loop_use))) + bld_loop_phi(bld, stk, val); + + stk->top = val; + stk->loop_def |= 1 << bld->loop_lvl; +} + +static INLINE void +bld_clear_def_use(struct bld_value_stack *stk, int n, int lvl) +{ + int i; + const uint16_t mask = ~(1 << lvl); + + for (i = 0; i < n * 4; ++i) { + stk[i].loop_def &= mask; + stk[i].loop_use &= mask; + } +} + +#define FETCH_TEMP(i, c) bld_fetch(bld, &bld->tvs[0][0], i, c) +#define STORE_TEMP(i, c, v) bld_store(bld, &bld->tvs[0][0], i, c, (v)) +#define FETCH_ADDR(i, c) bld_fetch(bld, &bld->avs[0][0], i, c) +#define STORE_ADDR(i, c, v) bld_store(bld, &bld->avs[0][0], i, c, (v)) +#define FETCH_PRED(i, c) bld_fetch(bld, &bld->pvs[0][0], i, c) +#define STORE_PRED(i, c, v) bld_store(bld, &bld->pvs[0][0], i, c, (v)) + +#define STORE_OUTR(i, c, v) \ + do { \ + bld->ovs[i][c].top = (v); \ + bld->outputs_written[(i) / 8] |= 1 << (((i) * 4 + (c)) % 32); \ + } while (0) + +static INLINE void +bld_warn_uninitialized(struct bld_context *bld, int kind, + struct bld_value_stack *stk, struct nv_basic_block *b) +{ +#ifdef NV50_TGSI2NC_DEBUG + long i = (stk - &bld->tvs[0][0]) / 4; + long c = (stk - &bld->tvs[0][0]) & 3; + + if (c == 3) + c = -1; + + debug_printf("WARNING: TEMP[%li].%c %s used uninitialized in BB:%i\n", + i, (int)('x' + c), kind ? "may be" : "is", b->id); +#endif +} + +static INLINE struct nv_value * +bld_def(struct nv_instruction *i, int c, struct nv_value *value) +{ + i->def[c] = value; + value->insn = i; + return value; +} + +static INLINE struct nv_value * +find_by_bb(struct bld_value_stack *stack, struct nv_basic_block *b) +{ + int i; + + if (stack->top && stack->top->insn->bb == b) + return stack->top; + + for (i = stack->size - 1; i >= 0; --i) + if (stack->body[i]->insn->bb == b) + return stack->body[i]; + return NULL; +} + +/* fetch value from stack that was defined in the specified basic block, + * or search for first definitions in all of its predecessors + */ +static void +fetch_by_bb(struct bld_value_stack *stack, + struct nv_value **vals, int *n, + struct nv_basic_block *b) +{ + int i; + struct nv_value *val; + + assert(*n < 16); /* MAX_COND_NESTING */ + + val = find_by_bb(stack, b); + if (val) { + for (i = 0; i < *n; ++i) + if (vals[i] == val) + return; + vals[(*n)++] = val; + return; + } + for (i = 0; i < b->num_in; ++i) + if (!IS_WALL_EDGE(b->in_kind[i])) + fetch_by_bb(stack, vals, n, b->in[i]); +} + +static INLINE struct nv_value * +bld_load_imm_u32(struct bld_context *bld, uint32_t u); + +static INLINE struct nv_value * +bld_undef(struct bld_context *bld, ubyte file) +{ + struct nv_instruction *nvi = new_instruction(bld->pc, NV_OP_UNDEF); + + return bld_def(nvi, 0, new_value(bld->pc, file, NV_TYPE_U32)); +} + +static struct nv_value * +bld_phi(struct bld_context *bld, struct nv_basic_block *b, + struct bld_value_stack *stack) +{ + struct nv_basic_block *in; + struct nv_value *vals[16], *val; + struct nv_instruction *phi; + int i, j, n; + + do { + i = n = 0; + fetch_by_bb(stack, vals, &n, b); + + if (!n) { + bld_warn_uninitialized(bld, 0, stack, b); + return NULL; + } + + if (n == 1) { + if (nvbb_dominated_by(b, vals[0]->insn->bb)) + break; + + bld_warn_uninitialized(bld, 1, stack, b); + + /* back-tracking to insert missing value of other path */ + in = b; + while (in->in[0]) { + if (in->num_in == 1) { + in = in->in[0]; + } else { + if (!nvbb_reachable_by(in->in[0], vals[0]->insn->bb, b)) + in = in->in[0]; + else + if (!nvbb_reachable_by(in->in[1], vals[0]->insn->bb, b)) + in = in->in[1]; + else + in = in->in[0]; + } + } + bld->pc->current_block = in; + + /* should make this a no-op */ + bld_vals_push_val(stack, bld_undef(bld, vals[0]->reg.file)); + continue; + } + + for (i = 0; i < n; ++i) { + /* if value dominates b, continue to the redefinitions */ + if (nvbb_dominated_by(b, vals[i]->insn->bb)) + continue; + + /* if value dominates any in-block, b should be the dom frontier */ + for (j = 0; j < b->num_in; ++j) + if (nvbb_dominated_by(b->in[j], vals[i]->insn->bb)) + break; + /* otherwise, find the dominance frontier and put the phi there */ + if (j == b->num_in) { + in = nvbb_dom_frontier(vals[i]->insn->bb); + val = bld_phi(bld, in, stack); + bld_vals_push_val(stack, val); + break; + } + } + } while(i < n); + + bld->pc->current_block = b; + + if (n == 1) + return vals[0]; + + phi = new_instruction(bld->pc, NV_OP_PHI); + + bld_def(phi, 0, new_value(bld->pc, vals[0]->reg.file, vals[0]->reg.type)); + for (i = 0; i < n; ++i) + phi->src[i] = new_ref(bld->pc, vals[i]); + + return phi->def[0]; +} + +/* Insert a phi function in the loop header. + * For nested loops, we need to insert phi functions in all the outer + * loop headers if they don't have one yet. + * + * @def: redefinition from inside loop, or NULL if to be replaced later + */ +static struct nv_value * +bld_loop_phi(struct bld_context *bld, struct bld_value_stack *stack, + struct nv_value *def) +{ + struct nv_instruction *phi; + struct nv_basic_block *bb = bld->pc->current_block; + struct nv_value *val = NULL; + + if (bld->loop_lvl > 1) { + --bld->loop_lvl; + if (!((stack->loop_def | stack->loop_use) & (1 << bld->loop_lvl))) + val = bld_loop_phi(bld, stack, NULL); + ++bld->loop_lvl; + } + + if (!val) + val = bld_phi(bld, bld->pc->current_block, stack); /* old definition */ + if (!val) { + bld->pc->current_block = bld->loop_bb[bld->loop_lvl - 1]->in[0]; + val = bld_undef(bld, bld_stack_file(bld, stack)); + } + + bld->pc->current_block = bld->loop_bb[bld->loop_lvl - 1]; + + phi = new_instruction(bld->pc, NV_OP_PHI); + + bld_def(phi, 0, new_value_like(bld->pc, val)); + if (!def) + def = phi->def[0]; + + bld_vals_push_val(stack, phi->def[0]); + + phi->target = (struct nv_basic_block *)stack; /* cheat */ + + nv_reference(bld->pc, &phi->src[0], val); + nv_reference(bld->pc, &phi->src[1], def); + + bld->pc->current_block = bb; + + return phi->def[0]; +} + +static INLINE struct nv_value * +bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack) +{ + const uint16_t m = 1 << bld->loop_lvl; + const uint16_t use = stack->loop_use; + + stack->loop_use |= m; + + /* If neither used nor def'd inside the loop, build a phi in foresight, + * so we don't have to replace stuff later on, which requires tracking. + */ + if (bld->loop_lvl && !((use | stack->loop_def) & m)) + return bld_loop_phi(bld, stack, NULL); + + return bld_phi(bld, bld->pc->current_block, stack); +} + +static INLINE struct nv_value * +bld_imm_u32(struct bld_context *bld, uint32_t u) +{ + int i; + unsigned n = bld->num_immds; + + for (i = 0; i < n; ++i) + if (bld->saved_immd[i]->reg.imm.u32 == u) + return bld->saved_immd[i]; + assert(n < BLD_MAX_IMMDS); + + bld->num_immds++; + + bld->saved_immd[n] = new_value(bld->pc, NV_FILE_IMM, NV_TYPE_U32); + bld->saved_immd[n]->reg.imm.u32 = u; + return bld->saved_immd[n]; +} + +static void +bld_replace_value(struct nv_pc *, struct nv_basic_block *, struct nv_value *, + struct nv_value *); + +/* Replace the source of the phi in the loop header by the last assignment, + * or eliminate the phi function if there is no assignment inside the loop. + * + * Redundancy situation 1 - (used) but (not redefined) value: + * %3 = phi %0, %3 = %3 is used + * %3 = phi %0, %4 = is new definition + * + * Redundancy situation 2 - (not used) but (redefined) value: + * %3 = phi %0, %2 = %2 is used, %3 could be used outside, deleted by DCE + */ +static void +bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb) +{ + struct nv_basic_block *save = bld->pc->current_block; + struct nv_instruction *phi, *next; + struct nv_value *val; + struct bld_value_stack *stk; + int i, s, n; + + for (phi = bb->phi; phi && phi->opcode == NV_OP_PHI; phi = next) { + next = phi->next; + + stk = (struct bld_value_stack *)phi->target; + phi->target = NULL; + + for (s = 1, n = 0; n < bb->num_in; ++n) { + if (bb->in_kind[n] != CFG_EDGE_BACK) + continue; + + assert(s < 4); + bld->pc->current_block = bb->in[n]; + val = bld_fetch_global(bld, stk); + + for (i = 0; i < 4; ++i) + if (phi->src[i] && phi->src[i]->value == val) + break; + if (i == 4) + nv_reference(bld->pc, &phi->src[s++], val); + } + bld->pc->current_block = save; + + if (phi->src[0]->value == phi->def[0] || + phi->src[0]->value == phi->src[1]->value) + s = 1; + else + if (phi->src[1]->value == phi->def[0]) + s = 0; + else + continue; + + if (s >= 0) { + /* eliminate the phi */ + bld_vals_del_val(stk, phi->def[0]); + + ++bld->pc->pass_seq; + bld_replace_value(bld->pc, bb, phi->def[0], phi->src[s]->value); + + nv_nvi_delete(phi); + } + } +} + +static INLINE struct nv_value * +bld_imm_f32(struct bld_context *bld, float f) +{ + return bld_imm_u32(bld, fui(f)); +} + +#define SET_TYPE(v, t) ((v)->reg.type = (v)->reg.as_type = (t)) + +static struct nv_value * +bld_insn_1(struct bld_context *bld, uint opcode, struct nv_value *src0) +{ + struct nv_instruction *insn = new_instruction(bld->pc, opcode); + + nv_reference(bld->pc, &insn->src[0], src0); + + return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.as_type)); +} + +static struct nv_value * +bld_insn_2(struct bld_context *bld, uint opcode, + struct nv_value *src0, struct nv_value *src1) +{ + struct nv_instruction *insn = new_instruction(bld->pc, opcode); + + nv_reference(bld->pc, &insn->src[0], src0); + nv_reference(bld->pc, &insn->src[1], src1); + + return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.as_type)); +} + +static struct nv_value * +bld_insn_3(struct bld_context *bld, uint opcode, + struct nv_value *src0, struct nv_value *src1, + struct nv_value *src2) +{ + struct nv_instruction *insn = new_instruction(bld->pc, opcode); + + nv_reference(bld->pc, &insn->src[0], src0); + nv_reference(bld->pc, &insn->src[1], src1); + nv_reference(bld->pc, &insn->src[2], src2); + + return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.as_type)); +} + +static struct nv_value * +bld_duplicate_insn(struct bld_context *bld, struct nv_instruction *nvi) +{ + struct nv_instruction *dupi = new_instruction(bld->pc, nvi->opcode); + int c; + + if (nvi->def[0]) + bld_def(dupi, 0, new_value_like(bld->pc, nvi->def[0])); + + if (nvi->flags_def) { + dupi->flags_def = new_value_like(bld->pc, nvi->flags_def); + dupi->flags_def->insn = dupi; + } + + for (c = 0; c < 5; ++c) + if (nvi->src[c]) + nv_reference(bld->pc, &dupi->src[c], nvi->src[c]->value); + if (nvi->flags_src) + nv_reference(bld->pc, &dupi->flags_src, nvi->flags_src->value); + + dupi->cc = nvi->cc; + dupi->saturate = nvi->saturate; + dupi->centroid = nvi->centroid; + dupi->flat = nvi->flat; + + return dupi->def[0]; +} + +static void +bld_lmem_store(struct bld_context *bld, struct nv_value *ptr, int ofst, + struct nv_value *val) +{ + struct nv_instruction *insn = new_instruction(bld->pc, NV_OP_STA); + struct nv_value *loc; + + loc = new_value(bld->pc, NV_FILE_MEM_L, NV_TYPE_U32); + + loc->reg.id = ofst * 4; + + nv_reference(bld->pc, &insn->src[0], loc); + nv_reference(bld->pc, &insn->src[1], val); + nv_reference(bld->pc, &insn->src[4], ptr); +} + +static struct nv_value * +bld_lmem_load(struct bld_context *bld, struct nv_value *ptr, int ofst) +{ + struct nv_value *loc, *val; + + loc = new_value(bld->pc, NV_FILE_MEM_L, NV_TYPE_U32); + + loc->reg.id = ofst * 4; + + val = bld_insn_1(bld, NV_OP_LDA, loc); + + nv_reference(bld->pc, &val->insn->src[4], ptr); + + return val; +} + +#define BLD_INSN_1_EX(d, op, dt, s0, s0t) \ + do { \ + (d) = bld_insn_1(bld, (NV_OP_##op), (s0)); \ + SET_TYPE(d, NV_TYPE_##dt); \ + (d)->insn->src[0]->typecast = NV_TYPE_##s0t; \ + } while(0) + +#define BLD_INSN_2_EX(d, op, dt, s0, s0t, s1, s1t) \ + do { \ + (d) = bld_insn_2(bld, (NV_OP_##op), (s0), (s1)); \ + SET_TYPE(d, NV_TYPE_##dt); \ + (d)->insn->src[0]->typecast = NV_TYPE_##s0t; \ + (d)->insn->src[1]->typecast = NV_TYPE_##s1t; \ + } while(0) + +static struct nv_value * +bld_pow(struct bld_context *bld, struct nv_value *x, struct nv_value *e) +{ + struct nv_value *val; + + BLD_INSN_1_EX(val, LG2, F32, x, F32); + BLD_INSN_2_EX(val, MUL, F32, e, F32, val, F32); + val = bld_insn_1(bld, NV_OP_PREEX2, val); + val = bld_insn_1(bld, NV_OP_EX2, val); + + return val; +} + +static INLINE struct nv_value * +bld_load_imm_f32(struct bld_context *bld, float f) +{ + struct nv_value *imm = bld_insn_1(bld, NV_OP_MOV, bld_imm_f32(bld, f)); + + SET_TYPE(imm, NV_TYPE_F32); + return imm; +} + +static INLINE struct nv_value * +bld_load_imm_u32(struct bld_context *bld, uint32_t u) +{ + return bld_insn_1(bld, NV_OP_MOV, bld_imm_u32(bld, u)); +} + +static struct nv_value * +bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect) +{ + int i; + struct nv_instruction *nvi; + struct nv_value *val; + + for (i = 0; i < 4; ++i) { + if (!bld->saved_addr[i][0]) + break; + if (bld->saved_addr[i][1] == indirect) { + nvi = bld->saved_addr[i][0]->insn; + if (nvi->src[0]->value->reg.imm.u32 == id) + return bld->saved_addr[i][0]; + } + } + i &= 3; + + val = bld_imm_u32(bld, id); + if (indirect) + val = bld_insn_2(bld, NV_OP_ADD, indirect, val); + else + val = bld_insn_1(bld, NV_OP_MOV, val); + + bld->saved_addr[i][0] = val; + bld->saved_addr[i][0]->reg.file = NV_FILE_ADDR; + bld->saved_addr[i][0]->reg.type = NV_TYPE_U16; + bld->saved_addr[i][1] = indirect; + return bld->saved_addr[i][0]; +} + + +static struct nv_value * +bld_predicate(struct bld_context *bld, struct nv_value *src, boolean bool_only) +{ + struct nv_instruction *s0i, *nvi = src->insn; + + if (!nvi) { + nvi = bld_insn_1(bld, + (src->reg.file == NV_FILE_IMM) ? NV_OP_MOV : NV_OP_LDA, + src)->insn; + src = nvi->def[0]; + } else + if (bool_only) { + while (nvi->opcode == NV_OP_ABS || nvi->opcode == NV_OP_NEG || + nvi->opcode == NV_OP_CVT) { + s0i = nvi->src[0]->value->insn; + if (!s0i || !nv50_op_can_write_flags(s0i->opcode)) + break; + nvi = s0i; + assert(!nvi->flags_src); + } + } + + if (!nv50_op_can_write_flags(nvi->opcode) || + nvi->bb != bld->pc->current_block) { + nvi = new_instruction(bld->pc, NV_OP_CVT); + nv_reference(bld->pc, &nvi->src[0], src); + } + + if (!nvi->flags_def) { + nvi->flags_def = new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16); + nvi->flags_def->insn = nvi; + } + return nvi->flags_def; +} + +static void +bld_kil(struct bld_context *bld, struct nv_value *src) +{ + struct nv_instruction *nvi; + + src = bld_predicate(bld, src, FALSE); + nvi = new_instruction(bld->pc, NV_OP_KIL); + nvi->fixed = 1; + nvi->flags_src = new_ref(bld->pc, src); + nvi->cc = NV_CC_LT; +} + +static void +bld_flow(struct bld_context *bld, uint opcode, ubyte cc, + struct nv_value *src, struct nv_basic_block *target, + boolean plan_reconverge) +{ + struct nv_instruction *nvi; + + if (plan_reconverge) + new_instruction(bld->pc, NV_OP_JOINAT)->fixed = 1; + + nvi = new_instruction(bld->pc, opcode); + nvi->is_terminator = 1; + nvi->cc = cc; + nvi->target = target; + if (src) + nvi->flags_src = new_ref(bld->pc, src); +} + +static ubyte +translate_setcc(unsigned opcode) +{ + switch (opcode) { + case TGSI_OPCODE_SLT: return NV_CC_LT; + case TGSI_OPCODE_SGE: return NV_CC_GE; + case TGSI_OPCODE_SEQ: return NV_CC_EQ; + case TGSI_OPCODE_SGT: return NV_CC_GT; + case TGSI_OPCODE_SLE: return NV_CC_LE; + case TGSI_OPCODE_SNE: return NV_CC_NE | NV_CC_U; + case TGSI_OPCODE_STR: return NV_CC_TR; + case TGSI_OPCODE_SFL: return NV_CC_FL; + + case TGSI_OPCODE_ISLT: return NV_CC_LT; + case TGSI_OPCODE_ISGE: return NV_CC_GE; + case TGSI_OPCODE_USEQ: return NV_CC_EQ; + case TGSI_OPCODE_USGE: return NV_CC_GE; + case TGSI_OPCODE_USLT: return NV_CC_LT; + case TGSI_OPCODE_USNE: return NV_CC_NE; + default: + assert(0); + return NV_CC_FL; + } +} + +static uint +translate_opcode(uint opcode) +{ + switch (opcode) { + case TGSI_OPCODE_ABS: return NV_OP_ABS; + case TGSI_OPCODE_ADD: + case TGSI_OPCODE_SUB: + case TGSI_OPCODE_UADD: return NV_OP_ADD; + case TGSI_OPCODE_AND: return NV_OP_AND; + case TGSI_OPCODE_EX2: return NV_OP_EX2; + case TGSI_OPCODE_CEIL: return NV_OP_CEIL; + case TGSI_OPCODE_FLR: return NV_OP_FLOOR; + case TGSI_OPCODE_TRUNC: return NV_OP_TRUNC; + case TGSI_OPCODE_COS: return NV_OP_COS; + case TGSI_OPCODE_SIN: return NV_OP_SIN; + case TGSI_OPCODE_DDX: return NV_OP_DFDX; + case TGSI_OPCODE_DDY: return NV_OP_DFDY; + case TGSI_OPCODE_F2I: + case TGSI_OPCODE_F2U: + case TGSI_OPCODE_I2F: + case TGSI_OPCODE_U2F: return NV_OP_CVT; + case TGSI_OPCODE_INEG: return NV_OP_NEG; + case TGSI_OPCODE_LG2: return NV_OP_LG2; + case TGSI_OPCODE_ISHR: + case TGSI_OPCODE_USHR: return NV_OP_SHR; + case TGSI_OPCODE_MAD: + case TGSI_OPCODE_UMAD: return NV_OP_MAD; + case TGSI_OPCODE_MAX: + case TGSI_OPCODE_IMAX: + case TGSI_OPCODE_UMAX: return NV_OP_MAX; + case TGSI_OPCODE_MIN: + case TGSI_OPCODE_IMIN: + case TGSI_OPCODE_UMIN: return NV_OP_MIN; + case TGSI_OPCODE_MUL: + case TGSI_OPCODE_UMUL: return NV_OP_MUL; + case TGSI_OPCODE_OR: return NV_OP_OR; + case TGSI_OPCODE_RCP: return NV_OP_RCP; + case TGSI_OPCODE_RSQ: return NV_OP_RSQ; + case TGSI_OPCODE_SAD: return NV_OP_SAD; + case TGSI_OPCODE_SHL: return NV_OP_SHL; + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_ISLT: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: return NV_OP_SET; + case TGSI_OPCODE_TEX: return NV_OP_TEX; + case TGSI_OPCODE_TXP: return NV_OP_TEX; + case TGSI_OPCODE_TXB: return NV_OP_TXB; + case TGSI_OPCODE_TXL: return NV_OP_TXL; + case TGSI_OPCODE_XOR: return NV_OP_XOR; + default: + return NV_OP_NOP; + } +} + +static ubyte +infer_src_type(unsigned opcode) +{ + switch (opcode) { + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_AND: + case TGSI_OPCODE_OR: + case TGSI_OPCODE_XOR: + case TGSI_OPCODE_SAD: + case TGSI_OPCODE_U2F: + case TGSI_OPCODE_UADD: + case TGSI_OPCODE_UDIV: + case TGSI_OPCODE_UMOD: + case TGSI_OPCODE_UMAD: + case TGSI_OPCODE_UMUL: + case TGSI_OPCODE_UMAX: + case TGSI_OPCODE_UMIN: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: + case TGSI_OPCODE_USHR: + return NV_TYPE_U32; + case TGSI_OPCODE_I2F: + case TGSI_OPCODE_IDIV: + case TGSI_OPCODE_IMAX: + case TGSI_OPCODE_IMIN: + case TGSI_OPCODE_INEG: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_ISHR: + case TGSI_OPCODE_ISLT: + return NV_TYPE_S32; + default: + return NV_TYPE_F32; + } +} + +static ubyte +infer_dst_type(unsigned opcode) +{ + switch (opcode) { + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_F2U: + case TGSI_OPCODE_AND: + case TGSI_OPCODE_OR: + case TGSI_OPCODE_XOR: + case TGSI_OPCODE_SAD: + case TGSI_OPCODE_UADD: + case TGSI_OPCODE_UDIV: + case TGSI_OPCODE_UMOD: + case TGSI_OPCODE_UMAD: + case TGSI_OPCODE_UMUL: + case TGSI_OPCODE_UMAX: + case TGSI_OPCODE_UMIN: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: + case TGSI_OPCODE_USHR: + return NV_TYPE_U32; + case TGSI_OPCODE_F2I: + case TGSI_OPCODE_IDIV: + case TGSI_OPCODE_IMAX: + case TGSI_OPCODE_IMIN: + case TGSI_OPCODE_INEG: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_ISHR: + case TGSI_OPCODE_ISLT: + return NV_TYPE_S32; + default: + return NV_TYPE_F32; + } +} + +static void +emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst, + unsigned chan, struct nv_value *value) +{ + struct nv_value *ptr; + const struct tgsi_full_dst_register *reg = &inst->Dst[0]; + + if (reg->Register.Indirect) { + ptr = FETCH_ADDR(reg->Indirect.Index, + tgsi_util_get_src_register_swizzle(®->Indirect, 0)); + } else { + ptr = NULL; + } + + assert(chan < 4); + + if (inst->Instruction.Opcode != TGSI_OPCODE_MOV) + value->reg.type = infer_dst_type(inst->Instruction.Opcode); + + switch (inst->Instruction.Saturate) { + case TGSI_SAT_NONE: + break; + case TGSI_SAT_ZERO_ONE: + BLD_INSN_1_EX(value, SAT, F32, value, F32); + break; + case TGSI_SAT_MINUS_PLUS_ONE: + value->reg.as_type = NV_TYPE_F32; + value = bld_insn_2(bld, NV_OP_MAX, value, bld_load_imm_f32(bld, -1.0f)); + value = bld_insn_2(bld, NV_OP_MIN, value, bld_load_imm_f32(bld, 1.0f)); + break; + } + + switch (reg->Register.File) { + case TGSI_FILE_OUTPUT: + if (!value->insn && (bld->ti->output_file == NV_FILE_OUT)) + value = bld_insn_1(bld, NV_OP_MOV, value); + value = bld_insn_1(bld, NV_OP_MOV, value); + value->reg.file = bld->ti->output_file; + + if (bld->ti->p->type == PIPE_SHADER_FRAGMENT) { + STORE_OUTR(reg->Register.Index, chan, value); + } else { + value->insn->fixed = 1; + value->reg.id = bld->ti->output_map[reg->Register.Index][chan]; + } + break; + case TGSI_FILE_TEMPORARY: + assert(reg->Register.Index < BLD_MAX_TEMPS); + if (!value->insn || (value->insn->bb != bld->pc->current_block)) + value = bld_insn_1(bld, NV_OP_MOV, value); + value->reg.file = NV_FILE_GPR; + + if (bld->ti->store_to_memory) + bld_lmem_store(bld, ptr, reg->Register.Index * 4 + chan, value); + else + STORE_TEMP(reg->Register.Index, chan, value); + break; + case TGSI_FILE_ADDRESS: + assert(reg->Register.Index < BLD_MAX_ADDRS); + value->reg.file = NV_FILE_ADDR; + value->reg.type = NV_TYPE_U16; + STORE_ADDR(reg->Register.Index, chan, value); + break; + } +} + +static INLINE uint32_t +bld_is_output_written(struct bld_context *bld, int i, int c) +{ + if (c < 0) + return bld->outputs_written[i / 8] & (0xf << ((i * 4) % 32)); + return bld->outputs_written[i / 8] & (1 << ((i * 4 + c) % 32)); +} + +static void +bld_export_outputs(struct bld_context *bld) +{ + struct nv_value *vals[4]; + struct nv_instruction *nvi; + int i, c, n; + + bld_push_values(&bld->ovs[0][0], PIPE_MAX_SHADER_OUTPUTS); + + for (i = 0; i < PIPE_MAX_SHADER_OUTPUTS; ++i) { + if (!bld_is_output_written(bld, i, -1)) + continue; + for (n = 0, c = 0; c < 4; ++c) { + if (!bld_is_output_written(bld, i, c)) + continue; + vals[n] = bld_fetch_global(bld, &bld->ovs[i][c]); + assert(vals[n]); + vals[n] = bld_insn_1(bld, NV_OP_MOV, vals[n]); + vals[n++]->reg.id = bld->ti->output_map[i][c]; + } + assert(n); + + (nvi = new_instruction(bld->pc, NV_OP_EXPORT))->fixed = 1; + + for (c = 0; c < n; ++c) + nvi->src[c] = new_ref(bld->pc, vals[c]); + } +} + +static void +bld_new_block(struct bld_context *bld, struct nv_basic_block *b) +{ + int i; + + bld_push_values(&bld->tvs[0][0], BLD_MAX_TEMPS); + bld_push_values(&bld->avs[0][0], BLD_MAX_ADDRS); + bld_push_values(&bld->pvs[0][0], BLD_MAX_PREDS); + bld_push_values(&bld->ovs[0][0], PIPE_MAX_SHADER_OUTPUTS); + + bld->pc->current_block = b; + + for (i = 0; i < 4; ++i) + bld->saved_addr[i][0] = NULL; + + for (i = 0; i < 128; ++i) + bld->saved_inputs[i] = NULL; + + bld->out_kind = CFG_EDGE_FORWARD; +} + +static struct nv_value * +bld_saved_input(struct bld_context *bld, unsigned i, unsigned c) +{ + unsigned idx = bld->ti->input_map[i][c]; + + if (bld->ti->p->type != PIPE_SHADER_FRAGMENT) + return NULL; + if (bld->saved_inputs[idx]) + return bld->saved_inputs[idx]; + return NULL; +} + +static struct nv_value * +bld_interpolate(struct bld_context *bld, unsigned mode, struct nv_value *val) +{ + if (val->reg.id == 255) { + /* gl_FrontFacing: 0/~0 to -1.0/+1.0 */ + val = bld_insn_1(bld, NV_OP_LINTERP, val); + val = bld_insn_2(bld, NV_OP_SHL, val, bld_imm_u32(bld, 31)); + val->insn->src[0]->typecast = NV_TYPE_U32; + val = bld_insn_2(bld, NV_OP_XOR, val, bld_imm_f32(bld, -1.0f)); + val->insn->src[0]->typecast = NV_TYPE_U32; + } else + if (mode & (NV50_INTERP_LINEAR | NV50_INTERP_FLAT)) + val = bld_insn_1(bld, NV_OP_LINTERP, val); + else + val = bld_insn_2(bld, NV_OP_PINTERP, val, bld->frgcrd[3]); + + val->insn->flat = (mode & NV50_INTERP_FLAT) ? 1 : 0; + val->insn->centroid = (mode & NV50_INTERP_CENTROID) ? 1 : 0; + return val; +} + +static struct nv_value * +emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, + const unsigned s, const unsigned chan) +{ + const struct tgsi_full_src_register *src = &insn->Src[s]; + struct nv_value *res; + struct nv_value *ptr = NULL; + unsigned idx, swz, dim_idx, ind_idx, ind_swz, sgn; + ubyte type = infer_src_type(insn->Instruction.Opcode); + + idx = src->Register.Index; + swz = tgsi_util_get_full_src_register_swizzle(src, chan); + dim_idx = -1; + ind_idx = -1; + ind_swz = 0; + + if (src->Register.Indirect) { + ind_idx = src->Indirect.Index; + ind_swz = tgsi_util_get_src_register_swizzle(&src->Indirect, 0); + + ptr = FETCH_ADDR(ind_idx, ind_swz); + } + if (idx >= (128 / 4) && src->Register.File == TGSI_FILE_CONSTANT) + ptr = bld_get_address(bld, (idx * 16) & ~0x1ff, ptr); + + switch (src->Register.File) { + case TGSI_FILE_CONSTANT: + dim_idx = src->Dimension.Index ? src->Dimension.Index + 2 : 1; + assert(dim_idx < 14); + assert(dim_idx == 1); /* for now */ + + res = new_value(bld->pc, NV_FILE_MEM_C(dim_idx), type); + SET_TYPE(res, type); + res->reg.id = (idx * 4 + swz) & 127; + res = bld_insn_1(bld, NV_OP_LDA, res); + + if (ptr) + res->insn->src[4] = new_ref(bld->pc, ptr); + break; + case TGSI_FILE_IMMEDIATE: + assert(idx < bld->ti->immd32_nr); + res = bld_load_imm_u32(bld, bld->ti->immd32[idx * 4 + swz]); + + switch (bld->ti->immd32_ty[idx]) { + case TGSI_IMM_FLOAT32: SET_TYPE(res, NV_TYPE_F32); break; + case TGSI_IMM_UINT32: SET_TYPE(res, NV_TYPE_U32); break; + case TGSI_IMM_INT32: SET_TYPE(res, NV_TYPE_S32); break; + default: + SET_TYPE(res, type); + break; + } + break; + case TGSI_FILE_INPUT: + res = bld_saved_input(bld, idx, swz); + if (res && (insn->Instruction.Opcode != TGSI_OPCODE_TXP)) + return res; + + res = new_value(bld->pc, bld->ti->input_file, type); + res->reg.id = bld->ti->input_map[idx][swz]; + + if (res->reg.file == NV_FILE_MEM_V) { + res = bld_interpolate(bld, bld->ti->interp_mode[idx], res); + } else { + assert(src->Dimension.Dimension == 0); + res = bld_insn_1(bld, NV_OP_LDA, res); + assert(res->reg.type == type); + } + bld->saved_inputs[bld->ti->input_map[idx][swz]] = res; + break; + case TGSI_FILE_TEMPORARY: + if (bld->ti->store_to_memory) + res = bld_lmem_load(bld, ptr, idx * 4 + swz); + else + res = bld_fetch_global(bld, &bld->tvs[idx][swz]); + break; + case TGSI_FILE_ADDRESS: + res = bld_fetch_global(bld, &bld->avs[idx][swz]); + break; + case TGSI_FILE_PREDICATE: + res = bld_fetch_global(bld, &bld->pvs[idx][swz]); + break; + default: + NOUVEAU_ERR("illegal/unhandled src reg file: %d\n", src->Register.File); + abort(); + break; + } + if (!res) + return bld_undef(bld, NV_FILE_GPR); + + sgn = tgsi_util_get_full_src_register_sign_mode(src, chan); + + if (insn->Instruction.Opcode != TGSI_OPCODE_MOV) + res->reg.as_type = type; + else + if (sgn != TGSI_UTIL_SIGN_KEEP) /* apparently "MOV A, -B" assumes float */ + res->reg.as_type = NV_TYPE_F32; + + switch (sgn) { + case TGSI_UTIL_SIGN_KEEP: + break; + case TGSI_UTIL_SIGN_CLEAR: + res = bld_insn_1(bld, NV_OP_ABS, res); + break; + case TGSI_UTIL_SIGN_TOGGLE: + res = bld_insn_1(bld, NV_OP_NEG, res); + break; + case TGSI_UTIL_SIGN_SET: + res = bld_insn_1(bld, NV_OP_ABS, res); + res = bld_insn_1(bld, NV_OP_NEG, res); + break; + default: + NOUVEAU_ERR("illegal/unhandled src reg sign mode\n"); + abort(); + break; + } + + return res; +} + +static void +bld_lit(struct bld_context *bld, struct nv_value *dst0[4], + const struct tgsi_full_instruction *insn) +{ + struct nv_value *val0, *zero; + unsigned mask = insn->Dst[0].Register.WriteMask; + + if (mask & ((1 << 0) | (1 << 3))) + dst0[3] = dst0[0] = bld_load_imm_f32(bld, 1.0f); + + if (mask & (3 << 1)) { + zero = bld_load_imm_f32(bld, 0.0f); + val0 = bld_insn_2(bld, NV_OP_MAX, emit_fetch(bld, insn, 0, 0), zero); + + if (mask & (1 << 1)) + dst0[1] = val0; + } + + if (mask & (1 << 2)) { + struct nv_value *val1, *val3, *src1, *src3; + struct nv_value *pos128 = bld_load_imm_f32(bld, 127.999999f); + struct nv_value *neg128 = bld_load_imm_f32(bld, -127.999999f); + + src1 = emit_fetch(bld, insn, 0, 1); + src3 = emit_fetch(bld, insn, 0, 3); + + val0->insn->flags_def = new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16); + val0->insn->flags_def->insn = val0->insn; + + val1 = bld_insn_2(bld, NV_OP_MAX, src1, zero); + val3 = bld_insn_2(bld, NV_OP_MAX, src3, neg128); + val3 = bld_insn_2(bld, NV_OP_MIN, val3, pos128); + val3 = bld_pow(bld, val1, val3); + + dst0[2] = bld_insn_1(bld, NV_OP_MOV, zero); + dst0[2]->insn->cc = NV_CC_LE; + dst0[2]->insn->flags_src = new_ref(bld->pc, val0->insn->flags_def); + + dst0[2] = bld_insn_2(bld, NV_OP_SELECT, val3, dst0[2]); + } +} + +static INLINE void +get_tex_dim(const struct tgsi_full_instruction *insn, int *dim, int *arg) +{ + switch (insn->Texture.Texture) { + case TGSI_TEXTURE_1D: + *arg = *dim = 1; + break; + case TGSI_TEXTURE_SHADOW1D: + *dim = 1; + *arg = 2; + break; + case TGSI_TEXTURE_UNKNOWN: + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + *arg = *dim = 2; + break; + case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + *dim = 2; + *arg = 3; + break; + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + *dim = *arg = 3; + break; + default: + assert(0); + break; + } +} + +static void +load_proj_tex_coords(struct bld_context *bld, + struct nv_value *t[4], int dim, int arg, + const struct tgsi_full_instruction *insn) +{ + int c, mask; + + mask = (1 << dim) - 1; + if (arg != dim) + mask |= 4; /* depth comparison value */ + + t[3] = emit_fetch(bld, insn, 0, 3); + + if (t[3]->insn->opcode == NV_OP_PINTERP) { + t[3] = bld_duplicate_insn(bld, t[3]->insn); + t[3]->insn->opcode = NV_OP_LINTERP; + nv_reference(bld->pc, &t[3]->insn->src[1], NULL); + } + + t[3] = bld_insn_1(bld, NV_OP_RCP, t[3]); + + for (c = 0; c < 4; ++c) { + if (!(mask & (1 << c))) + continue; + t[c] = emit_fetch(bld, insn, 0, c); + + if (t[c]->insn->opcode != NV_OP_LINTERP && + t[c]->insn->opcode != NV_OP_PINTERP) + continue; + t[c] = bld_duplicate_insn(bld, t[c]->insn); + t[c]->insn->opcode = NV_OP_PINTERP; + nv_reference(bld->pc, &t[c]->insn->src[1], t[3]); + + mask &= ~(1 << c); + } + + for (c = 0; mask; ++c, mask >>= 1) { + if (!(mask & 1)) + continue; + t[c] = bld_insn_2(bld, NV_OP_MUL, t[c], t[3]); + } +} + +/* For a quad of threads / top left, top right, bottom left, bottom right + * pixels, do a different operation, and take src0 from a specific thread. + */ +#define QOP_ADD 0 +#define QOP_SUBR 1 +#define QOP_SUB 2 +#define QOP_MOV1 3 + +#define QOP(a, b, c, d) \ + ((QOP_##a << 0) | (QOP_##b << 2) | (QOP_##c << 4) | (QOP_##d << 6)) + +static INLINE struct nv_value * +bld_quadop(struct bld_context *bld, ubyte qop, struct nv_value *src0, int lane, + struct nv_value *src1, boolean wp) +{ + struct nv_value *val = bld_insn_2(bld, NV_OP_QUADOP, src0, src1); + val->insn->lanes = lane; + val->insn->quadop = qop; + if (wp) { + val->insn->flags_def = new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16); + val->insn->flags_def->insn = val->insn; + } + return val; +} + +static INLINE struct nv_value * +bld_cmov(struct bld_context *bld, + struct nv_value *src, ubyte cc, struct nv_value *cr) +{ + src = bld_insn_1(bld, NV_OP_MOV, src); + + src->insn->cc = cc; + src->insn->flags_src = new_ref(bld->pc, cr); + + return src; +} + +static struct nv_instruction * +emit_tex(struct bld_context *bld, uint opcode, + struct nv_value *dst[4], struct nv_value *t_in[4], + int argc, int tic, int tsc, int cube) +{ + struct nv_value *t[4]; + struct nv_instruction *nvi; + int c; + + /* the inputs to a tex instruction must be separate values */ + for (c = 0; c < argc; ++c) { + t[c] = bld_insn_1(bld, NV_OP_MOV, t_in[c]); + SET_TYPE(t[c], NV_TYPE_F32); + t[c]->insn->fixed = 1; + } + + nvi = new_instruction(bld->pc, opcode); + + for (c = 0; c < 4; ++c) + dst[c] = bld_def(nvi, c, new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32)); + + for (c = 0; c < argc; ++c) + nvi->src[c] = new_ref(bld->pc, t[c]); + + nvi->tex_t = tic; + nvi->tex_s = tsc; + nvi->tex_mask = 0xf; + nvi->tex_cube = cube; + nvi->tex_live = 0; + nvi->tex_argc = argc; + + return nvi; +} + +static void +bld_texlod_sequence(struct bld_context *bld, + struct nv_value *dst[4], struct nv_value *t[4], int arg, + int tic, int tsc, int cube) +{ + emit_tex(bld, NV_OP_TXL, dst, t, arg, tic, tsc, cube); /* TODO */ +} + + +/* The lanes of a quad are grouped by the bit in the condition register + * they have set, which is selected by differing bias values. + * Move the input values for TEX into a new register set for each group + * and execute TEX only for a specific group. + * We always need to use 4 new registers for the inputs/outputs because + * the implicitly calculated derivatives must be correct. + */ +static void +bld_texbias_sequence(struct bld_context *bld, + struct nv_value *dst[4], struct nv_value *t[4], int arg, + int tic, int tsc, int cube) +{ + struct nv_instruction *sel, *tex; + struct nv_value *bit[4], *cr[4], *res[4][4], *val; + int l, c; + + const ubyte cc[4] = { NV_CC_EQ, NV_CC_S, NV_CC_C, NV_CC_O }; + + for (l = 0; l < 4; ++l) { + bit[l] = bld_load_imm_u32(bld, 1 << l); + + val = bld_quadop(bld, QOP(SUBR, SUBR, SUBR, SUBR), + t[arg - 1], l, t[arg - 1], TRUE); + + cr[l] = bld_cmov(bld, bit[l], NV_CC_EQ, val->insn->flags_def); + + cr[l]->reg.file = NV_FILE_FLAGS; + SET_TYPE(cr[l], NV_TYPE_U16); + } + + sel = new_instruction(bld->pc, NV_OP_SELECT); + + for (l = 0; l < 4; ++l) + sel->src[l] = new_ref(bld->pc, cr[l]); + + bld_def(sel, 0, new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16)); + + for (l = 0; l < 4; ++l) { + tex = emit_tex(bld, NV_OP_TXB, dst, t, arg, tic, tsc, cube); + + tex->cc = cc[l]; + tex->flags_src = new_ref(bld->pc, sel->def[0]); + + for (c = 0; c < 4; ++c) + res[l][c] = tex->def[c]; + } + + for (l = 0; l < 4; ++l) + for (c = 0; c < 4; ++c) + res[l][c] = bld_cmov(bld, res[l][c], cc[l], sel->def[0]); + + for (c = 0; c < 4; ++c) { + sel = new_instruction(bld->pc, NV_OP_SELECT); + + for (l = 0; l < 4; ++l) + sel->src[l] = new_ref(bld->pc, res[l][c]); + + bld_def(sel, 0, (dst[c] = new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32))); + } +} + +static boolean +bld_is_constant(struct nv_value *val) +{ + if (val->reg.file == NV_FILE_IMM) + return TRUE; + return val->insn && nvcg_find_constant(val->insn->src[0]); +} + +static void +bld_tex(struct bld_context *bld, struct nv_value *dst0[4], + const struct tgsi_full_instruction *insn) +{ + struct nv_value *t[4], *s[3]; + uint opcode = translate_opcode(insn->Instruction.Opcode); + int arg, dim, c; + const int tic = insn->Src[1].Register.Index; + const int tsc = 0; + const int cube = (insn->Texture.Texture == TGSI_TEXTURE_CUBE) ? 1 : 0; + + get_tex_dim(insn, &dim, &arg); + + if (!cube && insn->Instruction.Opcode == TGSI_OPCODE_TXP) + load_proj_tex_coords(bld, t, dim, arg, insn); + else { + for (c = 0; c < dim; ++c) + t[c] = emit_fetch(bld, insn, 0, c); + if (arg != dim) + t[dim] = emit_fetch(bld, insn, 0, 2); + } + + if (cube) { + assert(dim >= 3); + for (c = 0; c < 3; ++c) + s[c] = bld_insn_1(bld, NV_OP_ABS, t[c]); + + s[0] = bld_insn_2(bld, NV_OP_MAX, s[0], s[1]); + s[0] = bld_insn_2(bld, NV_OP_MAX, s[0], s[2]); + s[0] = bld_insn_1(bld, NV_OP_RCP, s[0]); + + for (c = 0; c < 3; ++c) + t[c] = bld_insn_2(bld, NV_OP_MUL, t[c], s[0]); + } + + if (opcode == NV_OP_TXB || opcode == NV_OP_TXL) { + t[arg++] = emit_fetch(bld, insn, 0, 3); + + if ((bld->ti->p->type == PIPE_SHADER_FRAGMENT) && + !bld_is_constant(t[arg - 1])) { + if (opcode == NV_OP_TXB) + bld_texbias_sequence(bld, dst0, t, arg, tic, tsc, cube); + else + bld_texlod_sequence(bld, dst0, t, arg, tic, tsc, cube); + return; + } + } + + emit_tex(bld, opcode, dst0, t, arg, tic, tsc, cube); +} + +static INLINE struct nv_value * +bld_dot(struct bld_context *bld, const struct tgsi_full_instruction *insn, + int n) +{ + struct nv_value *dotp, *src0, *src1; + int c; + + src0 = emit_fetch(bld, insn, 0, 0); + src1 = emit_fetch(bld, insn, 1, 0); + dotp = bld_insn_2(bld, NV_OP_MUL, src0, src1); + + for (c = 1; c < n; ++c) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + dotp = bld_insn_3(bld, NV_OP_MAD, src0, src1, dotp); + } + return dotp; +} + +#define FOR_EACH_DST0_ENABLED_CHANNEL(chan, inst) \ + for (chan = 0; chan < 4; ++chan) \ + if ((inst)->Dst[0].Register.WriteMask & (1 << chan)) + +static void +bld_instruction(struct bld_context *bld, + const struct tgsi_full_instruction *insn) +{ + struct nv_value *src0; + struct nv_value *src1; + struct nv_value *src2; + struct nv_value *dst0[4]; + struct nv_value *temp; + int c; + uint opcode = translate_opcode(insn->Instruction.Opcode); + +#ifdef NV50_TGSI2NC_DEBUG + debug_printf("bld_instruction:"); tgsi_dump_instruction(insn, 1); +#endif + + switch (insn->Instruction.Opcode) { + case TGSI_OPCODE_ADD: + case TGSI_OPCODE_MAX: + case TGSI_OPCODE_MIN: + case TGSI_OPCODE_MUL: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + dst0[c] = bld_insn_2(bld, opcode, src0, src1); + } + break; + case TGSI_OPCODE_ARL: + src1 = bld_imm_u32(bld, 4); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + temp = bld_insn_1(bld, NV_OP_FLOOR, src0); + SET_TYPE(temp, NV_TYPE_S32); + dst0[c] = bld_insn_2(bld, NV_OP_SHL, temp, src1); + } + break; + case TGSI_OPCODE_CMP: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + src2 = emit_fetch(bld, insn, 2, c); + src0 = bld_predicate(bld, src0, FALSE); + + src1 = bld_insn_1(bld, NV_OP_MOV, src1); + src1->insn->flags_src = new_ref(bld->pc, src0); + src1->insn->cc = NV_CC_LT; + + src2 = bld_insn_1(bld, NV_OP_MOV, src2); + src2->insn->flags_src = new_ref(bld->pc, src0); + src2->insn->cc = NV_CC_GE; + + dst0[c] = bld_insn_2(bld, NV_OP_SELECT, src1, src2); + } + break; + case TGSI_OPCODE_COS: + case TGSI_OPCODE_SIN: + src0 = emit_fetch(bld, insn, 0, 0); + temp = bld_insn_1(bld, NV_OP_PRESIN, src0); + if (insn->Dst[0].Register.WriteMask & 7) + temp = bld_insn_1(bld, opcode, temp); + for (c = 0; c < 3; ++c) + if (insn->Dst[0].Register.WriteMask & (1 << c)) + dst0[c] = temp; + if (!(insn->Dst[0].Register.WriteMask & (1 << 3))) + break; + src0 = emit_fetch(bld, insn, 0, 3); + temp = bld_insn_1(bld, NV_OP_PRESIN, src0); + dst0[3] = bld_insn_1(bld, opcode, temp); + break; + case TGSI_OPCODE_DP2: + temp = bld_dot(bld, insn, 2); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; + case TGSI_OPCODE_DP3: + temp = bld_dot(bld, insn, 3); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; + case TGSI_OPCODE_DP4: + temp = bld_dot(bld, insn, 4); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; + case TGSI_OPCODE_DPH: + src0 = bld_dot(bld, insn, 3); + src1 = emit_fetch(bld, insn, 1, 3); + temp = bld_insn_2(bld, NV_OP_ADD, src0, src1); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; + case TGSI_OPCODE_DST: + if (insn->Dst[0].Register.WriteMask & 1) + dst0[0] = bld_imm_f32(bld, 1.0f); + if (insn->Dst[0].Register.WriteMask & 2) { + src0 = emit_fetch(bld, insn, 0, 1); + src1 = emit_fetch(bld, insn, 1, 1); + dst0[1] = bld_insn_2(bld, NV_OP_MUL, src0, src1); + } + if (insn->Dst[0].Register.WriteMask & 4) + dst0[2] = emit_fetch(bld, insn, 0, 2); + if (insn->Dst[0].Register.WriteMask & 8) + dst0[3] = emit_fetch(bld, insn, 1, 3); + break; + case TGSI_OPCODE_EXP: + src0 = emit_fetch(bld, insn, 0, 0); + temp = bld_insn_1(bld, NV_OP_FLOOR, src0); + + if (insn->Dst[0].Register.WriteMask & 2) + dst0[1] = bld_insn_2(bld, NV_OP_SUB, src0, temp); + if (insn->Dst[0].Register.WriteMask & 1) { + temp = bld_insn_1(bld, NV_OP_PREEX2, temp); + dst0[0] = bld_insn_1(bld, NV_OP_EX2, temp); + } + if (insn->Dst[0].Register.WriteMask & 4) { + temp = bld_insn_1(bld, NV_OP_PREEX2, src0); + dst0[2] = bld_insn_1(bld, NV_OP_EX2, temp); + } + if (insn->Dst[0].Register.WriteMask & 8) + dst0[3] = bld_imm_f32(bld, 1.0f); + break; + case TGSI_OPCODE_EX2: + src0 = emit_fetch(bld, insn, 0, 0); + temp = bld_insn_1(bld, NV_OP_PREEX2, src0); + temp = bld_insn_1(bld, NV_OP_EX2, temp); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; + case TGSI_OPCODE_FRC: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + dst0[c] = bld_insn_1(bld, NV_OP_FLOOR, src0); + dst0[c] = bld_insn_2(bld, NV_OP_SUB, src0, dst0[c]); + } + break; + case TGSI_OPCODE_KIL: + for (c = 0; c < 4; ++c) { + src0 = emit_fetch(bld, insn, 0, c); + bld_kil(bld, src0); + } + break; + case TGSI_OPCODE_KILP: + (new_instruction(bld->pc, NV_OP_KIL))->fixed = 1; + break; + case TGSI_OPCODE_IF: + { + struct nv_basic_block *b = new_basic_block(bld->pc); + + assert(bld->cond_lvl < BLD_MAX_COND_NESTING); + + nvbb_attach_block(bld->pc->current_block, b, CFG_EDGE_FORWARD); + + bld->join_bb[bld->cond_lvl] = bld->pc->current_block; + bld->cond_bb[bld->cond_lvl] = bld->pc->current_block; + + src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0), TRUE); + + bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, NULL, (bld->cond_lvl == 0)); + + ++bld->cond_lvl; + bld_new_block(bld, b); + } + break; + case TGSI_OPCODE_ELSE: + { + struct nv_basic_block *b = new_basic_block(bld->pc); + + --bld->cond_lvl; + nvbb_attach_block(bld->join_bb[bld->cond_lvl], b, CFG_EDGE_FORWARD); + + bld->cond_bb[bld->cond_lvl]->exit->target = b; + bld->cond_bb[bld->cond_lvl] = bld->pc->current_block; + + new_instruction(bld->pc, NV_OP_BRA)->is_terminator = 1; + + ++bld->cond_lvl; + bld_new_block(bld, b); + } + break; + case TGSI_OPCODE_ENDIF: + { + struct nv_basic_block *b = new_basic_block(bld->pc); + + --bld->cond_lvl; + nvbb_attach_block(bld->pc->current_block, b, bld->out_kind); + nvbb_attach_block(bld->cond_bb[bld->cond_lvl], b, CFG_EDGE_FORWARD); + + bld->cond_bb[bld->cond_lvl]->exit->target = b; + + bld_new_block(bld, b); + + if (!bld->cond_lvl && bld->join_bb[bld->cond_lvl]) { + bld->join_bb[bld->cond_lvl]->exit->prev->target = b; + new_instruction(bld->pc, NV_OP_JOIN)->is_join = TRUE; + } + } + break; + case TGSI_OPCODE_BGNLOOP: + { + struct nv_basic_block *bl = new_basic_block(bld->pc); + struct nv_basic_block *bb = new_basic_block(bld->pc); + + assert(bld->loop_lvl < BLD_MAX_LOOP_NESTING); + + bld->loop_bb[bld->loop_lvl] = bl; + bld->brkt_bb[bld->loop_lvl] = bb; + + bld_flow(bld, NV_OP_BREAKADDR, NV_CC_TR, NULL, bb, FALSE); + + nvbb_attach_block(bld->pc->current_block, bl, CFG_EDGE_LOOP_ENTER); + + bld_new_block(bld, bld->loop_bb[bld->loop_lvl++]); + + if (bld->loop_lvl == bld->pc->loop_nesting_bound) + bld->pc->loop_nesting_bound++; + + bld_clear_def_use(&bld->tvs[0][0], BLD_MAX_TEMPS, bld->loop_lvl); + bld_clear_def_use(&bld->avs[0][0], BLD_MAX_ADDRS, bld->loop_lvl); + bld_clear_def_use(&bld->pvs[0][0], BLD_MAX_PREDS, bld->loop_lvl); + } + break; + case TGSI_OPCODE_BRK: + { + struct nv_basic_block *bb = bld->brkt_bb[bld->loop_lvl - 1]; + + bld_flow(bld, NV_OP_BREAK, NV_CC_TR, NULL, bb, FALSE); + + if (bld->out_kind == CFG_EDGE_FORWARD) /* else we already had BRK/CONT */ + nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_LOOP_LEAVE); + + bld->out_kind = CFG_EDGE_FAKE; + } + break; + case TGSI_OPCODE_CONT: + { + struct nv_basic_block *bb = bld->loop_bb[bld->loop_lvl - 1]; + + bld_flow(bld, NV_OP_BRA, NV_CC_TR, NULL, bb, FALSE); + + nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_BACK); + + if ((bb = bld->join_bb[bld->cond_lvl - 1])) { + bld->join_bb[bld->cond_lvl - 1] = NULL; + nv_nvi_delete(bb->exit->prev); + } + bld->out_kind = CFG_EDGE_FAKE; + } + break; + case TGSI_OPCODE_ENDLOOP: + { + struct nv_basic_block *bb = bld->loop_bb[bld->loop_lvl - 1]; + + bld_flow(bld, NV_OP_BRA, NV_CC_TR, NULL, bb, FALSE); + + nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_BACK); + + bld_loop_end(bld, bb); /* replace loop-side operand of the phis */ + + bld_new_block(bld, bld->brkt_bb[--bld->loop_lvl]); + } + break; + case TGSI_OPCODE_ABS: + case TGSI_OPCODE_CEIL: + case TGSI_OPCODE_FLR: + case TGSI_OPCODE_TRUNC: + case TGSI_OPCODE_DDX: + case TGSI_OPCODE_DDY: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + dst0[c] = bld_insn_1(bld, opcode, src0); + } + break; + case TGSI_OPCODE_LIT: + bld_lit(bld, dst0, insn); + break; + case TGSI_OPCODE_LRP: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + src2 = emit_fetch(bld, insn, 2, c); + dst0[c] = bld_insn_2(bld, NV_OP_SUB, src1, src2); + dst0[c] = bld_insn_3(bld, NV_OP_MAD, dst0[c], src0, src2); + } + break; + case TGSI_OPCODE_MOV: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = emit_fetch(bld, insn, 0, c); + break; + case TGSI_OPCODE_MAD: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + src2 = emit_fetch(bld, insn, 2, c); + dst0[c] = bld_insn_3(bld, opcode, src0, src1, src2); + } + break; + case TGSI_OPCODE_POW: + src0 = emit_fetch(bld, insn, 0, 0); + src1 = emit_fetch(bld, insn, 1, 0); + temp = bld_pow(bld, src0, src1); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; + case TGSI_OPCODE_LOG: + src0 = emit_fetch(bld, insn, 0, 0); + src0 = bld_insn_1(bld, NV_OP_ABS, src0); + temp = bld_insn_1(bld, NV_OP_LG2, src0); + dst0[2] = temp; + if (insn->Dst[0].Register.WriteMask & 3) { + temp = bld_insn_1(bld, NV_OP_FLOOR, temp); + dst0[0] = temp; + } + if (insn->Dst[0].Register.WriteMask & 2) { + temp = bld_insn_1(bld, NV_OP_PREEX2, temp); + temp = bld_insn_1(bld, NV_OP_EX2, temp); + temp = bld_insn_1(bld, NV_OP_RCP, temp); + dst0[1] = bld_insn_2(bld, NV_OP_MUL, src0, temp); + } + if (insn->Dst[0].Register.WriteMask & 8) + dst0[3] = bld_imm_f32(bld, 1.0f); + break; + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_LG2: + src0 = emit_fetch(bld, insn, 0, 0); + temp = bld_insn_1(bld, opcode, src0); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; + case TGSI_OPCODE_RSQ: + src0 = emit_fetch(bld, insn, 0, 0); + temp = bld_insn_1(bld, NV_OP_ABS, src0); + temp = bld_insn_1(bld, NV_OP_RSQ, temp); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_ISLT: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + dst0[c] = bld_insn_2(bld, NV_OP_SET, src0, src1); + dst0[c]->insn->set_cond = translate_setcc(insn->Instruction.Opcode); + SET_TYPE(dst0[c], infer_dst_type(insn->Instruction.Opcode)); + + dst0[c]->insn->src[0]->typecast = + dst0[c]->insn->src[1]->typecast = + infer_src_type(insn->Instruction.Opcode); + + if (dst0[c]->reg.type != NV_TYPE_F32) + break; + dst0[c]->reg.as_type = NV_TYPE_S32; + dst0[c] = bld_insn_1(bld, NV_OP_ABS, dst0[c]); + dst0[c] = bld_insn_1(bld, NV_OP_CVT, dst0[c]); + SET_TYPE(dst0[c], NV_TYPE_F32); + } + break; + case TGSI_OPCODE_SCS: + if (insn->Dst[0].Register.WriteMask & 0x3) { + src0 = emit_fetch(bld, insn, 0, 0); + temp = bld_insn_1(bld, NV_OP_PRESIN, src0); + if (insn->Dst[0].Register.WriteMask & 0x1) + dst0[0] = bld_insn_1(bld, NV_OP_COS, temp); + if (insn->Dst[0].Register.WriteMask & 0x2) + dst0[1] = bld_insn_1(bld, NV_OP_SIN, temp); + } + if (insn->Dst[0].Register.WriteMask & 0x4) + dst0[2] = bld_imm_f32(bld, 0.0f); + if (insn->Dst[0].Register.WriteMask & 0x8) + dst0[3] = bld_imm_f32(bld, 1.0f); + break; + case TGSI_OPCODE_SSG: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = bld_predicate(bld, src0, FALSE); + temp = bld_insn_2(bld, NV_OP_AND, src0, bld_imm_u32(bld, 0x80000000)); + temp = bld_insn_2(bld, NV_OP_OR, temp, bld_imm_f32(bld, 1.0f)); + dst0[c] = bld_insn_2(bld, NV_OP_XOR, temp, temp); + dst0[c]->insn->cc = NV_CC_EQ; + nv_reference(bld->pc, &dst0[c]->insn->flags_src, src1); + } + break; + case TGSI_OPCODE_SUB: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + dst0[c] = bld_insn_2(bld, NV_OP_ADD, src0, src1); + dst0[c]->insn->src[1]->mod ^= NV_MOD_NEG; + } + break; + case TGSI_OPCODE_TEX: + case TGSI_OPCODE_TXB: + case TGSI_OPCODE_TXL: + case TGSI_OPCODE_TXP: + bld_tex(bld, dst0, insn); + break; + case TGSI_OPCODE_XPD: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + if (c == 3) { + dst0[3] = bld_imm_f32(bld, 1.0f); + break; + } + src0 = emit_fetch(bld, insn, 1, (c + 1) % 3); + src1 = emit_fetch(bld, insn, 0, (c + 2) % 3); + dst0[c] = bld_insn_2(bld, NV_OP_MUL, src0, src1); + + src0 = emit_fetch(bld, insn, 0, (c + 1) % 3); + src1 = emit_fetch(bld, insn, 1, (c + 2) % 3); + dst0[c] = bld_insn_3(bld, NV_OP_MAD, src0, src1, dst0[c]); + + dst0[c]->insn->src[2]->mod ^= NV_MOD_NEG; + } + break; + case TGSI_OPCODE_RET: + (new_instruction(bld->pc, NV_OP_RET))->fixed = 1; + break; + case TGSI_OPCODE_END: + if (bld->ti->p->type == PIPE_SHADER_FRAGMENT) + bld_export_outputs(bld); + break; + default: + NOUVEAU_ERR("unhandled opcode %u\n", insn->Instruction.Opcode); + abort(); + break; + } + + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + emit_store(bld, insn, c, dst0[c]); +} + +static INLINE void +bld_free_value_trackers(struct bld_value_stack *base, int n) +{ + int i, c; + + for (i = 0; i < n; ++i) + for (c = 0; c < 4; ++c) + if (base[i * 4 + c].body) + FREE(base[i * 4 + c].body); +} + +int +nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti) +{ + struct bld_context *bld = CALLOC_STRUCT(bld_context); + int c; + unsigned ip; + + pc->root[0] = pc->current_block = new_basic_block(pc); + + bld->pc = pc; + bld->ti = ti; + + pc->loop_nesting_bound = 1; + + c = util_bitcount(bld->ti->p->fp.interp >> 24); + if (c && ti->p->type == PIPE_SHADER_FRAGMENT) { + bld->frgcrd[3] = new_value(pc, NV_FILE_MEM_V, NV_TYPE_F32); + bld->frgcrd[3]->reg.id = c - 1; + bld->frgcrd[3] = bld_insn_1(bld, NV_OP_LINTERP, bld->frgcrd[3]); + bld->frgcrd[3] = bld_insn_1(bld, NV_OP_RCP, bld->frgcrd[3]); + } + + for (ip = 0; ip < ti->inst_nr; ++ip) + bld_instruction(bld, &ti->insns[ip]); + + bld_free_value_trackers(&bld->tvs[0][0], BLD_MAX_TEMPS); + bld_free_value_trackers(&bld->avs[0][0], BLD_MAX_ADDRS); + bld_free_value_trackers(&bld->pvs[0][0], BLD_MAX_PREDS); + + bld_free_value_trackers(&bld->ovs[0][0], PIPE_MAX_SHADER_OUTPUTS); + + FREE(bld); + return 0; +} + +/* If a variable is assigned in a loop, replace all references to the value + * from outside the loop with a phi value. + */ +static void +bld_replace_value(struct nv_pc *pc, struct nv_basic_block *b, + struct nv_value *old_val, + struct nv_value *new_val) +{ + struct nv_instruction *nvi; + + for (nvi = b->phi ? b->phi : b->entry; nvi; nvi = nvi->next) { + int s; + for (s = 0; s < 5; ++s) { + if (!nvi->src[s]) + continue; + if (nvi->src[s]->value == old_val) + nv_reference(pc, &nvi->src[s], new_val); + } + if (nvi->flags_src && nvi->flags_src->value == old_val) + nv_reference(pc, &nvi->flags_src, new_val); + } + + b->pass_seq = pc->pass_seq; + + if (b->out[0] && b->out[0]->pass_seq < pc->pass_seq) + bld_replace_value(pc, b->out[0], old_val, new_val); + + if (b->out[1] && b->out[1]->pass_seq < pc->pass_seq) + bld_replace_value(pc, b->out[1], old_val, new_val); +} diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c index 1f119501999..d41a59d05db 100644 --- a/src/gallium/drivers/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nv50/nv50_vbo.c @@ -29,96 +29,6 @@ #include "nv50_context.h" #include "nv50_resource.h" -static INLINE uint32_t -nv50_vbo_type_to_hw(enum pipe_format format) -{ - const struct util_format_description *desc; - - desc = util_format_description(format); - assert(desc); - - switch (desc->channel[0].type) { - case UTIL_FORMAT_TYPE_FLOAT: - return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT; - case UTIL_FORMAT_TYPE_UNSIGNED: - if (desc->channel[0].normalized) { - return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM; - } - return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED; - case UTIL_FORMAT_TYPE_SIGNED: - if (desc->channel[0].normalized) { - return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM; - } - return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED; - /* - case PIPE_FORMAT_TYPE_UINT: - return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT; - case PIPE_FORMAT_TYPE_SINT: - return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT; */ - default: - return 0; - } -} - -static INLINE uint32_t -nv50_vbo_size_to_hw(unsigned size, unsigned nr_c) -{ - static const uint32_t hw_values[] = { - 0, 0, 0, 0, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8_8, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16_16, - 0, 0, 0, 0, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32_32 }; - - /* we'd also have R11G11B10 and R10G10B10A2 */ - - assert(nr_c > 0 && nr_c <= 4); - - if (size > 32) - return 0; - size >>= (3 - 2); - - return hw_values[size + (nr_c - 1)]; -} - -static INLINE uint32_t -nv50_vbo_vtxelt_to_hw(struct pipe_vertex_element *ve) -{ - uint32_t hw_type, hw_size; - enum pipe_format pf = ve->src_format; - const struct util_format_description *desc; - unsigned size, nr_components; - - desc = util_format_description(pf); - assert(desc); - - size = util_format_get_component_bits(pf, UTIL_FORMAT_COLORSPACE_RGB, 0); - nr_components = util_format_get_nr_components(pf); - - hw_type = nv50_vbo_type_to_hw(pf); - hw_size = nv50_vbo_size_to_hw(size, nr_components); - - if (!hw_type || !hw_size) { - NOUVEAU_ERR("unsupported vbo format: %s\n", util_format_name(pf)); - abort(); - return 0x24e80000; - } - - if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_Z) /* BGRA */ - hw_size |= (1 << 31); /* no real swizzle bits :-( */ - - return (hw_type | hw_size); -} - struct instance { struct nouveau_bo *bo; unsigned delta; @@ -533,7 +443,7 @@ nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib, so_data (so, fui(v[1])); break; case 1: - if (attrib == nv50->vertprog->cfg.edgeflag_in) { + if (attrib == nv50->vertprog->vp.edgeflag) { so_method(so, tesla, NV50TCL_EDGEFLAG_ENABLE, 1); so_data (so, v[0] ? 1 : 0); } @@ -554,11 +464,8 @@ nv50_vtxelt_construct(struct nv50_vtxelt_stateobj *cso) { unsigned i; - for (i = 0; i < cso->num_elements; ++i) { - struct pipe_vertex_element *ve = &cso->pipe[i]; - - cso->hw[i] = nv50_vbo_vtxelt_to_hw(ve); - } + for (i = 0; i < cso->num_elements; ++i) + cso->hw[i] = nv50_format_table[cso->pipe[i].src_format].vtx; } struct nouveau_stateobj * @@ -574,7 +481,7 @@ nv50_vbo_validate(struct nv50_context *nv50) nv50->vbo_fifo = 0; if (nv50->screen->force_push || - nv50->vertprog->cfg.edgeflag_in < 16) + nv50->vertprog->vp.edgeflag < 16) nv50->vbo_fifo = 0xffff; for (i = 0; i < nv50->vtxbuf_nr; i++) { |